1 /* Copyright (C) 2010-2017 The RetroArch team
2 *
3 * ---------------------------------------------------------------------------------------
4 * The following license statement only applies to this file (encoding_utf.c).
5 * ---------------------------------------------------------------------------------------
6 *
7 * Permission is hereby granted, free of charge,
8 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation the rights to
10 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23 #include <stdint.h>
24 #include <stdlib.h>
25 #include <stddef.h>
26 #include <string.h>
27
28 #include <boolean.h>
29 #include <compat/strl.h>
30 #include <retro_inline.h>
31
32 #include <encodings/utf.h>
33
34 #if defined(_WIN32) && !defined(_XBOX)
35 #include <windows.h>
36 #elif defined(_XBOX)
37 #include <xtl.h>
38 #endif
39
/* Counts the consecutive set bits at the most significant end of 'c'
 * (0x7F -> 0, 0xC3 -> 2, 0xFF -> 8). */
static unsigned leading_ones(uint8_t c)
{
   unsigned count;

   /* Shift the next bit into the top position until a clear bit shows up.
    * The cast keeps the value within 8 bits so the loop ends after at
    * most eight rounds. */
   for (count = 0; c & 0x80; count++)
      c = (uint8_t)(c << 1);

   return count;
}
51
/* Decodes UTF-8 bytes from 'in' into UTF-32 code points stored in 'out'.
 *
 * At most 'in_size' bytes are consumed and at most 'out_chars' code points
 * are written. Decoding stops early when a lead byte is a stray
 * continuation byte or has more than six leading one bits, or when a
 * sequence would extend past 'in_size'.
 *
 * Simple implementation: continuation bytes are not validated, so the
 * sequence is assumed to be properly synchronized and terminated.
 *
 * Returns the number of code points written to 'out'. */
size_t utf8_conv_utf32(uint32_t *out, size_t out_chars,
      const char *in, size_t in_size)
{
   size_t written = 0;

   while (in_size && out_chars)
   {
      unsigned k;
      unsigned trail;
      uint32_t cp;
      uint8_t lead  = (uint8_t)*in++;
      unsigned ones = leading_ones(lead);

      /* Invalid or desync. */
      if (ones == 1 || ones > 6)
         break;

      /* A lead byte with N >= 2 leading ones is followed by N - 1
       * continuation bytes; plain ASCII has none. */
      trail = ones ? ones - 1 : 0;

      /* Overflow: the sequence would run past the input. */
      if (1 + trail > in_size)
         break;

      /* Payload bits of the lead byte end up above all trailing bits. */
      cp = (lead & ((1 << (7 - ones)) - 1)) << (6 * trail);

      /* Each continuation byte contributes six bits, most significant
       * group first. */
      for (k = trail; k > 0; k--, in++)
         cp |= (*in & 0x3f) << (6 * (k - 1));

      out[written++] = cp;
      in_size       -= 1 + trail;
      out_chars--;
   }

   return written;
}
88
/* Converts 'in_size' UTF-16 code units from 'in' to UTF-8.
 *
 * When 'out' is NULL nothing is stored and the bytes are only counted,
 * which lets a caller size the buffer with a first pass. The function
 * itself never bounds its writes to 'out'.
 *
 * '*out_chars' always receives the number of UTF-8 bytes produced
 * (or counted) up to the point where the function returned.
 *
 * Returns true when the whole input was converted, false when an
 * unpaired or out-of-order surrogate was encountered. */
bool utf16_conv_utf8(uint8_t *out, size_t *out_chars,
      const uint16_t *in, size_t in_size)
{
   /* Lead byte markers for sequences with 1..5 continuation bytes. */
   static const uint8_t lead_marks[5] = { 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
   size_t written  = 0;
   size_t consumed = 0;
   bool ok         = true;

   while (consumed < in_size)
   {
      unsigned tail;
      uint32_t cp = in[consumed++];

      /* ASCII maps to a single byte. */
      if (cp < 0x80)
      {
         if (out)
            out[written] = (uint8_t)cp;
         written++;
         continue;
      }

      /* Surrogate range: a high surrogate must be followed
       * by a low surrogate. */
      if (cp >= 0xD800 && cp < 0xE000)
      {
         uint32_t low;

         if (cp >= 0xDC00 || consumed == in_size)
         {
            ok = false;
            break;
         }

         low = in[consumed++];
         if (low < 0xDC00 || low >= 0xE000)
         {
            ok = false;
            break;
         }

         cp = (((cp - 0xD800) << 10) | (low - 0xDC00)) + 0x10000;
      }

      /* Number of continuation bytes: the smallest count whose
       * payload capacity (5 * tail + 6 bits) can hold the value. */
      tail = 1;
      while (tail < 5 && cp >= (((uint32_t)1) << (tail * 5 + 6)))
         tail++;

      if (out)
         out[written] = (uint8_t)(lead_marks[tail - 1] + (cp >> (6 * tail)));
      written++;

      /* Remaining bits in groups of six, most significant first. */
      while (tail != 0)
      {
         tail--;
         if (out)
            out[written] = (uint8_t)(0x80 + ((cp >> (6 * tail)) & 0x3F));
         written++;
      }
   }

   *out_chars = written;
   return ok;
}
147
/* Acts mostly like strlcpy.
 *
 * Copies up to 'chars' UTF-8 characters from 's' into 'd',
 * but at most 'd_len' bytes including the terminator.
 *
 * NUL-terminates 'd' whenever something is written.
 * Does not copy half a character.
 *
 * Nothing is written and 0 is returned when 's' or 'd' is NULL
 * or when 'd_len' is 0.
 *
 * Returns the number of bytes copied, not counting the terminator.
 * 's' is assumed valid UTF-8.
 * Use only if 'chars' is considerably less than 'd_len'. */
size_t utf8cpy(char *d, size_t d_len, const char *s, size_t chars)
{
   const uint8_t *sb     = (const uint8_t*)s;
   const uint8_t *sb_org = sb;

   if (!s)
      return 0;

   /* Without room for even the terminator nothing may be written.
    * This also keeps 'd_len - 1' below from wrapping to SIZE_MAX. */
   if (!d || !d_len)
      return 0;

   /* Walk forward one whole character (lead byte plus its
    * continuation bytes) at a time. */
   while (*sb && chars-- > 0)
   {
      sb++;
      while ((*sb & 0xC0) == 0x80)
         sb++;
   }

   /* Too long for the destination: cut at the byte limit, then back off
    * to the start of the character that would have been split. */
   if ((size_t)(sb - sb_org) > d_len - 1 /* NUL */)
   {
      sb = sb_org + d_len - 1;
      while (sb > sb_org && (*sb & 0xC0) == 0x80)
         sb--;
   }

   memcpy(d, sb_org, (size_t)(sb - sb_org));
   d[sb - sb_org] = '\0';

   return (size_t)(sb - sb_org);
}
183
/* Returns a pointer 'chars' UTF-8 characters into 'str'.
 *
 * Stops at the terminating NUL: if 'str' holds fewer than 'chars'
 * characters the returned pointer refers to the terminator instead of
 * running past the end of the string.
 *
 * Returns NULL if 'str' is NULL. Does not validate the input. */
const char *utf8skip(const char *str, size_t chars)
{
   const uint8_t *strb = (const uint8_t*)str;

   if (!str)
      return NULL;

   while (chars && *strb)
   {
      /* Step over the lead byte, then over its continuation bytes. */
      strb++;
      while ((*strb & 0xC0) == 0x80)
         strb++;
      chars--;
   }

   return (const char*)strb;
}
197
/* Returns the number of UTF-8 characters in 'string', counted as the
 * number of bytes that are not continuation bytes (10xxxxxx).
 * Returns 0 for a NULL pointer. */
size_t utf8len(const char *string)
{
   const char *p;
   size_t count = 0;

   if (!string)
      return 0;

   for (p = string; *p != '\0'; p++)
   {
      /* Every character has exactly one byte that is not a continuation. */
      if ((*p & 0xC0) != 0x80)
         count++;
   }

   return count;
}
213
/* Returns the byte '*string' currently points at and
 * advances '*string' by one byte. */
static uint8_t utf8_walkbyte(const char **string)
{
   const char *cur = *string;
   uint8_t byte    = (uint8_t)*cur;

   *string = cur + 1;
   return byte;
}
218
/* Decodes the UTF-8 character at '*string', advances '*string' past it
 * and returns its code point.
 *
 * The sequence length is taken from the first byte alone: below 0x80 one
 * byte, below 0xE0 two bytes, below 0xF0 three bytes, otherwise four.
 *
 * Does not validate the input, returns garbage if it's not UTF-8. */
uint32_t utf8_walk(const char **string)
{
   uint32_t tail;
   uint8_t lead = utf8_walkbyte(string);

   /* Single byte (ASCII). */
   if (lead < 0x80)
      return lead;

   /* Two bytes: 5 payload bits in the lead byte. */
   tail = utf8_walkbyte(string) & 0x3F;
   if (lead < 0xE0)
      return tail | ((lead & 31) << 6);

   /* Three bytes: 4 payload bits in the lead byte. */
   tail = (tail << 6) | (utf8_walkbyte(string) & 0x3F);
   if (lead < 0xF0)
      return tail | ((lead & 15) << 12);

   /* Four bytes: 3 payload bits in the lead byte. */
   tail = (tail << 6) | (utf8_walkbyte(string) & 0x3F);
   return tail | ((lead & 7) << 18);
}
240
/* Converts the zero-terminated UTF-16 string 'in' to UTF-8 in a freshly
 * malloc()ed buffer stored in '*utf_data'.
 *
 * The buffer is one byte larger than the converted text so the caller can
 * append a terminator; this function does not write one itself.
 *
 * On return from a conversion attempt '*dest_len' holds the number of
 * UTF-8 bytes written. If the allocation fails, '*utf_data' is NULL and
 * false is returned.
 *
 * Returns the result of the conversion. The buffer stays allocated even
 * when that result is false, so the caller must free '*utf_data' in
 * either case. */
static bool utf16_to_char(uint8_t **utf_data,
      size_t *dest_len, const uint16_t *in)
{
   uint8_t *buf;
   unsigned units = 0;

   /* Length of the input in UTF-16 code units. */
   for (; in[units] != '\0'; units++)
   {
   }

   /* First pass only counts the bytes needed; add room for a terminator. */
   utf16_conv_utf8(NULL, dest_len, in, units);
   *dest_len += 1;

   buf        = (uint8_t*)malloc(*dest_len);
   *utf_data  = buf;
   if (!buf)
      return false;

   return utf16_conv_utf8(buf, dest_len, in, units);
}
257
/* Converts the zero-terminated UTF-16 string 'in' to UTF-8 and copies the
 * result into 's' with strlcpy(), using 'len' as the size of 's'.
 *
 * Returns true on success. On failure (allocation or conversion error)
 * 's' is left untouched and false is returned. */
bool utf16_to_char_string(const uint16_t *in, char *s, size_t len)
{
   uint8_t *utf8_buf = NULL;
   size_t utf8_len   = 0;

   if (!utf16_to_char(&utf8_buf, &utf8_len, in))
   {
      /* The buffer may have been allocated even though conversion failed. */
      free(utf8_buf);
      return false;
   }

   /* The helper leaves one spare byte at the end for the terminator. */
   utf8_buf[utf8_len] = 0;
   strlcpy(s, (const char*)utf8_buf, len);

   free(utf8_buf);
   return true;
}
275
276 /* Returned pointer MUST be freed by the caller if non-NULL. */
mb_to_mb_string_alloc(const char * str,enum CodePage cp_in,enum CodePage cp_out)277 static char* mb_to_mb_string_alloc(const char *str,
278 enum CodePage cp_in, enum CodePage cp_out)
279 {
280 char *path_buf = NULL;
281 wchar_t *path_buf_wide = NULL;
282 int path_buf_len = 0;
283 int path_buf_wide_len = 0;
284
285 if (!str || !*str)
286 return NULL;
287
288 (void)path_buf;
289 (void)path_buf_wide;
290 (void)path_buf_len;
291 (void)path_buf_wide_len;
292
293 #if !defined(_WIN32) || defined(_XBOX)
294 /* assume string needs no modification if not on Windows */
295 return strdup(str);
296 #else
297 #ifdef UNICODE
298 /* TODO/FIXME: Not implemented. */
299 return strdup(str);
300 #else
301
302 /* Windows 95 will return 0 from these functions with a UTF8 codepage set without MSLU. From an unknown MSDN version (others omit this info):
303 * - CP_UTF8 Windows 98/Me, Windows NT 4.0 and later: Translate using UTF-8. When this is set, dwFlags must be zero.
304 * - Windows 95: Under the Microsoft Layer for Unicode, MultiByteToWideChar also supports CP_UTF7 and CP_UTF8.
305 */
306 path_buf_wide_len = MultiByteToWideChar(cp_in, 0, str, -1, NULL, 0);
307
308 if (path_buf_wide_len)
309 {
310 path_buf_wide = (wchar_t*)
311 calloc(path_buf_wide_len + sizeof(wchar_t), sizeof(wchar_t));
312
313 if (path_buf_wide)
314 {
315 MultiByteToWideChar(cp_in, 0,
316 str, -1, path_buf_wide, path_buf_wide_len);
317
318 if (*path_buf_wide)
319 {
320 path_buf_len = WideCharToMultiByte(cp_out, 0,
321 path_buf_wide, -1, NULL, 0, NULL, NULL);
322
323 if (path_buf_len)
324 {
325 path_buf = (char*)
326 calloc(path_buf_len + sizeof(char), sizeof(char));
327
328 if (path_buf)
329 {
330 WideCharToMultiByte(cp_out, 0,
331 path_buf_wide, -1, path_buf,
332 path_buf_len, NULL, NULL);
333
334 free(path_buf_wide);
335
336 if (*path_buf)
337 return path_buf;
338
339 free(path_buf);
340 return NULL;
341 }
342 }
343 else
344 {
345 free(path_buf_wide);
346 return strdup(str);
347 }
348 }
349 }
350 }
351 else
352 return strdup(str);
353
354 if (path_buf_wide)
355 free(path_buf_wide);
356
357 return NULL;
358 #endif
359 #endif
360 }
361
362 /* Returned pointer MUST be freed by the caller if non-NULL. */
utf8_to_local_string_alloc(const char * str)363 char* utf8_to_local_string_alloc(const char *str)
364 {
365 return mb_to_mb_string_alloc(str, CODEPAGE_UTF8, CODEPAGE_LOCAL);
366 }
367
368 /* Returned pointer MUST be freed by the caller if non-NULL. */
local_to_utf8_string_alloc(const char * str)369 char* local_to_utf8_string_alloc(const char *str)
370 {
371 return mb_to_mb_string_alloc(str, CODEPAGE_LOCAL, CODEPAGE_UTF8);
372 }
373
/* Converts the UTF-8 string 'str' to a newly allocated,
 * zero-terminated wide-character string.
 *
 * On Windows the conversion uses CP_UTF8 and falls back to the ANSI code
 * page when the UTF-8 size query fails. Elsewhere mbstowcs() is used, so
 * the current locale decides how the bytes are interpreted.
 *
 * Returns NULL for a NULL or empty 'str', on allocation failure, or when
 * the conversion fails.
 *
 * Returned pointer MUST be freed by the caller if non-NULL. */
wchar_t* utf8_to_utf16_string_alloc(const char *str)
{
#ifdef _WIN32
   unsigned codepage = CP_UTF8;
   int len           = 0;
   int out_len       = 0;
#else
   size_t len        = 0;
   size_t out_len    = 0;
#endif
   wchar_t *buf      = NULL;

   if (!str || !*str)
      return NULL;

#ifdef _WIN32
   len = MultiByteToWideChar(codepage, 0, str, -1, NULL, 0);

   if (!len)
   {
      /* fallback to ANSI codepage instead */
      codepage = CP_ACP;
      len      = MultiByteToWideChar(codepage, 0, str, -1, NULL, 0);
   }

   if (len <= 0)
      return NULL;

   buf = (wchar_t*)calloc(len, sizeof(wchar_t));

   if (!buf)
      return NULL;

   out_len = MultiByteToWideChar(codepage, 0, str, -1, buf, len);

   /* MultiByteToWideChar() reports failure by returning 0,
    * never a negative value. */
   if (out_len <= 0)
   {
      free(buf);
      return NULL;
   }
#else
   /* NOTE: For now, assume non-Windows platforms' locale is already UTF-8. */
   len = mbstowcs(NULL, str, 0);

   /* (size_t)-1 signals an invalid multibyte sequence. */
   if (len == (size_t)-1)
      return NULL;

   /* Room for the terminator. */
   len++;

   buf = (wchar_t*)calloc(len, sizeof(wchar_t));

   if (!buf)
      return NULL;

   out_len = mbstowcs(buf, str, len);

   if (out_len == (size_t)-1)
   {
      free(buf);
      return NULL;
   }
#endif

   return buf;
}
445
/* Converts the zero-terminated wide-character string 'str' to a newly
 * allocated, NUL-terminated multibyte string.
 *
 * On Windows the conversion targets CP_UTF8 and falls back to the ANSI
 * code page when the UTF-8 size query fails. Elsewhere wcstombs() is used,
 * so the current locale decides the output encoding.
 *
 * Returns NULL for a NULL or empty 'str', on allocation failure, or when
 * the conversion fails.
 *
 * Returned pointer MUST be freed by the caller if non-NULL. */
char* utf16_to_utf8_string_alloc(const wchar_t *str)
{
#ifdef _WIN32
   unsigned codepage = CP_UTF8;
   int len           = 0;
   int out_len       = 0;
#else
   size_t len        = 0;
   size_t out_len    = 0;
#endif
   char *buf         = NULL;

   if (!str || !*str)
      return NULL;

#ifdef _WIN32
   len = WideCharToMultiByte(codepage, 0, str, -1, NULL, 0, NULL, NULL);

   if (!len)
   {
      /* fallback to ANSI codepage instead */
      codepage = CP_ACP;
      len      = WideCharToMultiByte(codepage, 0, str, -1, NULL, 0, NULL, NULL);
   }

   if (len <= 0)
      return NULL;

   buf = (char*)calloc(len, sizeof(char));

   if (!buf)
      return NULL;

   out_len = WideCharToMultiByte(codepage, 0, str, -1, buf, len, NULL, NULL);

   /* WideCharToMultiByte() reports failure by returning 0,
    * never a negative value. */
   if (out_len <= 0)
   {
      free(buf);
      return NULL;
   }
#else
   /* NOTE: For now, assume non-Windows platforms' locale is already UTF-8. */
   len = wcstombs(NULL, str, 0);

   /* (size_t)-1 signals a wide character that cannot be converted. */
   if (len == (size_t)-1)
      return NULL;

   /* Room for the terminator. */
   len++;

   buf = (char*)calloc(len, sizeof(char));

   if (!buf)
      return NULL;

   out_len = wcstombs(buf, str, len);

   if (out_len == (size_t)-1)
   {
      free(buf);
      return NULL;
   }
#endif

   return buf;
}
517