1 /* Copyright (C) 2010-2018 The RetroArch team
2 *
3 * ---------------------------------------------------------------------------------------
4 * The following license statement only applies to this file (encoding_utf.c).
5 * ---------------------------------------------------------------------------------------
6 *
7 * Permission is hereby granted, free of charge,
8 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation the rights to
10 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23 #include <stdint.h>
24 #include <stdlib.h>
25 #include <stddef.h>
26 #include <string.h>
27
28 #include <boolean.h>
29 #include <compat/strl.h>
30 #include <retro_inline.h>
31
32 #include <encodings/utf.h>
33
34 #if defined(_WIN32) && !defined(_XBOX)
35 #include <windows.h>
36 #elif defined(_XBOX)
37 #include <xtl.h>
38 #endif
39
/* Counts how many consecutive 1 bits a byte starts with,
 * scanning from the most significant bit downwards.
 * Returns a value in the range 0..8. */
static unsigned leading_ones(uint8_t c)
{
   unsigned count;

   /* Each step pushes the next bit into the top position; the
    * truncation back to 8 bits guarantees the loop terminates. */
   for (count = 0; c & 0x80; count++)
      c = (uint8_t)(c << 1);

   return count;
}
51
/* Decodes UTF-8 bytes into UTF-32 code points.
 *
 * out       - receives the decoded code points
 * out_chars - maximum number of code points that fit in 'out'
 * in        - UTF-8 input bytes
 * in_size   - number of input bytes available
 *
 * Decoding stops when either the input or the output space is
 * exhausted, when a byte that cannot start a sequence is found,
 * or when a sequence would run past the end of the input.
 * Continuation bytes are not validated; the input is assumed to be
 * properly synchronized.
 *
 * Returns the number of code points written to 'out'. */
size_t utf8_conv_utf32(uint32_t *out, size_t out_chars,
      const char *in, size_t in_size)
{
   size_t written = 0;

   while (in_size && out_chars)
   {
      unsigned k;
      unsigned trailing;
      uint32_t codepoint;
      uint8_t lead    = *in++;
      unsigned n_ones = leading_ones(lead);

      /* A single leading one is a stray continuation byte;
       * more than six cannot start any sequence. */
      if (n_ones == 1 || n_ones > 6)
         break;

      /* Number of continuation bytes that follow the lead byte. */
      trailing = n_ones ? n_ones - 1 : 0;

      /* The whole sequence must be present in the input. */
      if (in_size < 1 + trailing)
         break;

      /* Payload bits of the lead byte go to the top of the value. */
      codepoint = (lead & ((1 << (7 - n_ones)) - 1)) << (6 * trailing);

      /* Each continuation byte contributes six bits, highest first. */
      for (k = trailing; k > 0; k--)
         codepoint |= (*in++ & 0x3f) << (6 * (k - 1));

      *out++   = codepoint;
      in_size -= 1 + trailing;
      out_chars--;
      written++;
   }

   return written;
}
88
/* Converts UTF-16 code units to UTF-8.
 *
 * out       - destination buffer, or NULL to only measure the output
 * out_chars - receives the number of UTF-8 bytes produced (or that
 *             would be produced when 'out' is NULL)
 * in        - UTF-16 input
 * in_size   - number of 16-bit units in 'in'
 *
 * No bound is applied to 'out'; callers are expected to size it with
 * a first pass where 'out' is NULL.
 *
 * Returns true when the whole input was converted. Returns false on
 * an unpaired surrogate; *out_chars then holds the number of bytes
 * produced before the offending unit. */
bool utf16_conv_utf8(uint8_t *out, size_t *out_chars,
      const uint16_t *in, size_t in_size)
{
   /* Lead-byte prefix, indexed by (continuation byte count - 1). */
   static const uint8_t utf8_lead_prefix[5] = { 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
   size_t out_pos = 0;
   size_t in_pos  = 0;

   for (;;)
   {
      unsigned n_cont;
      uint32_t value;

      if (in_pos == in_size)
      {
         *out_chars = out_pos;
         return true;
      }

      value = in[in_pos++];

      /* Plain ASCII maps to a single byte. */
      if (value < 0x80)
      {
         if (out)
            out[out_pos] = (uint8_t)value;
         out_pos++;
         continue;
      }

      /* Surrogate range: a high surrogate must be followed
       * by a low surrogate, anything else is malformed. */
      if (value >= 0xD800 && value < 0xE000)
      {
         uint32_t low;

         if (value >= 0xDC00 || in_pos == in_size)
            break;

         low = in[in_pos++];
         if (low < 0xDC00 || low >= 0xE000)
            break;

         value = (((value - 0xD800) << 10) | (low - 0xDC00)) + 0x10000;
      }

      /* Find how many continuation bytes are needed:
       * 1 for < 2^11, 2 for < 2^16, 3 for < 2^21, ... */
      for (n_cont = 1; n_cont < 5; n_cont++)
         if (value < (((uint32_t)1) << (n_cont * 5 + 6)))
            break;

      /* Lead byte: prefix plus the topmost payload bits. */
      if (out)
         out[out_pos] = (uint8_t)(utf8_lead_prefix[n_cont - 1]
               + (value >> (6 * n_cont)));
      out_pos++;

      /* Continuation bytes, six payload bits each, highest first. */
      do
      {
         n_cont--;
         if (out)
            out[out_pos] = (uint8_t)(0x80
                  + ((value >> (6 * n_cont)) & 0x3F));
         out_pos++;
      } while (n_cont != 0);
   }

   *out_chars = out_pos;
   return false;
}
147
/* Acts mostly like strlcpy.
 *
 * Copies up to 'chars' UTF-8 characters from 's' into 'd',
 * but never more than d_len bytes including the terminator.
 *
 * d     - destination buffer
 * d_len - size of 'd' in bytes
 * s     - source string, assumed to be valid UTF-8
 * chars - maximum number of characters to copy
 *
 * NUL terminates 'd' whenever something is copied.
 * Does not copy half a character.
 *
 * Returns the number of bytes copied, excluding the terminator.
 * Returns 0 without writing anything when 's' or 'd' is NULL or
 * when d_len is 0 (there is no room even for the terminator).
 * Use only if 'chars' is considerably less than 'd_len'. */
size_t utf8cpy(char *d, size_t d_len, const char *s, size_t chars)
{
   const uint8_t *src_start = (const uint8_t*)s;
   const uint8_t *src_end   = src_start;
   size_t bytes             = 0;

   if (!s)
      return 0;

   /* Without this guard 'd_len - 1' below would wrap around
    * and the copy would overrun the destination. */
   if (!d || d_len == 0)
      return 0;

   /* Advance over at most 'chars' characters: one lead byte
    * followed by any number of continuation bytes (10xxxxxx). */
   while (chars > 0 && *src_end)
   {
      src_end++;
      while ((*src_end & 0xC0) == 0x80)
         src_end++;
      chars--;
   }

   bytes = (size_t)(src_end - src_start);

   if (bytes > d_len - 1 /* NUL */)
   {
      /* Truncate to the buffer size, then back up to a character
       * boundary, never moving before the start of the source. */
      src_end = src_start + (d_len - 1);
      while (src_end > src_start && (*src_end & 0xC0) == 0x80)
         src_end--;
      bytes = (size_t)(src_end - src_start);
   }

   memcpy(d, src_start, bytes);
   d[bytes] = '\0';

   return bytes;
}
183
/* Returns a pointer 'chars' UTF-8 characters into 'str'.
 *
 * Stops at the terminating NUL if the string holds fewer than
 * 'chars' characters, so the result never points past the end
 * of the string. Returns NULL when 'str' is NULL. */
const char *utf8skip(const char *str, size_t chars)
{
   const uint8_t *cursor = (const uint8_t*)str;

   if (!str)
      return NULL;

   while (chars > 0 && *cursor)
   {
      /* Step over the lead byte, then over every
       * continuation byte (10xxxxxx) that follows it. */
      cursor++;
      while ((*cursor & 0xC0) == 0x80)
         cursor++;
      chars--;
   }

   return (const char*)cursor;
}
197
/* Returns the number of UTF-8 characters in 'string',
 * or 0 when 'string' is NULL.
 *
 * Every byte that is not a continuation byte (10xxxxxx)
 * is counted as the start of a character. */
size_t utf8len(const char *string)
{
   const char *p;
   size_t count = 0;

   if (string == NULL)
      return 0;

   for (p = string; *p != '\0'; p++)
      count += ((*p & 0xC0) != 0x80);

   return count;
}
213
#define utf8_walkbyte(string) (*((*(string))++))

/* Reads one UTF-8 character from *string, advances *string past
 * it and returns the decoded code point. Handles sequences of one
 * to four bytes.
 *
 * Does not validate the input, returns garbage if it's not UTF-8. */
uint32_t utf8_walk(const char **string)
{
   uint8_t lead = utf8_walkbyte(string);
   uint32_t cp  = 0;

   /* Single-byte (ASCII) character. */
   if (lead < 0x80)
      return lead;

   /* Every multi-byte sequence has at least one continuation byte. */
   cp = (cp << 6) | (utf8_walkbyte(string) & 0x3F);
   if (lead < 0xE0)
      return cp | (lead & 31) << 6;

   /* Three bytes or more. */
   cp = (cp << 6) | (utf8_walkbyte(string) & 0x3F);
   if (lead < 0xF0)
      return cp | (lead & 15) << 12;

   /* Four bytes. */
   cp = (cp << 6) | (utf8_walkbyte(string) & 0x3F);
   return cp | (lead & 7) << 18;
}
239
/* Converts a zero-terminated UTF-16 string into a newly
 * allocated UTF-8 buffer.
 *
 * utf_data - receives the malloc'ed buffer; set to NULL when the
 *            input is malformed or the allocation fails. When
 *            non-NULL it must be freed by the caller, even if the
 *            function returns false.
 * dest_len - receives the number of UTF-8 bytes produced; the
 *            buffer is one byte larger so that the caller can
 *            append a terminator.
 * in       - zero-terminated UTF-16 input
 *
 * Returns true if the whole string was converted. */
static bool utf16_to_char(uint8_t **utf_data,
      size_t *dest_len, const uint16_t *in)
{
   size_t len = 0;

   while (in[len] != 0)
      len++;

   /* Sizing pass. If the input is malformed there is no point
    * in allocating and converting a second time. */
   if (!utf16_conv_utf8(NULL, dest_len, in, len))
   {
      *utf_data = NULL;
      return false;
   }

   /* One extra byte for the terminator. */
   *dest_len += 1;
   *utf_data  = (uint8_t*)malloc(*dest_len);
   if (!*utf_data)
      return false;

   return utf16_conv_utf8(*utf_data, dest_len, in, len);
}
256
/* Converts the zero-terminated UTF-16 string 'in' to UTF-8 and
 * copies the result into 's' using strlcpy with 'len' as the
 * destination size.
 *
 * Returns true on success. On failure 's' is left untouched. */
bool utf16_to_char_string(const uint16_t *in, char *s, size_t len)
{
   uint8_t *converted = NULL;
   size_t n_bytes     = 0;

   if (!utf16_to_char(&converted, &n_bytes, in))
   {
      free(converted);
      return false;
   }

   /* The conversion buffer has room for one extra byte. */
   converted[n_bytes] = 0;
   strlcpy(s, (const char*)converted, len);

   free(converted);
   return true;
}
274
#if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
/* Converts 'str' from code page 'cp_in' to code page 'cp_out',
 * going through a temporary wide-character string.
 *
 * Returns a copy of 'str' when either conversion step reports
 * a length of zero, and NULL when an allocation fails or an
 * intermediate/final string turns out empty.
 *
 * Returned pointer MUST be freed by the caller if non-NULL. */
static char *mb_to_mb_string_alloc(const char *str,
      enum CodePage cp_in, enum CodePage cp_out)
{
   char *result  = NULL;
   wchar_t *wide = NULL;
   int result_len;
   int wide_len  = MultiByteToWideChar(cp_in, 0, str, -1, NULL, 0);

   /* Windows 95 will return 0 from these functions with
    * a UTF8 codepage set without MSLU.
    *
    * From an unknown MSDN version (others omit this info):
    *   - CP_UTF8 Windows 98/Me, Windows NT 4.0 and later:
    *       Translate using UTF-8. When this is set, dwFlags must be zero.
    *   - Windows 95: Under the Microsoft Layer for Unicode,
    *       MultiByteToWideChar also supports CP_UTF7 and CP_UTF8.
    */
   if (!wide_len)
      return strdup(str);

   wide = (wchar_t*)calloc(wide_len + sizeof(wchar_t), sizeof(wchar_t));
   if (!wide)
      return NULL;

   MultiByteToWideChar(cp_in, 0, str, -1, wide, wide_len);

   /* Nothing was converted. */
   if (!*wide)
   {
      free(wide);
      return NULL;
   }

   result_len = WideCharToMultiByte(cp_out, 0,
         wide, -1, NULL, 0, NULL, NULL);

   /* Target code page not usable: hand back the input unchanged. */
   if (!result_len)
   {
      free(wide);
      return strdup(str);
   }

   result = (char*)calloc(result_len + sizeof(char), sizeof(char));
   if (!result)
   {
      free(wide);
      return NULL;
   }

   WideCharToMultiByte(cp_out, 0,
         wide, -1, result, result_len, NULL, NULL);
   free(wide);

   if (*result)
      return result;

   free(result);
   return NULL;
}
#endif
347
/* Converts a UTF-8 string to the local code page.
 * On platforms other than (non-UNICODE, non-Xbox) Windows
 * the string is simply duplicated.
 *
 * Returns NULL when 'str' is NULL or empty.
 * Returned pointer MUST be freed by the caller if non-NULL. */
char* utf8_to_local_string_alloc(const char *str)
{
   if (!str || !*str)
      return NULL;

#if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
   return mb_to_mb_string_alloc(str, CODEPAGE_UTF8, CODEPAGE_LOCAL);
#else
   /* assume string needs no modification if not on Windows */
   return strdup(str);
#endif
}
362
/* Converts a string in the local code page to UTF-8.
 * On platforms other than (non-UNICODE, non-Xbox) Windows
 * the string is simply duplicated.
 *
 * Returns NULL when 'str' is NULL or empty.
 * Returned pointer MUST be freed by the caller if non-NULL. */
char* local_to_utf8_string_alloc(const char *str)
{
   if (!str || !*str)
      return NULL;

#if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
   return mb_to_mb_string_alloc(str, CODEPAGE_LOCAL, CODEPAGE_UTF8);
#else
   /* assume string needs no modification if not on Windows */
   return strdup(str);
#endif
}
377
/* Converts a UTF-8 string to a newly allocated, zero-terminated
 * wide-character string.
 *
 * On Windows the conversion uses the UTF-8 code page and falls back
 * to the ANSI code page if UTF-8 is not available. Elsewhere the
 * current locale is assumed to be UTF-8 and mbstowcs() is used.
 *
 * Returns NULL when 'str' is NULL or empty, when memory cannot be
 * allocated, or when the conversion fails.
 * Returned pointer MUST be freed by the caller if non-NULL. */
wchar_t* utf8_to_utf16_string_alloc(const char *str)
{
   wchar_t *buf = NULL;

   if (!str || !*str)
      return NULL;

#ifdef _WIN32
   {
      UINT code_page = CP_UTF8;
      int len        = MultiByteToWideChar(code_page, 0, str, -1, NULL, 0);

      /* fallback to ANSI codepage instead */
      if (!len)
      {
         code_page = CP_ACP;
         len       = MultiByteToWideChar(code_page, 0, str, -1, NULL, 0);
      }

      if (len <= 0)
         return NULL;

      buf = (wchar_t*)calloc((size_t)len, sizeof(wchar_t));

      if (!buf)
         return NULL;

      /* MultiByteToWideChar reports failure with 0, never with
       * a negative value. */
      if (!MultiByteToWideChar(code_page, 0, str, -1, buf, len))
      {
         free(buf);
         return NULL;
      }
   }
#else
   {
      /* NOTE: For now, assume non-Windows platforms'
       * locale is already UTF-8. */
      size_t len = mbstowcs(NULL, str, 0);

      /* Invalid multibyte sequence for the current locale. */
      if (len == (size_t)-1)
         return NULL;

      /* One extra element for the terminator. */
      buf = (wchar_t*)calloc(len + 1, sizeof(wchar_t));

      if (!buf)
         return NULL;

      if (mbstowcs(buf, str, len + 1) == (size_t)-1)
      {
         free(buf);
         return NULL;
      }
   }
#endif

   return buf;
}
449
/* Converts a zero-terminated wide-character string to a newly
 * allocated, zero-terminated UTF-8 string.
 *
 * On Windows the conversion uses the UTF-8 code page and falls back
 * to the ANSI code page if UTF-8 is not available. Elsewhere the
 * current locale is assumed to be UTF-8 and wcstombs() is used.
 *
 * Returns NULL when 'str' is NULL or empty, when memory cannot be
 * allocated, or when the conversion fails.
 * Returned pointer MUST be freed by the caller if non-NULL. */
char* utf16_to_utf8_string_alloc(const wchar_t *str)
{
   char *buf = NULL;

   if (!str || !*str)
      return NULL;

#ifdef _WIN32
   {
      UINT code_page = CP_UTF8;
      int len        = WideCharToMultiByte(code_page,
            0, str, -1, NULL, 0, NULL, NULL);

      /* fallback to ANSI codepage instead */
      if (!len)
      {
         code_page = CP_ACP;
         len       = WideCharToMultiByte(code_page,
               0, str, -1, NULL, 0, NULL, NULL);
      }

      /* Neither code page could size the output; a zero-sized
       * allocation must not be handed back as a string. */
      if (len <= 0)
         return NULL;

      buf = (char*)calloc((size_t)len, sizeof(char));

      if (!buf)
         return NULL;

      /* WideCharToMultiByte reports failure with 0, never with
       * a negative value. */
      if (!WideCharToMultiByte(code_page,
            0, str, -1, buf, len, NULL, NULL))
      {
         free(buf);
         return NULL;
      }
   }
#else
   {
      /* NOTE: For now, assume non-Windows platforms'
       * locale is already UTF-8. */
      size_t len = wcstombs(NULL, str, 0);

      /* A character cannot be represented in the current locale. */
      if (len == (size_t)-1)
         return NULL;

      /* One extra byte for the terminator. */
      buf = (char*)calloc(len + 1, sizeof(char));

      if (!buf)
         return NULL;

      if (wcstombs(buf, str, len + 1) == (size_t)-1)
      {
         free(buf);
         return NULL;
      }
   }
#endif

   return buf;
}
511