1 /* Copyright  (C) 2010-2018 The RetroArch team
2  *
3  * ---------------------------------------------------------------------------------------
4  * The following license statement only applies to this file (encoding_utf.c).
5  * ---------------------------------------------------------------------------------------
6  *
7  * Permission is hereby granted, free of charge,
8  * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation the rights to
10  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11  * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16  * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include <stdint.h>
24 #include <stdlib.h>
25 #include <stddef.h>
26 #include <string.h>
27 
28 #include <boolean.h>
29 #include <compat/strl.h>
30 #include <retro_inline.h>
31 
32 #include <encodings/utf.h>
33 
34 #if defined(_WIN32) && !defined(_XBOX)
35 #include <windows.h>
36 #elif defined(_XBOX)
37 #include <xtl.h>
38 #endif
39 
/* Returns how many consecutive bits are set in 'c', counting
 * downwards from the most significant bit (0 when bit 7 is
 * clear, 8 when every bit is set). */
static unsigned leading_ones(uint8_t c)
{
   unsigned count = 0;
   uint8_t mask;

   /* Walk a single-bit mask from bit 7 towards bit 0 and stop
    * at the first clear bit. */
   for (mask = 0x80; mask != 0 && (c & mask) != 0; mask >>= 1)
      count++;

   return count;
}
51 
/* Decodes UTF-8 bytes from 'in' into UTF-32 code points in 'out'.
 *
 * Simple implementation: the input is assumed to be properly
 * synchronized and terminated, and continuation bytes are not
 * validated.
 *
 * Decoding stops when 'in_size' bytes have been consumed, when
 * 'out_chars' code points have been written, when a lead byte is a
 * continuation byte or has more than six leading one bits, or when a
 * sequence would extend past 'in_size'.
 *
 * Returns the number of code points written to 'out'. */
size_t utf8_conv_utf32(uint32_t *out, size_t out_chars,
      const char *in, size_t in_size)
{
   size_t written = 0;

   while (in_size != 0 && out_chars != 0)
   {
      unsigned trailing;
      unsigned k;
      uint32_t cp;
      uint8_t lead  = *in++;
      unsigned ones = leading_ones(lead);

      /* One leading bit is a stray continuation byte (desync);
       * more than six is not a valid lead byte. */
      if (ones == 1 || ones > 6)
         break;

      /* Number of continuation bytes that follow the lead byte. */
      trailing = (ones == 0) ? 0 : ones - 1;

      /* The whole sequence must fit in the remaining input. */
      if (in_size < 1 + trailing)
         break;

      /* Payload bits of the lead byte become the top bits. */
      cp = (lead & ((1 << (7 - ones)) - 1)) << (6 * trailing);

      /* Each continuation byte contributes six more bits,
       * most significant group first. */
      for (k = trailing; k > 0; k--)
         cp |= (*in++ & 0x3f) << (6 * (k - 1));

      *out++   = cp;
      in_size -= 1 + trailing;
      out_chars--;
      written++;
   }

   return written;
}
88 
/* Converts 'in_size' UTF-16 code units from 'in' to UTF-8.
 *
 * When 'out' is NULL nothing is written and the call only measures
 * the output. 'out' is not bounds-checked; the caller must provide a
 * large enough buffer (a first pass with 'out' == NULL yields the
 * required size).
 *
 * On return '*out_chars' holds the number of UTF-8 bytes produced
 * (on failure: the bytes produced before the offending code unit).
 *
 * Returns true if all input was converted, false if an unpaired or
 * misordered surrogate was encountered. */
bool utf16_conv_utf8(uint8_t *out, size_t *out_chars,
     const uint16_t *in, size_t in_size)
{
   /* Lead-byte prefixes, indexed by (continuation byte count - 1). */
   static const uint8_t lead_prefix[5] = { 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
   size_t written  = 0;
   size_t consumed = 0;
   bool ok         = true;

   while (consumed < in_size)
   {
      unsigned tail;
      uint32_t cp = in[consumed++];

      /* ASCII maps to a single byte. */
      if (cp < 0x80)
      {
         if (out)
            out[written] = (uint8_t)cp;
         written++;
         continue;
      }

      /* Surrogate range: needs a high surrogate followed by a low one. */
      if (cp >= 0xD800 && cp < 0xE000)
      {
         uint32_t low;

         if (cp >= 0xDC00 || consumed == in_size)
         {
            ok = false;
            break;
         }

         low = in[consumed++];
         if (low < 0xDC00 || low >= 0xE000)
         {
            ok = false;
            break;
         }

         cp = (((cp - 0xD800) << 10) | (low - 0xDC00)) + 0x10000;
      }

      /* Find how many continuation bytes are needed:
       * 1 covers 11 bits, 2 covers 16 bits, 3 covers 21 bits, ... */
      tail = 1;
      while (tail < 5 && cp >= (((uint32_t)1) << (tail * 5 + 6)))
         tail++;

      /* Lead byte: prefix plus the topmost payload bits. */
      if (out)
         out[written] = (uint8_t)(lead_prefix[tail - 1]
               + (cp >> (6 * tail)));
      written++;

      /* Continuation bytes, six bits each, most significant first. */
      while (tail > 0)
      {
         tail--;
         if (out)
            out[written] = (uint8_t)(0x80
                  + ((cp >> (6 * tail)) & 0x3F));
         written++;
      }
   }

   *out_chars = written;
   return ok;
}
147 
/* Acts mostly like strlcpy.
 *
 * Copies up to 'chars' UTF-8 characters from 's' into 'd',
 * but at most 'd_len' bytes including the terminating NUL.
 *
 * NUL-terminates 'd' whenever something is copied.
 * Does not copy half a character.
 *
 * Returns the number of bytes copied (excluding the NUL).
 * Returns 0 without touching 'd' if 'd' or 's' is NULL, or if
 * 'd_len' is 0 (there is no room even for the terminator).
 *
 * 's' is assumed valid UTF-8.
 * Use only if 'chars' is considerably less than 'd_len'. */
size_t utf8cpy(char *d, size_t d_len, const char *s, size_t chars)
{
   const uint8_t *sb     = (const uint8_t*)s;
   const uint8_t *sb_org = sb;

   /* With d_len == 0 the 'd_len - 1' below would wrap around to
    * SIZE_MAX, disable truncation and write outside of 'd'. */
   if (!d || !s || d_len == 0)
      return 0;

   /* Advance over 'chars' characters: one lead byte plus any
    * continuation bytes (10xxxxxx) that follow it. */
   while (*sb && chars-- > 0)
   {
      sb++;
      while ((*sb & 0xC0) == 0x80)
         sb++;
   }

   if ((size_t)(sb - sb_org) > d_len - 1 /* NUL */)
   {
      /* Too long for 'd': cut at the byte limit, then back up to
       * the start of the character we landed in. Never step
       * before the beginning of the string. */
      sb = sb_org + d_len - 1;
      while (sb > sb_org && (*sb & 0xC0) == 0x80)
         sb--;
   }

   memcpy(d, sb_org, sb - sb_org);
   d[sb - sb_org] = '\0';

   return sb - sb_org;
}
183 
/* Returns a pointer to the position 'chars' UTF-8 characters
 * past 'str'.
 *
 * A character is one byte plus any continuation bytes (10xxxxxx)
 * following it. Skipping stops at the NUL terminator, so asking for
 * more characters than the string holds returns a pointer to the
 * terminator instead of reading past the end of the string.
 *
 * Returns NULL if 'str' is NULL. */
const char *utf8skip(const char *str, size_t chars)
{
   const uint8_t *strb = (const uint8_t*)str;

   if (!str)
      return NULL;

   while (chars > 0 && *strb)
   {
      strb++;
      while ((*strb & 0xC0) == 0x80)
         strb++;
      chars--;
   }

   return (const char*)strb;
}
197 
/* Returns the number of UTF-8 characters in 'string', computed as
 * the number of bytes before the NUL terminator that are not
 * continuation bytes (10xxxxxx). The input is not validated.
 * Returns 0 if 'string' is NULL. */
size_t utf8len(const char *string)
{
   const char *p;
   size_t count = 0;

   if (string == NULL)
      return 0;

   for (p = string; *p != '\0'; p++)
   {
      /* Only bytes that start a character are counted. */
      if ((*p & 0xC0) == 0x80)
         continue;
      count++;
   }

   return count;
}
213 
/* Reads the byte at **string and advances *string by one. */
#define utf8_walkbyte(string) (*((*(string))++))

/* Decodes one UTF-8 character (1 to 4 bytes) starting at *string,
 * advances *string past it and returns the code point.
 *
 * Does not validate the input, returns garbage if it's not UTF-8.
 * Any first byte >= 0x80 causes at least one further byte to be
 * consumed, regardless of what that byte is. */
uint32_t utf8_walk(const char **string)
{
   uint8_t lead = utf8_walkbyte(string);
   uint32_t bits;

   /* Single-byte (ASCII) character. */
   if (lead < 0x80)
      return lead;

   /* Two-byte sequence: 5 payload bits in the lead byte. */
   bits = utf8_walkbyte(string) & 0x3F;
   if (lead < 0xE0)
      return bits | ((uint32_t)(lead & 0x1F) << 6);

   /* Three-byte sequence: 4 payload bits in the lead byte. */
   bits = (bits << 6) | (utf8_walkbyte(string) & 0x3F);
   if (lead < 0xF0)
      return bits | ((uint32_t)(lead & 0x0F) << 12);

   /* Four-byte sequence: 3 payload bits in the lead byte. */
   bits = (bits << 6) | (utf8_walkbyte(string) & 0x3F);
   return bits | ((uint32_t)(lead & 0x07) << 18);
}
239 
/* Converts the NUL-terminated UTF-16 string 'in' to UTF-8 in a
 * newly malloc()ed buffer stored in '*utf_data'.
 *
 * The buffer is sized one byte larger than the converted text so
 * the caller can append a terminator; no terminator is written
 * here. On return from the conversion '*dest_len' holds the number
 * of UTF-8 bytes produced.
 *
 * Returns false if the allocation fails ('*utf_data' is NULL) or if
 * utf16_conv_utf8() rejects the input ('*utf_data' is still
 * allocated and must be freed by the caller). */
static bool utf16_to_char(uint8_t **utf_data,
      size_t *dest_len, const uint16_t *in)
{
   uint8_t *buf;
   unsigned units = 0;

   /* Count UTF-16 code units up to the terminator. */
   while (in[units] != 0)
      units++;

   /* First pass with no output buffer only measures the result. */
   utf16_conv_utf8(NULL, dest_len, in, units);
   *dest_len += 1; /* room for a trailing NUL */

   buf       = (uint8_t*)malloc(*dest_len);
   *utf_data = buf;
   if (!buf)
      return false;

   /* Second pass performs the actual conversion. */
   return utf16_conv_utf8(buf, dest_len, in, units);
}
256 
/* Converts the NUL-terminated UTF-16 string 'in' to UTF-8 and
 * copies the result into 's' with strlcpy(), using 'len' as the
 * size of 's'.
 *
 * Returns true on success. On failure 's' is left untouched and
 * false is returned. */
bool utf16_to_char_string(const uint16_t *in, char *s, size_t len)
{
   uint8_t *utf8   = NULL;
   size_t utf8_len = 0;

   if (!utf16_to_char(&utf8, &utf8_len, in))
   {
      /* The helper may have allocated before failing. */
      free(utf8);
      return false;
   }

   /* The helper leaves one spare byte for the terminator. */
   utf8[utf8_len] = 0;
   strlcpy(s, (const char*)utf8, len);
   free(utf8);

   return true;
}
274 
275 #if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
276 /* Returned pointer MUST be freed by the caller if non-NULL. */
mb_to_mb_string_alloc(const char * str,enum CodePage cp_in,enum CodePage cp_out)277 static char *mb_to_mb_string_alloc(const char *str,
278       enum CodePage cp_in, enum CodePage cp_out)
279 {
280    char *path_buf         = NULL;
281    wchar_t *path_buf_wide = NULL;
282    int path_buf_len       = 0;
283    int path_buf_wide_len  = MultiByteToWideChar(cp_in, 0, str, -1, NULL, 0);
284 
285    /* Windows 95 will return 0 from these functions with
286     * a UTF8 codepage set without MSLU.
287     *
288     * From an unknown MSDN version (others omit this info):
289     *   - CP_UTF8 Windows 98/Me, Windows NT 4.0 and later:
290     *   Translate using UTF-8. When this is set, dwFlags must be zero.
291     *   - Windows 95: Under the Microsoft Layer for Unicode,
292     *   MultiByteToWideChar also supports CP_UTF7 and CP_UTF8.
293     */
294 
295    if (path_buf_wide_len)
296    {
297       path_buf_wide = (wchar_t*)
298          calloc(path_buf_wide_len + sizeof(wchar_t), sizeof(wchar_t));
299 
300       if (path_buf_wide)
301       {
302          MultiByteToWideChar(cp_in, 0,
303                str, -1, path_buf_wide, path_buf_wide_len);
304 
305          if (*path_buf_wide)
306          {
307             path_buf_len = WideCharToMultiByte(cp_out, 0,
308                   path_buf_wide, -1, NULL, 0, NULL, NULL);
309 
310             if (path_buf_len)
311             {
312                path_buf = (char*)
313                   calloc(path_buf_len + sizeof(char), sizeof(char));
314 
315                if (path_buf)
316                {
317                   WideCharToMultiByte(cp_out, 0,
318                         path_buf_wide, -1, path_buf,
319                         path_buf_len, NULL, NULL);
320 
321                   free(path_buf_wide);
322 
323                   if (*path_buf)
324                      return path_buf;
325 
326                   free(path_buf);
327                   return NULL;
328                }
329             }
330             else
331             {
332                free(path_buf_wide);
333                return strdup(str);
334             }
335          }
336       }
337    }
338    else
339       return strdup(str);
340 
341    if (path_buf_wide)
342       free(path_buf_wide);
343 
344    return NULL;
345 }
346 #endif
347 
/* Returns a newly allocated copy of the UTF-8 string 'str'.
 *
 * On non-Xbox Windows builds without UNICODE the copy is converted
 * from CODEPAGE_UTF8 to CODEPAGE_LOCAL; on every other build the
 * bytes are duplicated unchanged.
 *
 * Returns NULL if 'str' is NULL or empty.
 * Returned pointer MUST be freed by the caller if non-NULL. */
char* utf8_to_local_string_alloc(const char *str)
{
   if (!str || !*str)
      return NULL;

#if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
   return mb_to_mb_string_alloc(str, CODEPAGE_UTF8, CODEPAGE_LOCAL);
#else
   /* assume string needs no modification if not on Windows */
   return strdup(str);
#endif
}
362 
/* Returns a newly allocated copy of the local-codepage string 'str'.
 *
 * On non-Xbox Windows builds without UNICODE the copy is converted
 * from CODEPAGE_LOCAL to CODEPAGE_UTF8; on every other build the
 * bytes are duplicated unchanged.
 *
 * Returns NULL if 'str' is NULL or empty.
 * Returned pointer MUST be freed by the caller if non-NULL. */
char* local_to_utf8_string_alloc(const char *str)
{
   if (!str || !*str)
      return NULL;

#if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
   return mb_to_mb_string_alloc(str, CODEPAGE_LOCAL, CODEPAGE_UTF8);
#else
   /* assume string needs no modification if not on Windows */
   return strdup(str);
#endif
}
377 
/* Converts the multibyte string 'str' to a newly allocated,
 * NUL-terminated wide-character string.
 *
 * Windows: 'str' is converted with MultiByteToWideChar() using
 * CP_UTF8, falling back to CP_ACP if the UTF-8 size query fails.
 * Other platforms: 'str' is converted with mbstowcs().
 *
 * Returns NULL if 'str' is NULL or empty, if an allocation fails,
 * or if the conversion fails.
 * Returned pointer MUST be freed by the caller if non-NULL. */
wchar_t* utf8_to_utf16_string_alloc(const char *str)
{
   wchar_t *buf = NULL;

   if (!str || !*str)
      return NULL;

#ifdef _WIN32
   {
      UINT code_page = CP_UTF8;
      int len        = MultiByteToWideChar(code_page, 0, str, -1, NULL, 0);

      if (len <= 0)
      {
         /* fallback to ANSI codepage instead */
         code_page   = CP_ACP;
         len         = MultiByteToWideChar(code_page, 0, str, -1, NULL, 0);
      }

      /* Neither code page could size the string. */
      if (len <= 0)
         return NULL;

      buf = (wchar_t*)calloc(len, sizeof(wchar_t));

      if (!buf)
         return NULL;

      /* MultiByteToWideChar() signals failure by returning 0;
       * it never returns a negative value. */
      if (MultiByteToWideChar(code_page, 0, str, -1, buf, len) <= 0)
      {
         free(buf);
         return NULL;
      }
   }
#else
   {
      /* NOTE: For now, assume non-Windows platforms'
       * locale is already UTF-8. */
      size_t len = mbstowcs(NULL, str, 0);

      /* (size_t)-1 means 'str' holds an invalid multibyte sequence. */
      if (len == (size_t)-1)
         return NULL;

      len++; /* terminator */
      buf = (wchar_t*)calloc(len, sizeof(wchar_t));

      if (!buf)
         return NULL;

      if (mbstowcs(buf, str, len) == (size_t)-1)
      {
         free(buf);
         return NULL;
      }
   }
#endif

   return buf;
}
449 
/* Converts the wide-character string 'str' to a newly allocated,
 * NUL-terminated multibyte string.
 *
 * Windows: 'str' is converted with WideCharToMultiByte() using
 * CP_UTF8, falling back to CP_ACP if the UTF-8 size query fails.
 * Other platforms: 'str' is converted with wcstombs().
 *
 * Returns NULL if 'str' is NULL or empty, if an allocation fails,
 * or if the conversion fails.
 * Returned pointer MUST be freed by the caller if non-NULL. */
char* utf16_to_utf8_string_alloc(const wchar_t *str)
{
   char *buf = NULL;

   if (!str || !*str)
      return NULL;

#ifdef _WIN32
   {
      UINT code_page = CP_UTF8;
      int len        = WideCharToMultiByte(code_page,
            0, str, -1, NULL, 0, NULL, NULL);

      /* fallback to ANSI codepage instead */
      if (len <= 0)
      {
         code_page   = CP_ACP;
         len         = WideCharToMultiByte(code_page,
               0, str, -1, NULL, 0, NULL, NULL);
      }

      /* Neither code page could size the string; do not hand a
       * zero-sized allocation back to the caller. */
      if (len <= 0)
         return NULL;

      buf = (char*)calloc(len, sizeof(char));

      if (!buf)
         return NULL;

      /* WideCharToMultiByte() signals failure by returning 0;
       * it never returns a negative value. */
      if (WideCharToMultiByte(code_page,
            0, str, -1, buf, len, NULL, NULL) <= 0)
      {
         free(buf);
         return NULL;
      }
   }
#else
   {
      /* NOTE: For now, assume non-Windows platforms'
       * locale is already UTF-8. */
      size_t len = wcstombs(NULL, str, 0);

      /* (size_t)-1 means 'str' holds a character that cannot be
       * represented in the current locale. */
      if (len == (size_t)-1)
         return NULL;

      len++; /* terminator */
      buf = (char*)calloc(len, sizeof(char));

      if (!buf)
         return NULL;

      if (wcstombs(buf, str, len) == (size_t)-1)
      {
         free(buf);
         return NULL;
      }
   }
#endif

   return buf;
}
511