1 /* Copyright  (C) 2010-2018 The RetroArch team
2  *
3  * ---------------------------------------------------------------------------------------
4  * The following license statement only applies to this file (encoding_utf.c).
5  * ---------------------------------------------------------------------------------------
6  *
7  * Permission is hereby granted, free of charge,
8  * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation the rights to
10  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11  * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16  * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include <stdint.h>
24 #include <stdlib.h>
25 #include <stddef.h>
26 #include <string.h>
27 
28 #include <boolean.h>
29 #include <compat/strl.h>
30 #include <retro_inline.h>
31 
32 #include <encodings/utf.h>
33 
34 #if defined(_WIN32) && !defined(_XBOX)
35 #include <windows.h>
36 #elif defined(_XBOX)
37 #include <xtl.h>
38 #endif
39 
/* Counts the consecutive 1 bits of 'c', starting at its most
 * significant bit. Yields 0 for 0x00..0x7F and 8 for 0xFF. */
static unsigned leading_ones(uint8_t c)
{
   unsigned count = 0;
   uint8_t  probe;

   /* Walk a single-bit mask from bit 7 downwards until the
    * first 0 bit is found or the mask runs off the low end. */
   for (probe = 0x80; probe != 0 && (c & probe) != 0; probe >>= 1)
      count++;

   return count;
}
51 
/* Decodes UTF-8 bytes from 'in' into UTF-32 code points in 'out'.
 *
 * At most 'in_size' bytes are consumed and at most 'out_chars'
 * code points are stored. Decoding stops early at a lead byte
 * that has exactly one, or more than six, leading 1 bits, and at
 * a sequence that needs more bytes than remain in the input.
 *
 * Simple implementation: continuation bytes are not checked
 * (only their low six bits are used), so the sequence is assumed
 * to be properly synchronized and well-formed.
 *
 * Returns the number of code points written to 'out'. */
size_t utf8_conv_utf32(uint32_t *out, size_t out_chars,
      const char *in, size_t in_size)
{
   size_t written = 0;

   while (in_size != 0 && out_chars != 0)
   {
      unsigned k;
      unsigned tail;
      uint32_t cp;
      uint8_t  lead = *in++;
      unsigned ones = leading_ones(lead);

      /* Invalid lead byte, or we landed on a continuation byte. */
      if (ones == 1 || ones > 6)
         break;

      /* Number of continuation bytes following the lead byte. */
      tail = (ones == 0) ? 0 : ones - 1;

      /* The whole sequence must fit in the remaining input. */
      if (in_size < 1 + tail)
         break;

      /* Payload bits of the lead byte, moved above the tail bits. */
      cp = (lead & ((1 << (7 - ones)) - 1)) << (6 * tail);

      /* Each continuation byte contributes six bits, most
       * significant group first. */
      for (k = tail; k > 0; k--, in++)
         cp |= (*in & 0x3f) << (6 * (k - 1));

      *out++   = cp;
      written++;
      out_chars--;
      in_size -= 1 + tail;
   }

   return written;
}
88 
/* Converts 'in_size' UTF-16 code units from 'in' to UTF-8.
 *
 * 'out' may be NULL, in which case nothing is stored and only
 * the required number of bytes is counted. The capacity of 'out'
 * is never checked here, so the caller must size it beforehand
 * (for example with a first pass using a NULL 'out').
 *
 * '*out_chars' always receives the number of UTF-8 bytes
 * produced (or counted). No NUL terminator is appended.
 *
 * Returns true when all input was converted, false when an
 * unpaired or misordered surrogate was encountered. */
bool utf16_conv_utf8(uint8_t *out, size_t *out_chars,
     const uint16_t *in, size_t in_size)
{
   /* Marker bits of the lead byte, indexed by (trailing bytes - 1). */
   static const uint8_t lead_marker[5] = { 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
   size_t written = 0;
   size_t idx     = 0;
   bool   ok      = true;

   while (idx != in_size)
   {
      unsigned trail;
      uint32_t cp = in[idx++];

      /* ASCII is copied through as a single byte. */
      if (cp < 0x80)
      {
         if (out)
            out[written] = (uint8_t)cp;
         written++;
         continue;
      }

      /* Surrogate range: a high surrogate must be followed
       * immediately by a low surrogate. */
      if (cp >= 0xD800 && cp < 0xE000)
      {
         uint32_t low;

         if (cp >= 0xDC00 || idx == in_size)
         {
            ok = false;
            break;
         }

         low = in[idx++];
         if (low < 0xDC00 || low >= 0xE000)
         {
            ok = false;
            break;
         }

         cp = (((cp - 0xD800) << 10) | (low - 0xDC00)) + 0x10000;
      }

      /* Find how many trailing bytes the code point needs:
       * one trailing byte covers 11 bits, each further one adds 5. */
      trail = 1;
      while (trail < 5 && cp >= (((uint32_t)1) << (trail * 5 + 6)))
         trail++;

      /* Lead byte: marker bits plus the topmost payload bits. */
      if (out)
         out[written] = (uint8_t)(lead_marker[trail - 1]
               + (cp >> (6 * trail)));
      written++;

      /* Trailing bytes: six payload bits each, highest group first. */
      while (trail != 0)
      {
         trail--;
         if (out)
            out[written] = (uint8_t)(0x80
                  + ((cp >> (6 * trail)) & 0x3F));
         written++;
      }
   }

   *out_chars = written;
   return ok;
}
147 
/* Acts mostly like strlcpy.
 *
 * Copies up to 'chars' UTF-8 characters from 's' into 'd',
 * writing at most 'd_len' bytes including the NUL terminator.
 *
 * Whenever anything is written, the result is NUL terminated
 * and never ends in the middle of a multi-byte character.
 *
 * If 's' or 'd' is NULL, or 'd_len' is 0, nothing is written
 * and 0 is returned.
 *
 * Returns the number of bytes copied, not counting the
 * terminator. 's' is assumed to be valid UTF-8.
 * Use only if 'chars' is considerably less than 'd_len'. */
size_t utf8cpy(char *d, size_t d_len, const char *s, size_t chars)
{
   const uint8_t *sb     = (const uint8_t*)s;
   const uint8_t *sb_org = sb;
   size_t copy_len;

   /* A zero-sized destination cannot even hold the terminator;
    * without this guard 'd_len - 1' below would wrap around. */
   if (!s || !d || d_len == 0)
      return 0;

   /* Advance over 'chars' characters: one lead byte plus all
    * continuation bytes (10xxxxxx) that follow it. */
   while (*sb && chars-- > 0)
   {
      sb++;
      while ((*sb & 0xC0) == 0x80)
         sb++;
   }

   /* Too long for the destination: cut at the byte limit, then
    * back up to the start of the character that was split.
    * Never step back past the beginning of the source. */
   if ((size_t)(sb - sb_org) > d_len - 1 /* NUL */)
   {
      sb = sb_org + d_len - 1;
      while (sb > sb_org && (*sb & 0xC0) == 0x80)
         sb--;
   }

   copy_len = (size_t)(sb - sb_org);
   memcpy(d, sb_org, copy_len);
   d[copy_len] = '\0';

   return copy_len;
}
183 
/* Returns a pointer 'chars' UTF-8 characters into 'str'.
 *
 * A character is one byte followed by any number of
 * continuation bytes (10xxxxxx). Skipping stops at the NUL
 * terminator, so if 'str' holds fewer than 'chars' characters
 * the returned pointer refers to the terminator.
 *
 * Returns NULL if 'str' is NULL. */
const char *utf8skip(const char *str, size_t chars)
{
   const uint8_t *strb = (const uint8_t*)str;

   if (!str)
      return NULL;

   /* Checking for the terminator keeps an oversized 'chars'
    * from walking off the end of the string. */
   while (chars != 0 && *strb != '\0')
   {
      strb++;
      while ((*strb & 0xC0) == 0x80)
         strb++;
      chars--;
   }

   return (const char*)strb;
}
197 
/* Returns the number of UTF-8 characters in 'string', counted
 * as the number of bytes that are not continuation bytes
 * (10xxxxxx). A NULL 'string' has length 0. */
size_t utf8len(const char *string)
{
   const char *cursor;
   size_t      count = 0;

   if (string == NULL)
      return 0;

   for (cursor = string; *cursor != '\0'; cursor++)
   {
      /* Every byte except a continuation byte starts a character. */
      if ((*cursor & 0xC0) == 0x80)
         continue;
      count++;
   }

   return count;
}
213 
/* Returns the byte '*string' points at and advances '*string'
 * to the next byte. */
static uint8_t utf8_walkbyte(const char **string)
{
   const char *current = *string;

   *string = current + 1;
   return (uint8_t)*current;
}
218 
/* Decodes the UTF-8 character at '*string', advances '*string'
 * past it and returns its code point.
 *
 * The sequence length (1 to 4 bytes) is taken from the first
 * byte alone. Does not validate the input; returns garbage if
 * it is not UTF-8. */
uint32_t utf8_walk(const char **string)
{
   uint8_t  lead = utf8_walkbyte(string);
   uint32_t acc;

   /* Single-byte (ASCII) character. */
   if (lead < 0x80)
      return lead;

   /* Every multi-byte sequence has at least one more byte,
    * contributing six payload bits. */
   acc = utf8_walkbyte(string) & 0x3F;
   if (lead < 0xE0)
      return acc | (lead & 31) << 6;

   /* Three-byte sequence. */
   acc = (acc << 6) | (utf8_walkbyte(string) & 0x3F);
   if (lead < 0xF0)
      return acc | (lead & 15) << 12;

   /* Four-byte sequence. */
   acc = (acc << 6) | (utf8_walkbyte(string) & 0x3F);
   return acc | (lead & 7) << 18;
}
240 
/* Converts the zero-terminated UTF-16 string 'in' to UTF-8 in a
 * buffer obtained from malloc().
 *
 * The buffer is sized by a first counting pass and has room for
 * one extra byte beyond the converted data; this function does
 * not write that byte. '*utf_data' receives the buffer (NULL if
 * the allocation failed) and '*dest_len' the number of UTF-8
 * bytes produced by the final conversion.
 *
 * Returns false if the allocation or the conversion failed. On
 * conversion failure '*utf_data' is still allocated and has to
 * be released by the caller. */
static bool utf16_to_char(uint8_t **utf_data,
      size_t *dest_len, const uint16_t *in)
{
   unsigned  units;
   uint8_t  *buf;

   /* Length of the input in 16-bit units, terminator excluded. */
   for (units = 0; in[units] != '\0'; units++)
      ;

   /* Counting pass: a NULL output only measures the result. */
   utf16_conv_utf8(NULL, dest_len, in, units);
   *dest_len += 1;

   buf       = (uint8_t*)malloc(*dest_len);
   *utf_data = buf;
   if (!buf)
      return false;

   return utf16_conv_utf8(buf, dest_len, in, units);
}
257 
/* Converts the zero-terminated UTF-16 string 'in' to UTF-8 and
 * copies the result into 's' with strlcpy(), using 'len' as the
 * size of 's'.
 *
 * Returns true on success. On failure 's' is left untouched
 * and false is returned. */
bool utf16_to_char_string(const uint16_t *in, char *s, size_t len)
{
   uint8_t *utf8_buf = NULL;
   size_t   utf8_len = 0;

   if (!utf16_to_char(&utf8_buf, &utf8_len, in))
   {
      /* The buffer may have been allocated before the failure. */
      free(utf8_buf);
      return false;
   }

   /* The converter does not terminate its output. */
   utf8_buf[utf8_len] = 0;
   strlcpy(s, (const char*)utf8_buf, len);

   free(utf8_buf);
   return true;
}
275 
276 /* Returned pointer MUST be freed by the caller if non-NULL. */
mb_to_mb_string_alloc(const char * str,enum CodePage cp_in,enum CodePage cp_out)277 static char* mb_to_mb_string_alloc(const char *str,
278       enum CodePage cp_in, enum CodePage cp_out)
279 {
280    char *path_buf         = NULL;
281    wchar_t *path_buf_wide = NULL;
282    int path_buf_len       = 0;
283    int path_buf_wide_len  = 0;
284 
285    if (!str || !*str)
286       return NULL;
287 
288    (void)path_buf;
289    (void)path_buf_wide;
290    (void)path_buf_len;
291    (void)path_buf_wide_len;
292 
293 #if !defined(_WIN32) || defined(_XBOX)
294    /* assume string needs no modification if not on Windows */
295    return strdup(str);
296 #else
297 #ifdef UNICODE
298    /* TODO/FIXME: Not implemented. */
299    return strdup(str);
300 #else
301 
302    /* Windows 95 will return 0 from these functions with a UTF8 codepage set without MSLU. From an unknown MSDN version (others omit this info):
303     *   - CP_UTF8 Windows 98/Me, Windows NT 4.0 and later: Translate using UTF-8. When this is set, dwFlags must be zero.
304     *   - Windows 95: Under the Microsoft Layer for Unicode, MultiByteToWideChar also supports CP_UTF7 and CP_UTF8.
305     */
306    path_buf_wide_len = MultiByteToWideChar(cp_in, 0, str, -1, NULL, 0);
307 
308    if (path_buf_wide_len)
309    {
310       path_buf_wide = (wchar_t*)
311          calloc(path_buf_wide_len + sizeof(wchar_t), sizeof(wchar_t));
312 
313       if (path_buf_wide)
314       {
315          MultiByteToWideChar(cp_in, 0,
316                str, -1, path_buf_wide, path_buf_wide_len);
317 
318          if (*path_buf_wide)
319          {
320             path_buf_len = WideCharToMultiByte(cp_out, 0,
321                   path_buf_wide, -1, NULL, 0, NULL, NULL);
322 
323             if (path_buf_len)
324             {
325                path_buf = (char*)
326                   calloc(path_buf_len + sizeof(char), sizeof(char));
327 
328                if (path_buf)
329                {
330                   WideCharToMultiByte(cp_out, 0,
331                         path_buf_wide, -1, path_buf,
332                         path_buf_len, NULL, NULL);
333 
334                   free(path_buf_wide);
335 
336                   if (*path_buf)
337                      return path_buf;
338 
339                   free(path_buf);
340                   return NULL;
341                }
342             }
343             else
344             {
345                free(path_buf_wide);
346                return strdup(str);
347             }
348          }
349       }
350    }
351    else
352       return strdup(str);
353 
354    if (path_buf_wide)
355       free(path_buf_wide);
356 
357    return NULL;
358 #endif
359 #endif
360 }
361 
362 /* Returned pointer MUST be freed by the caller if non-NULL. */
utf8_to_local_string_alloc(const char * str)363 char* utf8_to_local_string_alloc(const char *str)
364 {
365    return mb_to_mb_string_alloc(str, CODEPAGE_UTF8, CODEPAGE_LOCAL);
366 }
367 
368 /* Returned pointer MUST be freed by the caller if non-NULL. */
local_to_utf8_string_alloc(const char * str)369 char* local_to_utf8_string_alloc(const char *str)
370 {
371    return mb_to_mb_string_alloc(str, CODEPAGE_LOCAL, CODEPAGE_UTF8);
372 }
373 
/* Converts the NUL-terminated multibyte string 'str' into a
 * newly allocated, zero-terminated wide string.
 *
 * On Windows the input is converted as UTF-8; if the UTF-8 size
 * query reports 0, the ANSI code page is tried instead. On other
 * platforms mbstowcs() is used, i.e. the current locale is
 * assumed to be UTF-8.
 *
 * Returns NULL if 'str' is NULL or empty, if the allocation
 * fails, or if the conversion fails.
 *
 * Returned pointer MUST be freed by the caller if non-NULL. */
wchar_t* utf8_to_utf16_string_alloc(const char *str)
{
#ifdef _WIN32
   int len           = 0;
   int out_len       = 0;
   unsigned codepage = CP_UTF8;
#else
   size_t len        = 0;
   size_t out_len    = 0;
#endif
   wchar_t *buf      = NULL;

   if (!str || !*str)
      return NULL;

#ifdef _WIN32
   len = MultiByteToWideChar(codepage, 0, str, -1, NULL, 0);

   if (!len)
   {
      /* fallback to ANSI codepage instead */
      codepage = CP_ACP;
      len      = MultiByteToWideChar(codepage, 0, str, -1, NULL, 0);
   }

   if (len <= 0)
      return NULL;

   buf = (wchar_t*)calloc(len, sizeof(wchar_t));

   if (!buf)
      return NULL;

   out_len = MultiByteToWideChar(codepage, 0, str, -1, buf, len);

   /* MultiByteToWideChar() reports failure by returning 0, never
    * a negative value. The input is non-empty and converted
    * together with its terminator, so 0 always means failure. */
   if (out_len <= 0)
   {
      free(buf);
      return NULL;
   }
#else
   /* NOTE: For now, assume non-Windows platforms' locale is already UTF-8. */
   len = mbstowcs(NULL, str, 0);

   /* (size_t)-1 signals an invalid multibyte sequence. */
   if (len == (size_t)-1)
      return NULL;

   /* Room for the terminator. */
   len++;

   buf = (wchar_t*)calloc(len, sizeof(wchar_t));

   if (!buf)
      return NULL;

   out_len = mbstowcs(buf, str, len);

   if (out_len == (size_t)-1)
   {
      free(buf);
      return NULL;
   }
#endif

   return buf;
}
445 
/* Converts the zero-terminated wide string 'str' into a newly
 * allocated, NUL-terminated multibyte string.
 *
 * On Windows the output is UTF-8; if the UTF-8 size query
 * reports 0, the ANSI code page is tried instead. On other
 * platforms wcstombs() is used, i.e. the current locale is
 * assumed to be UTF-8.
 *
 * Returns NULL if 'str' is NULL or empty, if the allocation
 * fails, or if the conversion fails.
 *
 * Returned pointer MUST be freed by the caller if non-NULL. */
char* utf16_to_utf8_string_alloc(const wchar_t *str)
{
#ifdef _WIN32
   int len           = 0;
   int out_len       = 0;
   unsigned codepage = CP_UTF8;
#else
   size_t len        = 0;
   size_t out_len    = 0;
#endif
   char *buf         = NULL;

   if (!str || !*str)
      return NULL;

#ifdef _WIN32
   len = WideCharToMultiByte(codepage, 0, str, -1, NULL, 0, NULL, NULL);

   if (!len)
   {
      /* fallback to ANSI codepage instead */
      codepage = CP_ACP;
      len      = WideCharToMultiByte(codepage, 0, str, -1, NULL, 0, NULL, NULL);
   }

   if (len <= 0)
      return NULL;

   buf = (char*)calloc(len, sizeof(char));

   if (!buf)
      return NULL;

   out_len = WideCharToMultiByte(codepage, 0, str, -1, buf, len, NULL, NULL);

   /* WideCharToMultiByte() reports failure by returning 0, never
    * a negative value. The input is non-empty and converted
    * together with its terminator, so 0 always means failure. */
   if (out_len <= 0)
   {
      free(buf);
      return NULL;
   }
#else
   /* NOTE: For now, assume non-Windows platforms' locale is already UTF-8. */
   len = wcstombs(NULL, str, 0);

   /* (size_t)-1 signals a wide character that cannot be converted. */
   if (len == (size_t)-1)
      return NULL;

   /* Room for the terminator. */
   len++;

   buf = (char*)calloc(len, sizeof(char));

   if (!buf)
      return NULL;

   out_len = wcstombs(buf, str, len);

   if (out_len == (size_t)-1)
   {
      free(buf);
      return NULL;
   }
#endif

   return buf;
}
517