1 /*
2  * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
3  *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
4  *
5  * This file is part of lsp-plugins
6  * Created on: 18 июн. 2018 г.
7  *
8  * lsp-plugins is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Lesser General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * any later version.
12  *
13  * lsp-plugins is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public License
19  * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <core/io/charset.h>
23 #include <dsp/endian.h>
24 #include <errno.h>
25 #include <stdlib.h>
26 #include <stdio.h>
27 
28 namespace lsp
29 {
30 #if defined(PLATFORM_WINDOWS)
31     typedef struct codepage_t
32     {
33         const char *name;
34         size_t      codepage;
35     } codepage_t;
36 
    // Lookup table that maps character set names and their aliases to
    // Windows code page identifiers.
    // This is a generated list of codepages supported by Windows,
    // see script:   scripts/perl/core/oi/charset/gen_cp.pl
    // All names are stored in lower case and the entries are kept in ascending
    // strcmp() order, because codepage_from_name() lower-cases the requested
    // name and performs a binary search over this array.
    static const codepage_t win_codepages[] = {
            { "037", 37 },
            { "10000", 10000 },
            { "10001", 10001 },
            { "10002", 10002 },
            { "10003", 10003 },
            { "10004", 10004 },
            { "10005", 10005 },
            { "10006", 10006 },
            { "10007", 10007 },
            { "10008", 10008 },
            { "10010", 10010 },
            { "10017", 10017 },
            { "10021", 10021 },
            { "10029", 10029 },
            { "10079", 10079 },
            { "10081", 10081 },
            { "10082", 10082 },
            { "1026", 1026 },
            { "1047", 1047 },
            { "1140", 1140 },
            { "1141", 1141 },
            { "1142", 1142 },
            { "1143", 1143 },
            { "1144", 1144 },
            { "1145", 1145 },
            { "1146", 1146 },
            { "1147", 1147 },
            { "1148", 1148 },
            { "1149", 1149 },
            { "1200", 1200 },
            { "12000", 12000 },
            { "12001", 12001 },
            { "1201", 1201 },
            { "1250", 1250 },
            { "1251", 1251 },
            { "1252", 1252 },
            { "1253", 1253 },
            { "1254", 1254 },
            { "1255", 1255 },
            { "1256", 1256 },
            { "1257", 1257 },
            { "1258", 1258 },
            { "1361", 1361 },
            { "20000", 20000 },
            { "20001", 20001 },
            { "20002", 20002 },
            { "20003", 20003 },
            { "20004", 20004 },
            { "20005", 20005 },
            { "20105", 20105 },
            { "20106", 20106 },
            { "20107", 20107 },
            { "20108", 20108 },
            { "20127", 20127 },
            { "20261", 20261 },
            { "20269", 20269 },
            { "20273", 20273 },
            { "20277", 20277 },
            { "20278", 20278 },
            { "20280", 20280 },
            { "20284", 20284 },
            { "20285", 20285 },
            { "20290", 20290 },
            { "20297", 20297 },
            { "20420", 20420 },
            { "20423", 20423 },
            { "20424", 20424 },
            { "20833", 20833 },
            { "20838", 20838 },
            { "20866", 20866 },
            { "20871", 20871 },
            { "20880", 20880 },
            { "20905", 20905 },
            { "20924", 20924 },
            { "20932", 20932 },
            { "20936", 20936 },
            { "20949", 20949 },
            { "21025", 21025 },
            { "21027", 21027 },
            { "21866", 21866 },
            { "28591", 28591 },
            { "28592", 28592 },
            { "28593", 28593 },
            { "28594", 28594 },
            { "28595", 28595 },
            { "28596", 28596 },
            { "28597", 28597 },
            { "28598", 28598 },
            { "28599", 28599 },
            { "28603", 28603 },
            { "28605", 28605 },
            { "29001", 29001 },
            { "37", 37 },
            { "38598", 38598 },
            { "437", 437 },
            { "500", 500 },
            { "50220", 50220 },
            { "50221", 50221 },
            { "50222", 50222 },
            { "50225", 50225 },
            { "50227", 50227 },
            { "50229", 50229 },
            { "50930", 50930 },
            { "50931", 50931 },
            { "50933", 50933 },
            { "50935", 50935 },
            { "50936", 50936 },
            { "50937", 50937 },
            { "50939", 50939 },
            { "51932", 51932 },
            { "51936", 51936 },
            { "51949", 51949 },
            { "51950", 51950 },
            { "52936", 52936 },
            { "54936", 54936 },
            { "57002", 57002 },
            { "57003", 57003 },
            { "57004", 57004 },
            { "57005", 57005 },
            { "57006", 57006 },
            { "57007", 57007 },
            { "57008", 57008 },
            { "57009", 57009 },
            { "57010", 57010 },
            { "57011", 57011 },
            { "65000", 65000 },
            { "65001", 65001 },
            { "708", 708 },
            { "709", 709 },
            { "710", 710 },
            { "720", 720 },
            { "737", 737 },
            { "775", 775 },
            { "850", 850 },
            { "852", 852 },
            { "855", 855 },
            { "857", 857 },
            { "858", 858 },
            { "860", 860 },
            { "861", 861 },
            { "862", 862 },
            { "863", 863 },
            { "864", 864 },
            { "865", 865 },
            { "866", 866 },
            { "869", 869 },
            { "870", 870 },
            { "874", 874 },
            { "875", 875 },
            { "932", 932 },
            { "936", 936 },
            { "949", 949 },
            { "950", 950 },
            { "asmo-708", 708 },
            { "asmo708", 708 },
            { "big5", 950 },
            { "cp-1025", 21025 },
            { "cp-1250", 1250 },
            { "cp-1251", 1251 },
            { "cp-1252", 1252 },
            { "cp-1253", 1253 },
            { "cp-1254", 1254 },
            { "cp-1255", 1255 },
            { "cp-1256", 1256 },
            { "cp-1257", 1257 },
            { "cp-1258", 1258 },
            { "cp-21027", 21027 },
            { "cp-50229", 50229 },
            { "cp-50930", 50930 },
            { "cp-50931", 50931 },
            { "cp-50933", 50933 },
            { "cp-50935", 50935 },
            { "cp-50936", 50936 },
            { "cp-50937", 50937 },
            { "cp-50939", 50939 },
            { "cp-51950", 51950 },
            { "cp-709", 709 },
            { "cp-710", 710 },
            { "cp-866", 866 },
            { "cp-874", 874 },
            { "cp-875", 875 },
            { "cp1025", 21025 },
            { "cp1250", 1250 },
            { "cp1251", 1251 },
            { "cp1252", 1252 },
            { "cp1253", 1253 },
            { "cp1254", 1254 },
            { "cp1255", 1255 },
            { "cp1256", 1256 },
            { "cp1257", 1257 },
            { "cp1258", 1258 },
            { "cp21027", 21027 },
            { "cp50229", 50229 },
            { "cp50930", 50930 },
            { "cp50931", 50931 },
            { "cp50933", 50933 },
            { "cp50935", 50935 },
            { "cp50936", 50936 },
            { "cp50937", 50937 },
            { "cp50939", 50939 },
            { "cp51950", 51950 },
            { "cp709", 709 },
            { "cp710", 710 },
            { "cp866", 866 },
            { "cp874", 874 },
            { "cp875", 875 },
            { "csiso2022jp", 50221 },
            { "dos-720", 720 },
            { "dos-862", 862 },
            { "dos720", 720 },
            { "dos862", 862 },
            { "euc-cn", 51936 },
            { "euc-jp", 51932 },
            { "euc-kr", 51949 },
            { "gb18030", 54936 },
            { "gb2312", 936 },
            { "hz-gb-2312", 52936 },
            { "hz-gb2312", 52936 },
            { "ibm-thai", 20838 },
            { "ibm00858", 858 },
            { "ibm00924", 20924 },
            { "ibm01047", 1047 },
            { "ibm01140", 1140 },
            { "ibm01141", 1141 },
            { "ibm01142", 1142 },
            { "ibm01143", 1143 },
            { "ibm01144", 1144 },
            { "ibm01145", 1145 },
            { "ibm01146", 1146 },
            { "ibm01147", 1147 },
            { "ibm01148", 1148 },
            { "ibm01149", 1149 },
            { "ibm037", 37 },
            { "ibm1026", 1026 },
            { "ibm273", 20273 },
            { "ibm277", 20277 },
            { "ibm278", 20278 },
            { "ibm280", 20280 },
            { "ibm284", 20284 },
            { "ibm285", 20285 },
            { "ibm290", 20290 },
            { "ibm297", 20297 },
            { "ibm420", 20420 },
            { "ibm423", 20423 },
            { "ibm424", 20424 },
            { "ibm437", 437 },
            { "ibm500", 500 },
            { "ibm737", 737 },
            { "ibm775", 775 },
            { "ibm850", 850 },
            { "ibm852", 852 },
            { "ibm855", 855 },
            { "ibm857", 857 },
            { "ibm860", 860 },
            { "ibm861", 861 },
            { "ibm863", 863 },
            { "ibm864", 864 },
            { "ibm865", 865 },
            { "ibm869", 869 },
            { "ibm870", 870 },
            { "ibm871", 20871 },
            { "ibm880", 20880 },
            { "ibm905", 20905 },
            { "iso-2022-jp", 50222 },
            { "iso-2022-kr", 50225 },
            { "iso-2022jp", 50222 },
            { "iso-2022kr", 50225 },
            { "iso-8859-1", 28591 },
            { "iso-8859-13", 28603 },
            { "iso-8859-15", 28605 },
            { "iso-8859-2", 28592 },
            { "iso-8859-3", 28593 },
            { "iso-8859-4", 28594 },
            { "iso-8859-5", 28595 },
            { "iso-8859-6", 28596 },
            { "iso-8859-7", 28597 },
            { "iso-8859-8", 28598 },
            { "iso-8859-8-i", 38598 },
            { "iso-8859-8i", 38598 },
            { "iso-8859-9", 28599 },
            { "iso2022-jp", 50222 },
            { "iso2022-kr", 50225 },
            { "iso2022jp", 50222 },
            { "iso2022kr", 50225 },
            { "iso8859-1", 28591 },
            { "iso8859-13", 28603 },
            { "iso8859-15", 28605 },
            { "iso8859-2", 28592 },
            { "iso8859-3", 28593 },
            { "iso8859-4", 28594 },
            { "iso8859-5", 28595 },
            { "iso8859-6", 28596 },
            { "iso8859-7", 28597 },
            { "iso8859-8", 28598 },
            { "iso8859-8-i", 38598 },
            { "iso8859-8i", 38598 },
            { "iso8859-9", 28599 },
            { "johab", 1361 },
            { "koi8-r", 20866 },
            { "koi8-u", 21866 },
            { "koi8r", 20866 },
            { "koi8u", 21866 },
            { "ks-c-5601-1987", 949 },
            { "ks_c_5601-1987", 949 },
            { "macintosh", 10000 },
            { "shift-jis", 932 },
            { "shift_jis", 932 },
            { "unicodefffe", 1201 },
            { "us-ascii", 20127 },
            { "utf-16", 1200 },
            { "utf-16be", 1201 },
            { "utf-16le", 1200 },
            { "utf-32", 12000 },
            { "utf-32be", 12001 },
            { "utf-32le", 12000 },
            { "utf-7", 65000 },
            { "utf-8", 65001 },
            { "utf16", 1200 },
            { "utf16be", 1201 },
            { "utf16le", 1200 },
            { "utf32", 12000 },
            { "utf32be", 12001 },
            { "utf32le", 12000 },
            { "utf7", 65000 },
            { "utf8", 65001 },
            { "windows-1250", 1250 },
            { "windows-1251", 1251 },
            { "windows-1252", 1252 },
            { "windows-1253", 1253 },
            { "windows-1254", 1254 },
            { "windows-1255", 1255 },
            { "windows-1256", 1256 },
            { "windows-1257", 1257 },
            { "windows-1258", 1258 },
            { "windows-874", 874 },
            { "windows1250", 1250 },
            { "windows1251", 1251 },
            { "windows1252", 1252 },
            { "windows1253", 1253 },
            { "windows1254", 1254 },
            { "windows1255", 1255 },
            { "windows1256", 1256 },
            { "windows1257", 1257 },
            { "windows1258", 1258 },
            { "windows874", 874 },
            { "x-chinese-cns", 20000 },
            { "x-chinese-eten", 20002 },
            { "x-chinese_cns", 20000 },
            { "x-cp-20001", 20001 },
            { "x-cp-20003", 20003 },
            { "x-cp-20004", 20004 },
            { "x-cp-20005", 20005 },
            { "x-cp-20261", 20261 },
            { "x-cp-20269", 20269 },
            { "x-cp-20936", 20936 },
            { "x-cp-20949", 20949 },
            { "x-cp-50227", 50227 },
            { "x-cp20001", 20001 },
            { "x-cp20003", 20003 },
            { "x-cp20004", 20004 },
            { "x-cp20005", 20005 },
            { "x-cp20261", 20261 },
            { "x-cp20269", 20269 },
            { "x-cp20936", 20936 },
            { "x-cp20949", 20949 },
            { "x-cp50227", 50227 },
            { "x-ebcdic-koreanextended", 20833 },
            { "x-europa", 29001 },
            { "x-ia5", 20105 },
            { "x-ia5-german", 20106 },
            { "x-ia5-norwegian", 20108 },
            { "x-ia5-swedish", 20107 },
            { "x-ia5german", 20106 },
            { "x-ia5norwegian", 20108 },
            { "x-ia5swedish", 20107 },
            { "x-iscii-as", 57006 },
            { "x-iscii-be", 57003 },
            { "x-iscii-de", 57002 },
            { "x-iscii-gu", 57010 },
            { "x-iscii-ka", 57008 },
            { "x-iscii-ma", 57009 },
            { "x-iscii-or", 57007 },
            { "x-iscii-pa", 57011 },
            { "x-iscii-ta", 57004 },
            { "x-iscii-te", 57005 },
            { "x-mac-arabic", 10004 },
            { "x-mac-ce", 10029 },
            { "x-mac-chinesesimp", 10008 },
            { "x-mac-chinesetrad", 10002 },
            { "x-mac-croatian", 10082 },
            { "x-mac-cyrillic", 10007 },
            { "x-mac-greek", 10006 },
            { "x-mac-hebrew", 10005 },
            { "x-mac-icelandic", 10079 },
            { "x-mac-japanese", 10001 },
            { "x-mac-korean", 10003 },
            { "x-mac-romanian", 10010 },
            { "x-mac-thai", 10021 },
            { "x-mac-turkish", 10081 },
            { "x-mac-ukrainian", 10017 },
            { "x_chinese-eten", 20002 }
    };
442 
get_codepage(LCID locale,bool ansi)443     ssize_t get_codepage(LCID locale, bool ansi)
444     {
445         char buf[32];
446 
447         int res = GetLocaleInfoA(locale, (ansi) ? LOCALE_IDEFAULTANSICODEPAGE : LOCALE_IDEFAULTCODEPAGE, buf, sizeof(buf)-1);
448         if (res == 0)
449         {
450             switch (GetLastError())
451             {
452                 case ERROR_INSUFFICIENT_BUFFER:
453                     return -STATUS_NO_MEM;
454                 case ERROR_INVALID_FLAGS:
455                 case ERROR_INVALID_PARAMETER:
456                     return -STATUS_BAD_ARGUMENTS;
457                 default:
458                     return -STATUS_UNKNOWN_ERR;
459             }
460         }
461 
462         errno = 0;
463         ssize_t cp_num = strtol(buf, NULL, 10);
464         if (errno != 0)
465             return -STATUS_UNSUPPORTED_FORMAT;
466         return cp_num;
467     }
468 
codepage_from_name(const char * charset)469     ssize_t codepage_from_name(const char *charset)
470     {
471         if (charset != NULL)
472         {
473             // Do lower-case the character set
474             size_t n = strlen(charset) + 1;
475             char *lower = static_cast<char *>(alloca(n));
476             for (size_t i=0; i<n; ++i)
477                 lower[i] = tolower(charset[i]);
478 
479             // Perform binary search of character set
480             size_t first = 0, last = sizeof(win_codepages)/sizeof(codepage_t);
481             while (first < last)
482             {
483                 size_t middle = (first + last) >> 1;
484                 int n = strcmp(lower, win_codepages[middle].name);
485                 if (n == 0)
486                     return win_codepages[middle].codepage;
487                 else if (n < 0)
488                     last = middle;
489                 else
490                     first = middle + 1;
491             }
492 
493             return -1;
494         }
495 
496 //        printf("LOCALE_CUSTOM_DEFAULT = %d\n", int(get_codepage(LOCALE_CUSTOM_DEFAULT)));
497 //        printf("LOCALE_USER_DEFAULT = %d\n", int(get_codepage(LOCALE_USER_DEFAULT)));
498 //        printf("LOCALE_SYSTEM_DEFAULT = %d\n", int(get_codepage(LOCALE_SYSTEM_DEFAULT)));
499 //        printf("LOCALE_CUSTOM_UNSPECIFIED = %d\n", int(get_codepage(LOCALE_CUSTOM_UNSPECIFIED)));
500 //        printf("LOCALE_CUSTOM_UI_DEFAULT = %d\n", int(get_codepage(LOCALE_CUSTOM_UI_DEFAULT)));
501 //        printf("LOCALE_INVARIANT = %d\n", int(get_codepage(LOCALE_INVARIANT)));
502 //        printf("GetConsoleWindow() = %d\n", int(GetConsoleWindow()));
503 //        printf("GetConsoleOutputCP() = %d\n", int(GetConsoleOutputCP()));
504 //        fflush(stdout);
505 
506         // Obtain system character set
507         //ssize_t cp = (GetConsoleWindow() != 0) ? GetConsoleOutputCP() : get_codepage(LOCALE_CUSTOM_DEFAULT);
508         ssize_t cp = get_codepage(LOCALE_CUSTOM_DEFAULT);
509         if (cp < 0)
510             cp = get_codepage(LOCALE_USER_DEFAULT);
511         if (cp < 0)
512             cp = get_codepage(LOCALE_SYSTEM_DEFAULT);
513         return cp;
514     }
515 
516 #else
517     iconv_t init_iconv_to_wchar_t(const char *charset)
518     {
519         // Fetch system character set if it is not set
520         if (charset == NULL)
521         {
522             // Save current locale
523             char *current = setlocale(LC_CTYPE, NULL);
524             if (current == NULL)
525                 return iconv_t(-1);
526             size_t len = strlen(current) + 1;
527             char *psaved = static_cast<char *>(alloca(len));
528             ::memcpy(psaved, current, len);
529             charset = psaved;
530 
531             // Get system locale
532             current = setlocale(LC_CTYPE, "");
533             if (current != NULL)
534                 current = strchr(current, '.');
535 
536             // Scan for character set
537             if (current != NULL)
538             {
539                 len = strlen(current);
540                 psaved = static_cast<char *>(alloca(len));
541                 ::memcpy(psaved, &current[1], len);
542             }
543 
544             // Restore saved locale
545             setlocale(LC_CTYPE, charset);
546 
547             // Update locale
548             charset  = (current != NULL) ? psaved : "UTF-8";
549         }
550 
551         // Open conversion
552         iconv_t res = iconv_open(__IF_LEBE("UTF-32LE", "UTF-32BE"), charset);
553         if (res != iconv_t(-1))
554             return res;
555 
556         res = iconv_open(__IF_LEBE("UTF-32LE", "UTF-32BE"), "UTF-8");
557         if (res != iconv_t(-1))
558             return res;
559 
560         return iconv_open("WCHAR_T", "UTF-8");
561     }
562 
563     iconv_t init_iconv_from_wchar_t(const char *charset)
564     {
565         // Fetch system charset if it is not set
566         if (charset == NULL)
567         {
568             // Save current locale
569             char *current = setlocale(LC_CTYPE, NULL);
570             if (current == NULL)
571                 return iconv_t(-1);
572             size_t len = strlen(current) + 1;
573             char *psaved = static_cast<char *>(alloca(len));
574             ::memcpy(psaved, current, len);
575             charset = psaved;
576 
577             // Get system locale
578             current = setlocale(LC_CTYPE, "");
579             if (current != NULL)
580                 current = strchr(current, '.');
581 
582             // Scan for character set
583             if (current != NULL)
584             {
585                 len = strlen(current);
586                 psaved = static_cast<char *>(alloca(len));
587                 ::memcpy(psaved, &current[1], len);
588             }
589 
590             // Restore saved locale
591             setlocale(LC_CTYPE, charset);
592 
593             // Update charset
594             charset  = (current != NULL) ? psaved : "UTF-8";
595         }
596 
597         // Open conversion
598         iconv_t res = iconv_open(charset, __IF_LEBE("UTF-32LE", "UTF-32BE"));
599         if (res != iconv_t(-1))
600             return res;
601 
602         res = iconv_open("UTF-8", __IF_LEBE("UTF-32LE", "UTF-32BE"));
603         if (res != iconv_t(-1))
604             return res;
605 
606         return iconv_open("UTF-8", "WCHAR_T");
607     }
608 #endif
609 
610     //-------------------------------------------------------------------------
611     // UTF-16 helper routines
read_utf16le_codepoint(const lsp_utf16_t ** str)612     lsp_utf32_t read_utf16le_codepoint(const lsp_utf16_t **str)
613     {
614         uint32_t cp, sc;
615         const lsp_utf16_t *s = *str;
616 
617         cp = LE_TO_CPU(*(s++));
618         if (cp == 0)
619             return cp;
620 
621         sc = cp & 0xfc00;
622         if (sc == 0xd800) // cp = Surrogate high
623         {
624             sc = LE_TO_CPU(*s);
625             if ((sc & 0xfc00) == 0xdc00)
626             {
627                 ++s;
628                 cp  = 0x10000 + (((cp & 0x3ff) << 10) | (sc & 0x3ff));
629             }
630             else
631                 cp  = 0xfffd;
632         }
633         else if (sc == 0xdc00) // Surrogate low?
634         {
635             sc = LE_TO_CPU(*s);
636             if ((sc & 0xfc00) == 0xd800)
637             {
638                 ++s;
639                 cp  = 0x10000 + (((sc & 0x3ff) << 10) | (cp & 0x3ff));
640             }
641             else
642                 cp  = 0xfffd;
643         }
644 
645         *str = s;
646         return cp;
647     }
648 
read_utf16be_codepoint(const lsp_utf16_t ** str)649     lsp_utf32_t read_utf16be_codepoint(const lsp_utf16_t **str)
650     {
651         uint32_t cp, sc;
652         const lsp_utf16_t *s = *str;
653 
654         cp = BE_TO_CPU(*(s++));
655         if (cp == 0)
656             return cp;
657 
658         sc = cp & 0xfc00;
659         if (sc == 0xd800) // cp = Surrogate high
660         {
661             sc = BE_TO_CPU(*s);
662             if ((sc & 0xfc00) == 0xdc00)
663             {
664                 ++s;
665                 cp  = 0x10000 + (((cp & 0x3ff) << 10) | (sc & 0x3ff));
666             }
667             else
668                 cp  = 0xfffd;
669         }
670         else if (sc == 0xdc00) // Surrogate low?
671         {
672             sc = BE_TO_CPU(*s);
673             if ((sc & 0xfc00) == 0xd800)
674             {
675                 ++s;
676                 cp  = 0x10000 + (((sc & 0x3ff) << 10) | (cp & 0x3ff));
677             }
678             else
679                 cp  = 0xfffd;
680         }
681 
682         *str = s;
683         return cp;
684     }
685 
read_utf16le_streaming(const lsp_utf16_t ** str,size_t * nsrc,bool force)686     lsp_utf32_t read_utf16le_streaming(const lsp_utf16_t **str, size_t *nsrc, bool force)
687     {
688         if (*nsrc <= 0)
689             return LSP_UTF32_EOF;
690 
691         uint32_t cp, sc;
692         const lsp_utf16_t *s = *str;
693 
694         cp = LE_TO_CPU(*(s++));
695         sc = cp & 0xfc00;
696         if (sc == 0xd800) // cp = Surrogate high
697         {
698             if (*nsrc > 1)
699                 sc      = LE_TO_CPU(*s);
700             else if (force)
701                 sc      = 0;
702             else
703                 return LSP_UTF32_EOF;
704 
705             if ((sc & 0xfc00) == 0xdc00)
706             {
707                 ++s;
708                 cp  = 0x10000 + (((cp & 0x3ff) << 10) | (sc & 0x3ff));
709             }
710             else
711                 cp  = 0xfffd;
712         }
713         else if (sc == 0xdc00) // Surrogate low?
714         {
715             if (*nsrc > 1)
716                 sc      = LE_TO_CPU(*s);
717             else if (force)
718                 sc      = 0;
719             else
720                 return LSP_UTF32_EOF;
721 
722             if ((sc & 0xfc00) == 0xd800)
723             {
724                 ++s;
725                 cp  = 0x10000 + (((sc & 0x3ff) << 10) | (cp & 0x3ff));
726             }
727             else
728                 cp  = 0xfffd;
729         }
730 
731         *nsrc  -= (s - *str);
732         *str    = s;
733         return cp;
734     }
735 
read_utf16be_streaming(const lsp_utf16_t ** str,size_t * nsrc,bool force)736     lsp_utf32_t read_utf16be_streaming(const lsp_utf16_t **str, size_t *nsrc, bool force)
737     {
738         if (*nsrc <= 0)
739             return LSP_UTF32_EOF;
740 
741         uint32_t cp, sc;
742         const lsp_utf16_t *s = *str;
743 
744         cp = BE_TO_CPU(*(s++));
745         sc = cp & 0xfc00;
746         if (sc == 0xd800) // cp = Surrogate high
747         {
748             if (*nsrc > 1)
749                 sc      = BE_TO_CPU(*s);
750             else if (force)
751                 sc      = 0;
752             else
753                 return LSP_UTF32_EOF;
754 
755             if ((sc & 0xfc00) == 0xdc00)
756             {
757                 ++s;
758                 cp  = 0x10000 + (((cp & 0x3ff) << 10) | (sc & 0x3ff));
759             }
760             else
761                 cp  = 0xfffd;
762         }
763         else if (sc == 0xdc00) // Surrogate low?
764         {
765             if (*nsrc > 1)
766                 sc      = BE_TO_CPU(*s);
767             else if (force)
768                 sc      = 0;
769             else
770                 return LSP_UTF32_EOF;
771 
772             if ((sc & 0xfc00) == 0xd800)
773             {
774                 ++s;
775                 cp  = 0x10000 + (((sc & 0x3ff) << 10) | (cp & 0x3ff));
776             }
777             else
778                 cp  = 0xfffd;
779         }
780 
781         *nsrc  -= (s - *str);
782         *str    = s;
783         return cp;
784     }
785 
sizeof_utf16(lsp_utf32_t cp)786     inline size_t sizeof_utf16(lsp_utf32_t cp)
787     {
788         return (cp < 0x10000) ? 2 : 4;
789     }
790 
count_utf16(lsp_utf32_t cp)791     inline size_t count_utf16(lsp_utf32_t cp)
792     {
793         return (cp < 0x10000) ? 1 : 2;
794     }
795 
write_utf16le_codepoint(lsp_utf16_t ** str,lsp_utf32_t cp)796     void write_utf16le_codepoint(lsp_utf16_t **str, lsp_utf32_t cp)
797     {
798         lsp_utf16_t *dst = *str;
799         if (cp < 0x10000)
800             *(dst++)        = CPU_TO_LE(lsp_utf16_t(cp));
801         else
802         {
803             cp     -= 0x10000;
804             dst[0]  = CPU_TO_LE(lsp_utf16_t(0xd800 | (cp >> 10)));
805             dst[1]  = CPU_TO_LE(lsp_utf16_t(0xdc00 | (cp & 0x3ff)));
806             dst    += 2;
807         }
808         *str    = dst;
809     }
810 
write_utf16be_codepoint(lsp_utf16_t ** str,lsp_utf32_t cp)811     void write_utf16be_codepoint(lsp_utf16_t **str, lsp_utf32_t cp)
812     {
813         lsp_utf16_t *dst = *str;
814         if (cp < 0x10000)
815             *(dst++)        = CPU_TO_BE(lsp_utf16_t(cp));
816         else
817         {
818             cp     -= 0x10000;
819             dst[0]  = CPU_TO_BE(lsp_utf16_t(0xd800 | (cp >> 10)));
820             dst[1]  = CPU_TO_BE(lsp_utf16_t(0xdc00 | (cp & 0x3ff)));
821             dst    += 2;
822         }
823         *str    = dst;
824     }
825 
826     //-------------------------------------------------------------------------
827     // UTF-8 helper routines
read_utf8_codepoint(const char ** str)828     lsp_utf32_t read_utf8_codepoint(const char **str)
829     {
830         lsp_utf32_t cp, sp;
831         size_t bytes;
832         const char *s = *str;
833 
834         // Decode primary byte
835         cp = uint8_t(*s);
836         if (cp <= 0x7f)
837         {
838             *str    = (cp == 0) ? s : s+1;
839             return cp;
840         }
841 
842         ++s;
843         if ((cp & 0xe0) == 0xc0) // 2 bytes: 110xxxxx 10xxxxxx
844         {
845             cp     &= 0x1f;
846             bytes   = (cp >= 0x02) ? 1 : 0;
847         }
848         else if ((cp & 0xf0) == 0xe0) // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
849         {
850             cp     &= 0x0f;
851             bytes   = (cp) ? 2 : 0;
852         }
853         else if ((cp & 0xf8) == 0xf0) // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
854         {
855             cp     &= 0x07;
856             bytes   = 3;
857         }
858         else
859             bytes   = 0;
860 
861         // Invalid first byte sequence?
862         if (!bytes)
863         {
864             *str    = s;
865             return 0xfffd;
866         }
867 
868         // Decode extension bytes
869         for (size_t i=0; i<bytes; ++i)
870         {
871             sp  = uint8_t(*s);
872             if ((sp & 0xc0) != 0x80) // Invalid sequence?
873             {
874                 *str    = (sp == 0) ? s : s+1;
875                 return 0xfffd;
876             }
877             cp     = (cp << 6) | (sp & 0x3f);
878             ++s;
879         }
880 
881         if ((bytes == 3) && (cp < 0x10000)) // Check that 4-byte sequence is valid
882             cp      = 0xfffd;
883         else if ((cp >= 0xd800) && (cp < 0xe000)) // Check for surrogates
884             cp      = 0xfffd;
885 
886         *str = s;
887         return cp;
888     }
889 
read_utf8_streaming(const char ** str,size_t * nsrc,bool force)890     lsp_utf32_t read_utf8_streaming(const char **str, size_t *nsrc, bool force)
891     {
892         if (*nsrc <= 0)
893             return LSP_UTF32_EOF;
894 
895         lsp_utf32_t cp, sp;
896         size_t bytes;
897         const char *s = *str;
898 
899         // Decode primary byte
900         cp = uint8_t(*s);
901         if (cp <= 0x7f)
902         {
903             *str    = (cp == 0) ? s : s+1;
904             --(*nsrc);
905             return cp;
906         }
907 
908         // Multi-byte sequence
909         ++s;
910         if ((cp & 0xe0) == 0xc0) // 2 bytes: 110xxxxx 10xxxxxx
911         {
912             cp     &= 0x1f;
913             bytes   = (cp >= 0x02) ? 1 : 0;
914         }
915         else if ((cp & 0xf0) == 0xe0) // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
916         {
917             cp     &= 0x0f;
918             bytes   = (cp) ? 2 : 0;
919         }
920         else if ((cp & 0xf8) == 0xf0) // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
921         {
922             cp     &= 0x07;
923             bytes   = 3;
924         }
925         else
926             bytes   = 0;
927 
928         // Invalid first byte sequence?
929         if (!bytes)
930         {
931             *str    = s;
932             --(*nsrc);
933             return 0xfffd;
934         }
935         else if (bytes >= *nsrc)
936         {
937             if (force)
938             {
939                 *nsrc   = 0;
940                 return 0xfffd;
941             }
942             return LSP_UTF32_EOF;
943         }
944 
945         // Decode extension bytes
946         for (size_t i=0; i<bytes; ++i)
947         {
948             sp  = uint8_t(*s);
949             if ((sp & 0xc0) != 0x80) // Invalid sequence?
950             {
951                 if (sp == 0)
952                     ++s;
953                 *nsrc  -= (s - *str);
954                 *str    = s;
955                 return 0xfffd;
956             }
957             cp     = (cp << 6) | (sp & 0x3f);
958             ++s;
959         }
960 
961         if ((bytes == 3) && (cp < 0x10000)) // Check that 4-byte sequence is valid
962             cp      = 0xfffd;
963         else if ((cp >= 0xd800) && (cp < 0xe000)) // Check for surrogates
964             cp      = 0xfffd;
965 
966         *nsrc      -= (s - *str);
967         *str        = s;
968         return cp;
969     }
970 
sizeof_utf8(lsp_utf32_t cp)971     inline size_t sizeof_utf8(lsp_utf32_t cp)
972     {
973         if (cp >= 0x800)
974             return ((cp < 0x10000) || (cp >= 0x200000)) ? 3 : 4;
975         else
976             return (cp >= 0x80) ? 2 : 1;
977     }
978 
count_utf8(lsp_utf32_t cp)979     inline size_t count_utf8(lsp_utf32_t cp)
980     {
981         if (cp >= 0x800)
982             return ((cp < 0x10000) || (cp >= 0x200000)) ? 3 : 4;
983         else
984             return (cp >= 0x80) ? 2 : 1;
985     }
986 
write_utf8_codepoint(char ** str,lsp_utf32_t cp)987     void write_utf8_codepoint(char **str, lsp_utf32_t cp)
988     {
989         char *dst = *str;
990         if (cp >= 0x800) // 3-4 bytes
991         {
992             if (cp < 0x10000) // 3 bytes
993             {
994                 dst[0]      = (cp >> 12) | 0xe0;
995                 dst[1]      = ((cp >> 6) & 0x3f) | 0x80;
996                 dst[2]      = (cp & 0x3f) | 0x80;
997                 dst        += 3;
998             }
999             else if (cp < 0x200000) // 4 bytes
1000             {
1001                 dst[0]      = (cp >> 16) | 0xf0;
1002                 dst[1]      = ((cp >> 12) & 0x3f) | 0x80;
1003                 dst[2]      = ((cp >> 6) & 0x3f) | 0x80;
1004                 dst[3]      = (cp & 0x3f) | 0x80;
1005                 dst        += 4;
1006             }
1007             else // Invalid character, emit 3 bytes of 0xfffd code point value
1008             {
1009                 dst[0]      = 0xef;
1010                 dst[1]      = 0xbf;
1011                 dst[2]      = 0xbd;
1012                 dst        += 3;
1013             }
1014         }
1015         else // 1-2 bytes
1016         {
1017             if (cp >= 0x80) // 2 bytes
1018             {
1019                 dst[0]      = (cp >> 6) | 0xc0;
1020                 dst[1]      = (cp & 0x3f) | 0x80;
1021                 dst        += 2;
1022             }
1023             else // 1 byte
1024                 *(dst++)    = char(cp);
1025         }
1026         *str    = dst;
1027     }
1028 
1029     //-------------------------------------------------------------------------
1030     // UTF-8 non-streaming routines
utf8_to_utf16le(const char * str)1031     lsp_utf16_t *utf8_to_utf16le(const char *str)
1032     {
1033         // Estimate number of bytes
1034         lsp_utf32_t cp;
1035         size_t bytes    = 0;
1036         const char *p = str;
1037         do
1038         {
1039             cp      = read_utf8_codepoint(&p);
1040             bytes  += sizeof_utf16(cp);
1041         } while (cp != 0);
1042 
1043         // Allocate memory
1044         lsp_utf16_t *utf16  = reinterpret_cast<lsp_utf16_t *>(::malloc(bytes));
1045         if (utf16 == NULL)
1046             return NULL;
1047 
1048         // Perform encoding
1049         lsp_utf16_t *dst = utf16;
1050         p               = str;
1051         while ((cp = read_utf8_codepoint(&p)) != 0)
1052             write_utf16le_codepoint(&dst, cp);
1053         *dst        = 0;
1054 
1055         return utf16;
1056     }
1057 
utf8_to_utf16be(const char * str)1058     lsp_utf16_t *utf8_to_utf16be(const char *str)
1059     {
1060         // Estimate number of bytes
1061         lsp_utf32_t cp;
1062         size_t bytes    = 0;
1063         const char *p = str;
1064         do
1065         {
1066             cp      = read_utf8_codepoint(&p);
1067             bytes  += sizeof_utf16(cp);
1068         } while (cp != 0);
1069 
1070         // Allocate memory
1071         lsp_utf16_t *utf16  = reinterpret_cast<lsp_utf16_t *>(::malloc(bytes));
1072         if (utf16 == NULL)
1073             return NULL;
1074 
1075         // Perform encoding
1076         lsp_utf16_t *dst = utf16;
1077         p               = str;
1078         while ((cp = read_utf8_codepoint(&p)) != 0)
1079             write_utf16be_codepoint(&dst, cp);
1080         *dst        = 0;
1081 
1082         return utf16;
1083     }
1084 
utf8_to_utf32le(const char * str)1085     lsp_utf32_t *utf8_to_utf32le(const char *str)
1086     {
1087         // Estimate number of bytes
1088         lsp_utf32_t cp;
1089         size_t bytes    = 0;
1090         const char *p = str;
1091         do
1092         {
1093             cp      = read_utf8_codepoint(&p);
1094             bytes  += sizeof(lsp_utf32_t);
1095         } while (cp != 0);
1096 
1097         // Allocate memory
1098         lsp_utf32_t *utf32  = reinterpret_cast<lsp_utf32_t *>(::malloc(bytes));
1099         if (utf32 == NULL)
1100             return NULL;
1101 
1102         // Perform encoding
1103         lsp_utf32_t *dst = utf32;
1104         p               = str;
1105         while ((cp = read_utf8_codepoint(&p)) != 0)
1106             *(dst++)    = CPU_TO_LE(cp);
1107         *dst        = 0;
1108 
1109         return utf32;
1110     }
1111 
utf8_to_utf32be(const char * str)1112     lsp_utf32_t *utf8_to_utf32be(const char *str)
1113     {
1114         // Estimate number of bytes
1115         lsp_utf32_t cp;
1116         size_t bytes    = 0;
1117         const char *p = str;
1118         do
1119         {
1120             cp      = read_utf8_codepoint(&p);
1121             bytes  += sizeof(lsp_utf32_t);
1122         } while (cp != 0);
1123 
1124         // Allocate memory
1125         lsp_utf32_t *utf32  = reinterpret_cast<lsp_utf32_t *>(::malloc(bytes));
1126         if (utf32 == NULL)
1127             return NULL;
1128 
1129         // Perform encoding
1130         lsp_utf32_t *dst = utf32;
1131         p               = str;
1132         while ((cp = read_utf8_codepoint(&p)) != 0)
1133             *(dst++)    = CPU_TO_BE(cp);
1134         *dst        = 0;
1135 
1136         return utf32;
1137     }
1138 
1139     //-------------------------------------------------------------------------
1140     // UTF-16 non-streaming routines
utf16le_to_utf8(const lsp_utf16_t * str)1141     char *utf16le_to_utf8(const lsp_utf16_t *str)
1142     {
1143         // Estimate number of bytes
1144         lsp_utf32_t cp;
1145         size_t bytes = 0;
1146         const lsp_utf16_t *p = str;
1147         do
1148         {
1149             cp          = read_utf16le_codepoint(&p);
1150             bytes      += sizeof_utf8(cp);
1151         } while (cp != 0);
1152 
1153         // Allocate memory
1154         char *utf8  = reinterpret_cast<char *>(::malloc(bytes));
1155         if (utf8 == NULL)
1156             return NULL;
1157 
1158         // Now perform encoding
1159         char *dst   = utf8;
1160         p           = str;
1161         while ((cp = read_utf16le_codepoint(&p)) != 0)
1162             write_utf8_codepoint(&dst, cp);
1163         *dst = '\0';
1164 
1165         return utf8;
1166     }
1167 
utf16be_to_utf8(const lsp_utf16_t * str)1168     char *utf16be_to_utf8(const lsp_utf16_t *str)
1169     {
1170         // Estimate number of bytes
1171         lsp_utf32_t cp;
1172         size_t bytes = 0;
1173         const lsp_utf16_t *p = str;
1174         do
1175         {
1176             cp          = read_utf16be_codepoint(&p);
1177             bytes      += sizeof_utf8(cp);
1178         } while (cp != 0);
1179 
1180         // Allocate memory
1181         char *utf8  = reinterpret_cast<char *>(::malloc(bytes));
1182         if (utf8 == NULL)
1183             return NULL;
1184 
1185         // Now perform encoding
1186         char *dst   = utf8;
1187         p           = str;
1188         while ((cp = read_utf16be_codepoint(&p)) != 0)
1189             write_utf8_codepoint(&dst, cp);
1190         *dst = '\0';
1191 
1192         return utf8;
1193     }
1194 
utf16le_to_utf32le(const lsp_utf16_t * str)1195     lsp_utf32_t *utf16le_to_utf32le(const lsp_utf16_t *str)
1196     {
1197         // Estimate number of bytes
1198         lsp_utf32_t cp;
1199         size_t bytes = 0;
1200         const lsp_utf16_t *p = str;
1201         do
1202         {
1203             cp          = read_utf16le_codepoint(&p);
1204             bytes      += sizeof(lsp_utf32_t);
1205         } while (cp != 0);
1206 
1207         // Allocate memory
1208         lsp_utf32_t *utf32  = reinterpret_cast<lsp_utf32_t *>(::malloc(bytes));
1209         if (utf32 == NULL)
1210             return NULL;
1211 
1212         // Perform encoding
1213         p               = str;
1214         lsp_utf32_t *dst= utf32;
1215         while ((cp = read_utf16le_codepoint(&p)) != 0)
1216             *(dst++)        = CPU_TO_LE(cp);
1217         *dst            = 0;
1218 
1219         return utf32;
1220     }
1221 
utf16le_to_utf32be(const lsp_utf16_t * str)1222     lsp_utf32_t *utf16le_to_utf32be(const lsp_utf16_t *str)
1223     {
1224         // Estimate number of bytes
1225         lsp_utf32_t cp;
1226         size_t bytes = 0;
1227         const lsp_utf16_t *p = str;
1228         do
1229         {
1230             cp          = read_utf16le_codepoint(&p);
1231             bytes      += sizeof(lsp_utf32_t);
1232         } while (cp != 0);
1233 
1234         // Allocate memory
1235         lsp_utf32_t *utf32  = reinterpret_cast<lsp_utf32_t *>(::malloc(bytes));
1236         if (utf32 == NULL)
1237             return NULL;
1238 
1239         // Perform encoding
1240         p               = str;
1241         lsp_utf32_t *dst= utf32;
1242         while ((cp = read_utf16le_codepoint(&p)) != 0)
1243             *(dst++)        = CPU_TO_BE(cp);
1244         *dst            = 0;
1245 
1246         return utf32;
1247     }
1248 
utf16be_to_utf32le(const lsp_utf16_t * str)1249     lsp_utf32_t *utf16be_to_utf32le(const lsp_utf16_t *str)
1250     {
1251         // Estimate number of bytes
1252         lsp_utf32_t cp;
1253         size_t bytes = 0;
1254         const lsp_utf16_t *p = str;
1255         do
1256         {
1257             cp          = read_utf16be_codepoint(&p);
1258             bytes      += sizeof(lsp_utf32_t);
1259         } while (cp != 0);
1260 
1261         // Allocate memory
1262         lsp_utf32_t *utf32  = reinterpret_cast<lsp_utf32_t *>(::malloc(bytes));
1263         if (utf32 == NULL)
1264             return NULL;
1265 
1266         // Perform encoding
1267         p               = str;
1268         lsp_utf32_t *dst= utf32;
1269         while ((cp = read_utf16be_codepoint(&p)) != 0)
1270             *(dst++)        = CPU_TO_LE(cp);
1271         *dst            = 0;
1272 
1273         return utf32;
1274     }
1275 
utf16be_to_utf32be(const lsp_utf16_t * str)1276     lsp_utf32_t *utf16be_to_utf32be(const lsp_utf16_t *str)
1277     {
1278         // Estimate number of bytes
1279         lsp_utf32_t cp;
1280         size_t bytes = 0;
1281         const lsp_utf16_t *p = str;
1282         do
1283         {
1284             cp          = read_utf16be_codepoint(&p);
1285             bytes      += sizeof(lsp_utf32_t);
1286         } while (cp != 0);
1287 
1288         // Allocate memory
1289         lsp_utf32_t *utf32  = reinterpret_cast<lsp_utf32_t *>(::malloc(bytes));
1290         if (utf32 == NULL)
1291             return NULL;
1292 
1293         // Perform encoding
1294         p               = str;
1295         lsp_utf32_t *dst= utf32;
1296         while ((cp = read_utf16be_codepoint(&p)) != 0)
1297             *(dst++)        = CPU_TO_BE(cp);
1298         *dst            = 0;
1299 
1300         return utf32;
1301     }
1302 
1303     //-------------------------------------------------------------------------
1304     // UTF-32 non-streaming routines
utf32le_to_utf8(const lsp_utf32_t * str)1305     char *utf32le_to_utf8(const lsp_utf32_t *str)
1306     {
1307         lsp_utf32_t cp;
1308         size_t bytes = 0;
1309         const lsp_utf32_t *p = str;
1310 
1311         // Estimate length
1312         do
1313         {
1314             cp          = LE_TO_CPU(*(p++));
1315             bytes      += sizeof_utf8(cp);
1316         } while (cp != 0);
1317 
1318         // Allocate memory
1319         char *utf8      = reinterpret_cast<char *>(::malloc(bytes));
1320         if (utf8 == NULL)
1321             return NULL;
1322 
1323         // Perform encoding
1324         p               = str;
1325         char *dst       = utf8;
1326         while ((cp = *(p++)) != 0)
1327             write_utf8_codepoint(&dst, cp);
1328 
1329         *dst = 0;
1330         return utf8;
1331     }
1332 
utf32be_to_utf8(const lsp_utf32_t * str)1333     char *utf32be_to_utf8(const lsp_utf32_t *str)
1334     {
1335         lsp_utf32_t cp;
1336         size_t bytes = 0;
1337         const lsp_utf32_t *p = str;
1338 
1339         // Estimate length
1340         do
1341         {
1342             cp          = BE_TO_CPU(*(p++));
1343             bytes      += sizeof_utf8(cp);
1344         } while (cp != 0);
1345 
1346         // Allocate memory
1347         char *utf8      = reinterpret_cast<char *>(::malloc(bytes));
1348         if (utf8 == NULL)
1349             return NULL;
1350 
1351         // Perform encoding
1352         p               = str;
1353         char *dst       = utf8;
1354         while ((cp = *(p++)) != 0)
1355             write_utf8_codepoint(&dst, cp);
1356 
1357         *dst = 0;
1358         return utf8;
1359     }
1360 
utf32le_to_utf16le(const lsp_utf32_t * str)1361     lsp_utf16_t *utf32le_to_utf16le(const lsp_utf32_t *str)
1362     {
1363         lsp_utf32_t cp;
1364         size_t bytes = 0;
1365         const lsp_utf32_t *p = str;
1366 
1367         // Estimate length
1368         do
1369         {
1370             cp          = LE_TO_CPU(*(p++));
1371             bytes      += sizeof_utf16(cp);
1372         } while (cp != 0);
1373 
1374         // Allocate memory
1375         lsp_utf16_t *utf16  = reinterpret_cast<lsp_utf16_t *>(::malloc(bytes));
1376         if (utf16 == NULL)
1377             return NULL;
1378 
1379         // Perform encoding
1380         p               = str;
1381         lsp_utf16_t *dst= utf16;
1382         while ((cp = *(p++)) != 0)
1383             write_utf16le_codepoint(&dst, cp);
1384 
1385         *dst = 0;
1386         return utf16;
1387     }
1388 
utf32le_to_utf16be(const lsp_utf32_t * str)1389     lsp_utf16_t *utf32le_to_utf16be(const lsp_utf32_t *str)
1390     {
1391         lsp_utf32_t cp;
1392         size_t bytes = 0;
1393         const lsp_utf32_t *p = str;
1394 
1395         // Estimate length
1396         do
1397         {
1398             cp          = LE_TO_CPU(*(p++));
1399             bytes      += sizeof_utf16(cp);
1400         } while (cp != 0);
1401 
1402         // Allocate memory
1403         lsp_utf16_t *utf16  = reinterpret_cast<lsp_utf16_t *>(::malloc(bytes));
1404         if (utf16 == NULL)
1405             return NULL;
1406 
1407         // Perform encoding
1408         p               = str;
1409         lsp_utf16_t *dst= utf16;
1410         while ((cp = *(p++)) != 0)
1411             write_utf16be_codepoint(&dst, cp);
1412 
1413         *dst = 0;
1414         return utf16;
1415     }
1416 
utf32be_to_utf16le(const lsp_utf32_t * str)1417     lsp_utf16_t *utf32be_to_utf16le(const lsp_utf32_t *str)
1418     {
1419         lsp_utf32_t cp;
1420         size_t bytes = 0;
1421         const lsp_utf32_t *p = str;
1422 
1423         // Estimate length
1424         do
1425         {
1426             cp          = BE_TO_CPU(*(p++));
1427             bytes      += sizeof_utf16(cp);
1428         } while (cp != 0);
1429 
1430         // Allocate memory
1431         lsp_utf16_t *utf16  = reinterpret_cast<lsp_utf16_t *>(::malloc(bytes));
1432         if (utf16 == NULL)
1433             return NULL;
1434 
1435         // Perform encoding
1436         p               = str;
1437         lsp_utf16_t *dst= utf16;
1438         while ((cp = *(p++)) != 0)
1439             write_utf16le_codepoint(&dst, cp);
1440 
1441         *dst = 0;
1442         return utf16;
1443     }
1444 
utf32be_to_utf16be(const lsp_utf32_t * str)1445     lsp_utf16_t *utf32be_to_utf16be(const lsp_utf32_t *str)
1446     {
1447         lsp_utf32_t cp;
1448         size_t bytes = 0;
1449         const lsp_utf32_t *p = str;
1450 
1451         // Estimate length
1452         do
1453         {
1454             cp          = BE_TO_CPU(*(p++));
1455             bytes      += sizeof_utf16(cp);
1456         } while (cp != 0);
1457 
1458         // Allocate memory
1459         lsp_utf16_t *utf16  = reinterpret_cast<lsp_utf16_t *>(::malloc(bytes));
1460         if (utf16 == NULL)
1461             return NULL;
1462 
1463         // Perform encoding
1464         p               = str;
1465         lsp_utf16_t *dst= utf16;
1466         while ((cp = *(p++)) != 0)
1467             write_utf16be_codepoint(&dst, cp);
1468 
1469         *dst = 0;
1470         return utf16;
1471     }
1472 
1473     //-------------------------------------------------------------------------
1474     // UTF-8 streaming routines
utf8_to_utf16le(lsp_utf16_t * dst,size_t * ndst,const char * src,size_t * nsrc,bool force)1475     size_t utf8_to_utf16le(lsp_utf16_t *dst, size_t *ndst, const char *src, size_t *nsrc, bool force)
1476     {
1477         lsp_utf32_t cp;
1478         size_t processed = 0;
1479 
1480         while (*ndst > 0)
1481         {
1482             // Read code point
1483             size_t nin  = *nsrc;
1484             cp          = read_utf8_streaming(&src, &nin, force);
1485             if (cp == LSP_UTF32_EOF) // No data ?
1486                 break;
1487 
1488             // Encode code point
1489             size_t nout = count_utf16(cp);
1490             if (nout > *ndst)
1491                 break;
1492             write_utf16le_codepoint(&dst, cp);
1493             *nsrc       = nin;
1494             *ndst      -= nout;
1495 
1496             // Update statistics
1497             ++processed;
1498         }
1499 
1500         return processed;
1501     }
1502 
utf8_to_utf16be(lsp_utf16_t * dst,size_t * ndst,const char * src,size_t * nsrc,bool force)1503     size_t utf8_to_utf16be(lsp_utf16_t *dst, size_t *ndst, const char *src, size_t *nsrc, bool force)
1504     {
1505         lsp_utf32_t cp;
1506         size_t processed = 0;
1507 
1508         while (*ndst > 0)
1509         {
1510             // Read code point
1511             size_t nin  = *nsrc;
1512             cp          = read_utf8_streaming(&src, &nin, force);
1513             if (cp == LSP_UTF32_EOF) // No data ?
1514                 break;
1515 
1516             // Encode code point
1517             size_t nout = count_utf16(cp);
1518             if (nout > *ndst)
1519                 break;
1520             write_utf16be_codepoint(&dst, cp);
1521             *nsrc       = nin;
1522             *ndst      -= nout;
1523 
1524             // Update statistics
1525             ++processed;
1526         }
1527 
1528         return processed;
1529     }
1530 
utf8_to_utf32le(lsp_utf32_t * dst,size_t * ndst,const char * src,size_t * nsrc,bool force)1531     size_t utf8_to_utf32le(lsp_utf32_t *dst, size_t *ndst, const char *src, size_t *nsrc, bool force)
1532     {
1533         lsp_utf32_t cp;
1534         size_t processed = 0;
1535 
1536         while (*ndst > 0)
1537         {
1538             // Read code point
1539             size_t nin  = *nsrc;
1540             cp          = read_utf8_streaming(&src, &nin, force);
1541             if (cp == LSP_UTF32_EOF) // No data ?
1542                 break;
1543 
1544             // Encode code point
1545             *(dst++)    = CPU_TO_LE(cp);
1546             *nsrc       = nin;
1547             --(*ndst);
1548 
1549             // Update statistics
1550             ++processed;
1551         }
1552 
1553         return processed;
1554     }
1555 
utf8_to_utf32be(lsp_utf32_t * dst,size_t * ndst,const char * src,size_t * nsrc,bool force)1556     size_t utf8_to_utf32be(lsp_utf32_t *dst, size_t *ndst, const char *src, size_t *nsrc, bool force)
1557     {
1558         lsp_utf32_t cp;
1559         size_t processed = 0;
1560 
1561         while (*ndst > 0)
1562         {
1563             // Read code point
1564             size_t nin  = *nsrc;
1565             cp          = read_utf8_streaming(&src, &nin, force);
1566             if (cp == LSP_UTF32_EOF) // No data ?
1567                 break;
1568 
1569             // Encode code point
1570             *(dst++)    = CPU_TO_BE(cp);
1571             *nsrc       = nin;
1572             --(*ndst);
1573 
1574             // Update statistics
1575             ++processed;
1576         }
1577 
1578         return processed;
1579     }
1580 
1581     //-------------------------------------------------------------------------
1582     // UTF-16 streaming routines
utf16le_to_utf8(char * dst,size_t * ndst,const lsp_utf16_t * src,size_t * nsrc,bool force)1583     size_t utf16le_to_utf8(char *dst, size_t *ndst, const lsp_utf16_t *src, size_t *nsrc, bool force)
1584     {
1585         lsp_utf32_t cp;
1586         size_t processed = 0;
1587 
1588         while (*ndst > 0)
1589         {
1590             // Read code point
1591             size_t nin  = *nsrc;
1592             cp          = read_utf16le_streaming(&src, &nin, force);
1593             if (cp == LSP_UTF32_EOF) // No data ?
1594                 break;
1595 
1596             // Encode code point
1597             size_t nout = count_utf8(cp);
1598             if (nout > *ndst)
1599                 break;
1600             write_utf8_codepoint(&dst, cp);
1601             *nsrc       = nin;
1602             *ndst      -= nout;
1603 
1604             // Update statistics
1605             ++processed;
1606         }
1607 
1608         return processed;
1609     }
1610 
utf16be_to_utf8(char * dst,size_t * ndst,const lsp_utf16_t * src,size_t * nsrc,bool force)1611     size_t utf16be_to_utf8(char *dst, size_t *ndst, const lsp_utf16_t *src, size_t *nsrc, bool force)
1612     {
1613         lsp_utf32_t cp;
1614         size_t processed = 0;
1615 
1616         while (*ndst > 0)
1617         {
1618             // Read code point
1619             size_t nin  = *nsrc;
1620             cp          = read_utf16be_streaming(&src, &nin, force);
1621             if (cp == LSP_UTF32_EOF) // No data ?
1622                 break;
1623 
1624             // Encode code point
1625             size_t nout = count_utf8(cp);
1626             if (nout > *ndst)
1627                 break;
1628             write_utf8_codepoint(&dst, cp);
1629             *nsrc       = nin;
1630             *ndst      -= nout;
1631 
1632             // Update statistics
1633             ++processed;
1634         }
1635 
1636         return processed;
1637     }
1638 
utf16le_to_utf32le(lsp_utf32_t * dst,size_t * ndst,const lsp_utf16_t * src,size_t * nsrc,bool force)1639     size_t utf16le_to_utf32le(lsp_utf32_t *dst, size_t *ndst, const lsp_utf16_t *src, size_t *nsrc, bool force)
1640     {
1641         lsp_utf32_t cp;
1642         size_t processed = 0;
1643 
1644         while (*ndst > 0)
1645         {
1646             // Read code point
1647             size_t nin  = *nsrc;
1648             cp          = read_utf16le_streaming(&src, &nin, force);
1649             if (cp == LSP_UTF32_EOF) // No data ?
1650                 break;
1651 
1652             // Encode code point
1653             *(dst++)    = CPU_TO_LE(cp);
1654             *nsrc       = nin;
1655             --(*ndst);
1656 
1657             // Update statistics
1658             ++processed;
1659         }
1660 
1661         return processed;
1662     }
1663 
utf16be_to_utf32le(lsp_utf32_t * dst,size_t * ndst,const lsp_utf16_t * src,size_t * nsrc,bool force)1664     size_t utf16be_to_utf32le(lsp_utf32_t *dst, size_t *ndst, const lsp_utf16_t *src, size_t *nsrc, bool force)
1665     {
1666         lsp_utf32_t cp;
1667         size_t processed = 0;
1668 
1669         while (*ndst > 0)
1670         {
1671             // Read code point
1672             size_t nin  = *nsrc;
1673             cp          = read_utf16be_streaming(&src, &nin, force);
1674             if (cp == LSP_UTF32_EOF) // No data ?
1675                 break;
1676 
1677             // Encode code point
1678             *(dst++)    = CPU_TO_LE(cp);
1679             *nsrc       = nin;
1680             --(*ndst);
1681 
1682             // Update statistics
1683             ++processed;
1684         }
1685 
1686         return processed;
1687     }
1688 
utf16le_to_utf32be(lsp_utf32_t * dst,size_t * ndst,const lsp_utf16_t * src,size_t * nsrc,bool force)1689     size_t utf16le_to_utf32be(lsp_utf32_t *dst, size_t *ndst, const lsp_utf16_t *src, size_t *nsrc, bool force)
1690     {
1691         lsp_utf32_t cp;
1692         size_t processed = 0;
1693 
1694         while (*ndst > 0)
1695         {
1696             // Read code point
1697             size_t nin  = *nsrc;
1698             cp          = read_utf16le_streaming(&src, &nin, force);
1699             if (cp == LSP_UTF32_EOF) // No data ?
1700                 break;
1701 
1702             // Encode code point
1703             *(dst++)    = CPU_TO_BE(cp);
1704             *nsrc       = nin;
1705             --(*ndst);
1706 
1707             // Update statistics
1708             ++processed;
1709         }
1710 
1711         return processed;
1712     }
1713 
utf16be_to_utf32be(lsp_utf32_t * dst,size_t * ndst,const lsp_utf16_t * src,size_t * nsrc,bool force)1714     size_t utf16be_to_utf32be(lsp_utf32_t *dst, size_t *ndst, const lsp_utf16_t *src, size_t *nsrc, bool force)
1715     {
1716         lsp_utf32_t cp;
1717         size_t processed = 0;
1718 
1719         while (*ndst > 0)
1720         {
1721             // Read code point
1722             size_t nin  = *nsrc;
1723             cp          = read_utf16be_streaming(&src, &nin, force);
1724             if (cp == LSP_UTF32_EOF) // No data ?
1725                 break;
1726 
1727             // Encode code point
1728             *(dst++)    = CPU_TO_BE(cp);
1729             *nsrc       = nin;
1730             --(*ndst);
1731 
1732             // Update statistics
1733             ++processed;
1734         }
1735 
1736         return processed;
1737     }
1738 
1739     //-------------------------------------------------------------------------
1740     // UTF-32 streaming routines
utf32le_to_utf8(char * dst,size_t * ndst,const lsp_utf32_t * src,size_t * nsrc,bool force)1741     size_t utf32le_to_utf8(char *dst, size_t *ndst, const lsp_utf32_t *src, size_t *nsrc, bool force)
1742     {
1743         lsp_utf32_t cp;
1744         size_t processed = 0;
1745 
1746         while (*ndst > 0)
1747         {
1748             // Read code point
1749             if (*nsrc <= 0)
1750                 break;
1751             cp          = LE_TO_CPU(*(src++));
1752 
1753             // Encode code point
1754             size_t nout = count_utf8(cp);
1755             if (nout > *ndst)
1756                 break;
1757             write_utf8_codepoint(&dst, cp);
1758             --(*nsrc);
1759             *ndst      -= nout;
1760 
1761             // Update statistics
1762             ++processed;
1763         }
1764 
1765         return processed;
1766     }
1767 
utf32be_to_utf8(char * dst,size_t * ndst,const lsp_utf32_t * src,size_t * nsrc,bool force)1768     size_t utf32be_to_utf8(char *dst, size_t *ndst, const lsp_utf32_t *src, size_t *nsrc, bool force)
1769     {
1770         lsp_utf32_t cp;
1771         size_t processed = 0;
1772 
1773         while (*ndst > 0)
1774         {
1775             // Read code point
1776             if (*nsrc <= 0)
1777                 break;
1778             cp          = BE_TO_CPU(*(src++));
1779 
1780             // Encode code point
1781             size_t nout = count_utf8(cp);
1782             if (nout > *ndst)
1783                 break;
1784             write_utf8_codepoint(&dst, cp);
1785             --(*nsrc);
1786             *ndst      -= nout;
1787 
1788             // Update statistics
1789             ++processed;
1790         }
1791 
1792         return processed;
1793     }
1794 
utf32le_to_utf16le(lsp_utf16_t * dst,size_t * ndst,const lsp_utf32_t * src,size_t * nsrc,bool force)1795     size_t utf32le_to_utf16le(lsp_utf16_t *dst, size_t *ndst, const lsp_utf32_t *src, size_t *nsrc, bool force)
1796     {
1797         lsp_utf32_t cp;
1798         size_t processed = 0;
1799 
1800         while (*ndst > 0)
1801         {
1802             // Read code point
1803             if (*nsrc <= 0)
1804                 break;
1805             cp          = LE_TO_CPU(*(src++));
1806 
1807             // Encode code point
1808             size_t nout = count_utf16(cp);
1809             if (nout > *ndst)
1810                 break;
1811             write_utf16le_codepoint(&dst, cp);
1812             --(*nsrc);
1813             *ndst      -= nout;
1814 
1815             // Update statistics
1816             ++processed;
1817         }
1818 
1819         return processed;
1820     }
1821 
utf32le_to_utf16be(lsp_utf16_t * dst,size_t * ndst,const lsp_utf32_t * src,size_t * nsrc,bool force)1822     size_t utf32le_to_utf16be(lsp_utf16_t *dst, size_t *ndst, const lsp_utf32_t *src, size_t *nsrc, bool force)
1823     {
1824         lsp_utf32_t cp;
1825         size_t processed = 0;
1826 
1827         while (*ndst > 0)
1828         {
1829             // Read code point
1830             if (*nsrc <= 0)
1831                 break;
1832             cp          = LE_TO_CPU(*(src++));
1833 
1834             // Encode code point
1835             size_t nout = count_utf16(cp);
1836             if (nout > *ndst)
1837                 break;
1838             write_utf16be_codepoint(&dst, cp);
1839             --(*nsrc);
1840             *ndst      -= nout;
1841 
1842             // Update statistics
1843             ++processed;
1844         }
1845 
1846         return processed;
1847     }
1848 
utf32be_to_utf16le(lsp_utf16_t * dst,size_t * ndst,const lsp_utf32_t * src,size_t * nsrc,bool force)1849     size_t utf32be_to_utf16le(lsp_utf16_t *dst, size_t *ndst, const lsp_utf32_t *src, size_t *nsrc, bool force)
1850     {
1851         lsp_utf32_t cp;
1852         size_t processed = 0;
1853 
1854         while (*ndst > 0)
1855         {
1856             // Read code point
1857             if (*nsrc <= 0)
1858                 break;
1859             cp          = BE_TO_CPU(*(src++));
1860 
1861             // Encode code point
1862             size_t nout = count_utf16(cp);
1863             if (nout > *ndst)
1864                 break;
1865             write_utf16le_codepoint(&dst, cp);
1866             --(*nsrc);
1867             *ndst      -= nout;
1868 
1869             // Update statistics
1870             ++processed;
1871         }
1872 
1873         return processed;
1874     }
1875 
utf32be_to_utf16be(lsp_utf16_t * dst,size_t * ndst,const lsp_utf32_t * src,size_t * nsrc,bool force)1876     size_t utf32be_to_utf16be(lsp_utf16_t *dst, size_t *ndst, const lsp_utf32_t *src, size_t *nsrc, bool force)
1877     {
1878         lsp_utf32_t cp;
1879         size_t processed = 0;
1880 
1881         while (*ndst > 0)
1882         {
1883             // Read code point
1884             if (*nsrc <= 0)
1885                 break;
1886             cp          = BE_TO_CPU(*(src++));
1887 
1888             // Encode code point
1889             size_t nout = count_utf16(cp);
1890             if (nout > *ndst)
1891                 break;
1892             write_utf16be_codepoint(&dst, cp);
1893             --(*nsrc);
1894             *ndst      -= nout;
1895 
1896             // Update statistics
1897             ++processed;
1898         }
1899 
1900         return processed;
1901     }
1902 
1903 #if defined(PLATFORM_WINDOWS)
multibyte_to_widechar_utf16le(LPCCH src,size_t * nsrc,LPWSTR dst,size_t * ndst)1904     static ssize_t multibyte_to_widechar_utf16le(LPCCH src, size_t *nsrc, LPWSTR dst, size_t *ndst)
1905     {
1906         lsp_wchar_t cp;
1907         ssize_t nconv   = 0;
1908         const lsp_utf16_t *xsrc = reinterpret_cast<const lsp_utf16_t *>(src);
1909         size_t nin  = (*nsrc) >> 1;
1910         size_t nout = *ndst;
1911 
1912         while (nin > 0)
1913         {
1914             // Read code point
1915             size_t xin  = nin;
1916             cp          = read_utf16le_streaming(&xsrc, &xin, false);
1917             if (cp == LSP_UTF32_EOF) // No data ?
1918                 break;
1919 
1920             // Check that we have enough space
1921             size_t len = count_utf16(cp);
1922             if (nout < len)
1923                 break;
1924 
1925             // Write code point
1926             write_utf16_codepoint(&dst, cp);
1927             nin         = xin;
1928             nout       -= len;
1929             nconv      += len;
1930         }
1931 
1932         *nsrc       = ((*nsrc) & 1) + (nin << 1);
1933         *ndst       = nout;
1934         return nconv;
1935     }
1936 
multibyte_to_widechar_utf16be(LPCCH src,size_t * nsrc,LPWSTR dst,size_t * ndst)1937     static ssize_t multibyte_to_widechar_utf16be(LPCCH src, size_t *nsrc, LPWSTR dst, size_t *ndst)
1938     {
1939         lsp_wchar_t cp;
1940         ssize_t nconv   = 0;
1941         const lsp_utf16_t *xsrc = reinterpret_cast<const lsp_utf16_t *>(src);
1942         size_t nin  = (*nsrc) >> 1;
1943         size_t nout = *ndst;
1944 
1945         while (nin > 0)
1946         {
1947             // Read code point
1948             size_t xin  = nin;
1949             cp          = read_utf16be_streaming(&xsrc, &xin, false);
1950             if (cp == LSP_UTF32_EOF) // No data ?
1951                 break;
1952 
1953             // Check that we have enough space
1954             size_t len = count_utf16(cp);
1955             if (nout < len)
1956                 break;
1957 
1958             // Write code point
1959             write_utf16_codepoint(&dst, cp);
1960             nin         = xin;
1961             nout       -= len;
1962             nconv      += len;
1963         }
1964 
1965         *nsrc       = ((*nsrc) & 1) + (nin << 1);
1966         *ndst       = nout;
1967         return nconv;
1968     }
1969 
est_multibyte_to_widechar_utf16le(LPCCH src,size_t nsrc)1970     static ssize_t est_multibyte_to_widechar_utf16le(LPCCH src, size_t nsrc)
1971     {
1972         lsp_wchar_t cp;
1973         ssize_t nconv   = 0;
1974         const lsp_utf16_t *xsrc = reinterpret_cast<const lsp_utf16_t *>(src);
1975         nsrc >>= 1;
1976 
1977         while (nsrc > 0)
1978         {
1979             // Read code point
1980             cp         = read_utf16le_streaming(&xsrc, &nsrc, false);
1981             if (cp == LSP_UTF32_EOF) // No data ?
1982                 break;
1983 
1984             // Check that we have enough space
1985             nconv      += count_utf16(cp);
1986         }
1987 
1988         return nconv;
1989     }
1990 
est_multibyte_to_widechar_utf16be(LPCCH src,size_t nsrc)1991     static ssize_t est_multibyte_to_widechar_utf16be(LPCCH src, size_t nsrc)
1992     {
1993         lsp_wchar_t cp;
1994         ssize_t nconv   = 0;
1995         const lsp_utf16_t *xsrc = reinterpret_cast<const lsp_utf16_t *>(src);
1996         nsrc >>= 1;
1997 
1998         while (nsrc > 0)
1999         {
2000             // Read code point
2001             cp         = read_utf16le_streaming(&xsrc, &nsrc, false);
2002             if (cp == LSP_UTF32_EOF) // No data ?
2003                 break;
2004 
2005             // Check that we have enough space
2006             nconv      += count_utf16(cp);
2007         }
2008 
2009         return nconv;
2010     }
2011 
multibyte_to_widechar_utf32le(LPCCH src,size_t * nsrc,LPWSTR dst,size_t * ndst)2012     static ssize_t multibyte_to_widechar_utf32le(LPCCH src, size_t *nsrc, LPWSTR dst, size_t *ndst)
2013     {
2014         lsp_wchar_t cp;
2015         ssize_t nconv   = 0;
2016         const lsp_utf32_t *xsrc = reinterpret_cast<const lsp_utf32_t *>(src);
2017         size_t nin  = (*nsrc) >> 2;
2018         size_t nout = *ndst;
2019 
2020         while (nin > 0)
2021         {
2022             // Read code point
2023             cp          = LE_TO_CPU(*(xsrc++));
2024 
2025             // Check that we have enough space
2026             size_t len = count_utf16(cp);
2027             if (nout < len)
2028                 break;
2029 
2030             // Write code point
2031             write_utf16_codepoint(&dst, cp);
2032             nin        -= 1;
2033             nout       -= len;
2034             nconv      += len;
2035         }
2036 
2037         *nsrc       = ((*nsrc) & 3) + (nin << 2);
2038         *ndst       = nout;
2039         return nconv;
2040     }
2041 
multibyte_to_widechar_utf32be(LPCCH src,size_t * nsrc,LPWSTR dst,size_t * ndst)2042     static ssize_t multibyte_to_widechar_utf32be(LPCCH src, size_t *nsrc, LPWSTR dst, size_t *ndst)
2043     {
2044         lsp_wchar_t cp;
2045         ssize_t nconv   = 0;
2046         const lsp_utf32_t *xsrc = reinterpret_cast<const lsp_utf32_t *>(src);
2047         size_t nin  = (*nsrc) >> 2;
2048         size_t nout = *ndst;
2049 
2050         while (nin > 0)
2051         {
2052             // Read code point
2053             cp          = BE_TO_CPU(*(xsrc++));
2054 
2055             // Check that we have enough space
2056             size_t len = count_utf16(cp);
2057             if (nout < len)
2058                 break;
2059 
2060             // Write code point
2061             write_utf16_codepoint(&dst, cp);
2062             nin        -= 1;
2063             nout       -= len;
2064             nconv      += len;
2065         }
2066 
2067         *nsrc       = ((*nsrc) & 3) + (nin << 2);
2068         *ndst       = nout;
2069         return nconv;
2070     }
2071 
est_multibyte_to_widechar_utf32le(LPCCH src,size_t nsrc)2072     static ssize_t est_multibyte_to_widechar_utf32le(LPCCH src, size_t nsrc)
2073     {
2074         lsp_wchar_t cp;
2075         ssize_t nconv   = 0;
2076         const lsp_utf32_t *xsrc = reinterpret_cast<const lsp_utf32_t *>(src);
2077         nsrc           >>= 2;
2078 
2079         while (nsrc > 0)
2080         {
2081             // Read code point
2082             cp          = LE_TO_CPU(*(xsrc++));
2083             nconv      += count_utf16(cp);
2084         }
2085 
2086         return nconv;
2087     }
2088 
est_multibyte_to_widechar_utf32be(LPCCH src,size_t nsrc)2089     static ssize_t est_multibyte_to_widechar_utf32be(LPCCH src, size_t nsrc)
2090     {
2091         lsp_wchar_t cp;
2092         ssize_t nconv   = 0;
2093         const lsp_utf32_t *xsrc = reinterpret_cast<const lsp_utf32_t *>(src);
2094         nsrc           >>= 2;
2095 
2096         while (nsrc > 0)
2097         {
2098             // Read code point
2099             cp          = BE_TO_CPU(*(xsrc++));
2100             nconv      += count_utf16(cp);
2101         }
2102 
2103         return nconv;
2104     }
2105 
multibyte_to_widechar(size_t cp,LPCCH src,size_t * nsrc,LPWSTR dst,size_t * ndst)2106     ssize_t multibyte_to_widechar(size_t cp, LPCCH src, size_t *nsrc, LPWSTR dst, size_t *ndst)
2107     {
2108         ssize_t nconv;
2109 
2110         switch (cp)
2111         {
2112             case 1200:  // UTF-16LE
2113                 nconv = ((dst == NULL) || (ndst == NULL) || (ssize_t(*ndst) <= 0)) ?
2114                         est_multibyte_to_widechar_utf16le(src, *nsrc) :
2115                         multibyte_to_widechar_utf16le(src, nsrc, dst, ndst);
2116                 break;
2117             case 1201:  // UTF-16BE
2118                 nconv = ((dst == NULL) || (ndst == NULL) || (ssize_t(*ndst) <= 0)) ?
2119                         est_multibyte_to_widechar_utf16be(src, *nsrc) :
2120                         multibyte_to_widechar_utf16be(src, nsrc, dst, ndst);
2121                 break;
2122             case 12000: // UTF-32LE
2123                 nconv = ((dst == NULL) || (ndst == NULL) || (ssize_t(*ndst) <= 0)) ?
2124                         est_multibyte_to_widechar_utf32le(src, *nsrc) :
2125                         multibyte_to_widechar_utf32le(src, nsrc, dst, ndst);
2126                 break;
2127             case 12001: // UTF-32BE
2128                 nconv = ((dst == NULL) || (ndst == NULL) || (ssize_t(*ndst) <= 0)) ?
2129                         est_multibyte_to_widechar_utf32be(src, *nsrc) :
2130                         multibyte_to_widechar_utf32be(src, nsrc, dst, ndst);
2131                 break;
2132             default:
2133                 // We need just to estimate the size?
2134                 if ((dst == NULL) || (ndst == NULL) || (ssize_t(*ndst) <= 0))
2135                     return ::MultiByteToWideChar(cp, 0, src, *nsrc, 0, 0);
2136 
2137                 // Do the conversion
2138                 nconv = ::MultiByteToWideChar(cp, 0, src, *nsrc, dst, *ndst);
2139                 if (nconv == 0)
2140                 {
2141                     switch (GetLastError())
2142                     {
2143                         case ERROR_SUCCESS:
2144                             return 0;
2145                         case ERROR_INSUFFICIENT_BUFFER:
2146                             return -STATUS_NO_MEM;
2147                         case ERROR_INVALID_FLAGS:
2148                         case ERROR_INVALID_PARAMETER:
2149                             return -STATUS_BAD_STATE;
2150                         case ERROR_NO_UNICODE_TRANSLATION:
2151                             return -STATUS_BAD_LOCALE;
2152                         default:
2153                             return -STATUS_UNKNOWN_ERR;
2154                     }
2155                 }
2156 
2157                 // There are converted characters, analyze output
2158                 // If function meets invalid sequence, it replaces the code point with such magic value
2159                 // We should know if function has failed
2160                 if (dst[nconv-1] == 0xfffd)
2161                     --nconv;
2162 
2163                 if (nconv > 0)
2164                 {
2165                     // Estimate number of bytes decoded (yep, this is dumb but no way...)
2166                     ssize_t nbytes  = ::WideCharToMultiByte(cp, 0, dst, nconv, NULL, 0, 0, 0);
2167                     if (nbytes <= 0)
2168                         return -STATUS_IO_ERROR;
2169 
2170                     *nsrc  -= nbytes;
2171                     *ndst  -= nconv;
2172                 }
2173 
2174                 break;
2175         }
2176 
2177         return nconv;
2178     }
2179 
widechar_to_multibyte_utf16le(const lsp_utf16_t * src,size_t * nsrc,char * dst,size_t * ndst)2180     static ssize_t widechar_to_multibyte_utf16le(const lsp_utf16_t *src, size_t *nsrc, char *dst, size_t *ndst)
2181     {
2182         lsp_wchar_t cp;
2183         ssize_t nconv = 0;
2184         size_t nin  = *nsrc;
2185         size_t nout = *ndst;
2186         lsp_utf16_t *xdst = reinterpret_cast<lsp_utf16_t *>(dst);
2187 
2188         while (nin > 0)
2189         {
2190             size_t xin  = nin;
2191             cp          = read_utf16_streaming(&src, &xin, false);
2192             if (cp == LSP_UTF32_EOF) // No data ?
2193                 break;
2194 
2195             // Check that we have enough space
2196             size_t len = sizeof_utf16(cp);
2197             if (nout < len)
2198                 break;
2199 
2200             // Write code point
2201             write_utf16le_codepoint(&xdst, cp);
2202             nin         = xin;
2203             nout       -= len;
2204             nconv      += len;
2205         }
2206 
2207         *nsrc   = nin;
2208         *ndst   = nout;
2209         return nconv;
2210     }
2211 
widechar_to_multibyte_utf16be(const lsp_utf16_t * src,size_t * nsrc,char * dst,size_t * ndst)2212     static ssize_t widechar_to_multibyte_utf16be(const lsp_utf16_t *src, size_t *nsrc, char *dst, size_t *ndst)
2213     {
2214         lsp_wchar_t cp;
2215         ssize_t nconv = 0;
2216         size_t nin  = *nsrc;
2217         size_t nout = *ndst;
2218         lsp_utf16_t *xdst = reinterpret_cast<lsp_utf16_t *>(dst);
2219 
2220         while (nin > 0)
2221         {
2222             size_t xin  = nin;
2223             cp          = read_utf16_streaming(&src, &xin, false);
2224             if (cp == LSP_UTF32_EOF) // No data ?
2225                 break;
2226 
2227             // Check that we have enough space
2228             size_t len = sizeof_utf16(cp);
2229             if (nout < len)
2230                 break;
2231 
2232             // Write code point
2233             write_utf16be_codepoint(&xdst, cp);
2234             nin         = xin;
2235             nout       -= len;
2236             nconv      += len;
2237         }
2238 
2239         *nsrc   = nin;
2240         *ndst   = nout;
2241         return nconv;
2242     }
2243 
est_widechar_to_multibyte_utf16(const lsp_utf16_t * src,size_t nsrc)2244     static ssize_t est_widechar_to_multibyte_utf16(const lsp_utf16_t *src, size_t nsrc)
2245     {
2246         lsp_wchar_t cp;
2247         ssize_t nconv = 0;
2248 
2249         while (nsrc > 0)
2250         {
2251             cp      = read_utf16_streaming(&src, &nsrc, false);
2252             if (cp == LSP_UTF32_EOF) // No data ?
2253                 break;
2254 
2255             // Check that we have enough space
2256             nconv  += sizeof_utf16(cp);
2257         }
2258 
2259         return nconv;
2260     }
2261 
widechar_to_multibyte_utf32le(const lsp_utf16_t * src,size_t * nsrc,char * dst,size_t * ndst)2262     static ssize_t widechar_to_multibyte_utf32le(const lsp_utf16_t *src, size_t *nsrc, char *dst, size_t *ndst)
2263     {
2264         lsp_wchar_t cp;
2265         ssize_t nconv = 0;
2266         size_t nin  = *nsrc;
2267         size_t nout = *ndst;
2268         lsp_utf32_t *xdst = reinterpret_cast<lsp_utf32_t *>(dst);
2269 
2270         while (nin > 0)
2271         {
2272             size_t xin  = nin;
2273             cp          = read_utf16_streaming(&src, &xin, false);
2274             if (cp == LSP_UTF32_EOF) // No data ?
2275                 break;
2276 
2277             // Check that we have enough space
2278             if (nout < sizeof(lsp_utf32_t))
2279                 break;
2280 
2281             // Write code point
2282             *(xdst++)   = CPU_TO_LE(cp);
2283             nin         = xin;
2284             nout       -= sizeof(lsp_utf32_t);
2285             nconv      += sizeof(lsp_utf32_t);
2286         }
2287 
2288         *nsrc   = nin;
2289         *ndst   = nout;
2290         return nconv;
2291     }
2292 
widechar_to_multibyte_utf32be(const lsp_utf16_t * src,size_t * nsrc,char * dst,size_t * ndst)2293     static ssize_t widechar_to_multibyte_utf32be(const lsp_utf16_t *src, size_t *nsrc, char *dst, size_t *ndst)
2294     {
2295         lsp_wchar_t cp;
2296         ssize_t nconv = 0;
2297         size_t nin  = *nsrc;
2298         size_t nout = *ndst;
2299         lsp_utf32_t *xdst = reinterpret_cast<lsp_utf32_t *>(dst);
2300 
2301         while (nin > 0)
2302         {
2303             size_t xin  = nin;
2304             cp          = read_utf16_streaming(&src, &xin, false);
2305             if (cp == LSP_UTF32_EOF) // No data ?
2306                 break;
2307 
2308             // Check that we have enough space
2309             if (nout < sizeof(lsp_utf32_t))
2310                 break;
2311 
2312             // Write code point
2313             *(xdst++)   = CPU_TO_BE(cp);
2314             nin         = xin;
2315             nout       -= sizeof(lsp_utf32_t);
2316             nconv      += sizeof(lsp_utf32_t);
2317         }
2318 
2319         *nsrc   = nin;
2320         *ndst   = nout;
2321         return nconv;
2322     }
2323 
est_widechar_to_multibyte_utf32(const lsp_utf16_t * src,size_t nsrc)2324     static ssize_t est_widechar_to_multibyte_utf32(const lsp_utf16_t *src, size_t nsrc)
2325     {
2326         lsp_wchar_t cp;
2327         ssize_t nconv = 0;
2328 
2329         while (nsrc > 0)
2330         {
2331             cp          = read_utf16_streaming(&src, &nsrc, false);
2332             if (cp == LSP_UTF32_EOF) // No data ?
2333                 break;
2334             nconv      += sizeof(lsp_utf32_t);
2335         }
2336 
2337         return nconv;
2338     }
2339 
widechar_to_multibyte_split(const lsp_utf16_t * src,size_t limit)2340     static size_t widechar_to_multibyte_split(const lsp_utf16_t *src, size_t limit)
2341     {
2342         // Estimate the middle of an array
2343         size_t half     = limit >> 1;
2344         if (half <= 0)
2345             return half;
2346 
2347         // Now scan valid code points until we reach the end of array
2348         lsp_wchar_t cp;
2349         limit           = half;
2350         while (true)
2351         {
2352             cp          = read_utf16_streaming(&src, &limit, false);
2353             if (cp == LSP_UTF32_EOF) // No data ?
2354                 break;
2355         }
2356 
2357         // Return the result as middle of array without remained points in limit
2358         return half - limit;
2359     }
2360 
widechar_to_multibyte(size_t cp,LPCWCH src,size_t * nsrc,LPSTR dst,size_t * ndst)2361     ssize_t widechar_to_multibyte(size_t cp, LPCWCH src, size_t *nsrc, LPSTR dst, size_t *ndst)
2362     {
2363         ssize_t nconv;
2364 
2365         switch (cp)
2366         {
2367             case 1200:  // UTF-16LE
2368                 nconv = ((dst == NULL) || (ndst == NULL) || (ssize_t(*ndst) <= 0)) ?
2369                         est_widechar_to_multibyte_utf16(src, *nsrc) :
2370                         widechar_to_multibyte_utf16le(src, nsrc, dst, ndst);
2371                 break;
2372             case 1201:  // UTF-16BE
2373                 nconv = ((dst == NULL) || (ndst == NULL) || (ssize_t(*ndst) <= 0)) ?
2374                         est_widechar_to_multibyte_utf16(src, *nsrc) :
2375                         widechar_to_multibyte_utf16be(src, nsrc, dst, ndst);
2376                 break;
2377             case 12000: // UTF-32LE
2378                 nconv = ((dst == NULL) || (ndst == NULL) || (ssize_t(*ndst) <= 0)) ?
2379                         est_widechar_to_multibyte_utf32(src, *nsrc) :
2380                         widechar_to_multibyte_utf32le(src, nsrc, dst, ndst);
2381                 break;
2382             case 12001: // UTF-32BE
2383                 nconv = ((dst == NULL) || (ndst == NULL) || (ssize_t(*ndst) <= 0)) ?
2384                         est_widechar_to_multibyte_utf32(src, *nsrc) :
2385                         widechar_to_multibyte_utf32be(src, nsrc, dst, ndst);
2386                 break;
2387             default:
2388             {
2389                 // Just estimate number of characters?
2390                 if ((dst == NULL) || (ndst == NULL) || (ssize_t(*ndst) <= 0))
2391                 {
2392                     nconv           = ::WideCharToMultiByte(cp, 0, src, *nsrc, NULL, 0, 0, FALSE);
2393                     if (nconv == 0)
2394                     {
2395                         switch (::GetLastError())
2396                         {
2397                             case ERROR_SUCCESS:
2398                                 return 0;
2399                             case ERROR_INSUFFICIENT_BUFFER:
2400                                 return -STATUS_NO_MEM;
2401                             case ERROR_INVALID_FLAGS:
2402                             case ERROR_INVALID_PARAMETER:
2403                                 return -STATUS_BAD_STATE;
2404                             case ERROR_NO_UNICODE_TRANSLATION:
2405                                 return -STATUS_BAD_LOCALE;
2406                             default:
2407                                 return -STATUS_UNKNOWN_ERR;
2408                         }
2409                     }
2410                     return nconv;
2411                 }
2412 
2413                 // Perform first try
2414                 size_t xnsrc    = *nsrc;
2415                 nconv = ::WideCharToMultiByte(cp, 0, src, xnsrc, dst, *ndst, 0, FALSE);
2416 
2417                 // Do while conversion is unsuccessful
2418                 while (nconv <= 0)
2419                 {
2420                     // There was a fail, analyze it
2421                     switch (::GetLastError())
2422                     {
2423                         case ERROR_SUCCESS:
2424                             return 0;
2425                         case ERROR_INSUFFICIENT_BUFFER:
2426                             break;  // Will retry with twice lesser input buffer
2427                         case ERROR_INVALID_FLAGS:
2428                         case ERROR_INVALID_PARAMETER:
2429                             return -STATUS_BAD_STATE;
2430                         case ERROR_NO_UNICODE_TRANSLATION:
2431                             return -STATUS_BAD_LOCALE;
2432                         default:
2433                             return -STATUS_UNKNOWN_ERR;
2434                     }
2435 
2436                     // Try to twice reduce the buffer size, validate data for surrogates
2437                     xnsrc = widechar_to_multibyte_split(src, xnsrc);
2438                     if (xnsrc <= 0)
2439                         break;
2440 
2441                     // Perform next conversion try with lesser buffer
2442                     nconv = ::WideCharToMultiByte(cp, 0, src, xnsrc, dst, *ndst, 0, FALSE);
2443                 }
2444 
2445                 *ndst      -= nconv;
2446                 *nsrc      -= xnsrc;
2447             }
2448             break;
2449         }
2450 
2451         return nconv;
2452     }
2453 #endif /* PLATFORM_WINDOWS */
2454 }
2455