1 #include <cstdlib>
2 #include <cstring>
3 #include <climits>
4 #if !defined(FORCE_ICONV) && defined(__STDC_ISO_10646__) || defined(_WIN32)
5 #  include <wchar.h>
6 #  define fallback(call) (0)
7 #elif defined(__DJGPP__)
8 #  include <map>
9 #  include <dos.h>
10 #else
11 #  include <stdexcept>
12 #  include <langinfo.h>
13 #  if !defined(NO_ICONV)
14 #  include <cerrno>
15 #  include <vector>
16 #  include <iconv.h>
17 #  endif
18 #  define fallback(call) (call)
19 #endif
20 #include "charconv.h"
21 
22 /*
23 
24   character conversion (user locale -> latin1) (portable)
25 
26   copyright (c) 2005, 2006, 2015 squell <squell@alumina.nl>
27 
28   use, modification, copying and distribution of this software is permitted
29   under the conditions described in the file 'COPYING'.
30 
31 */
32 
33 namespace charset {
34     using namespace std;
35 
36     namespace {
37         union wide {                        // accomodate wstring and string
wide(wchar_t wc)38             wide(wchar_t wc) : code(wc) { }
39             wchar_t code;
40             char    raw[sizeof(wchar_t)];
41         };
42 
43         template<class T> inline
operator +=(std::basic_string<T> & str,const wide w)44         std::basic_string<T>& operator+=(std::basic_string<T>& str, const wide w)
45         {
46             return str += w.code;
47         }
48 
operator +=(std::string & str,const wide w)49         inline std::string& operator+=(std::string& str, const wide w)
50         {
51             return str.append(w.raw, sizeof w.raw);
52         }
53     }
54 
55   // latin1 <-> unicode interconversion
56 
decode(const char * s,size_t len)57     template<> conv<>::data conv<latin1>::decode(const char* s, size_t len)
58     {
59         conv<>::data build;
60         build.reserve(len);
61         for( ; len--; ) {
62             build += wide(*s++ & 0xFF);
63         }
64         return build;
65     }
66 
encode(const void * p,size_t len)67     template<> std::string conv<latin1>::encode(const void* p, size_t len)
68     {
69         const wchar_t* w = (wchar_t*)p;
70         std::string build;
71         build.reserve(len);
72         for( ; len--; ) {
73             wchar_t c = *w++;
74             build += (c < 0x100)? c : '?';
75         }
76         return build;
77     }
78 
79 #if !defined(__DJGPP__)
80 
81   // locale <-> unicode interconversion
82   // a bit touchy when changing locales
83 
    namespace {
        // Tag type for the 7-bit ASCII fallback conversion (used as
        // conv<_7bit>); it is only ever declared, never defined here.
        struct _7bit;

        // Tells the locale conversions whether the wide-character path can be
        // used, or whether they must drop to the 7-bit ASCII fallback.
        //
        // fallback(1) is non-zero only in the configuration that includes
        // <langinfo.h> (see the top of this file). In that configuration, when
        // iconv has been disabled with NO_ICONV:
        //   - if CODESET is available, the answer is decided at run time:
        //     true exactly when the locale's codeset name is "UTF-8";
        //   - otherwise the answer is always false.
        // In every other configuration the answer is always true.
        static bool wchar_unicode()
        {
#  if fallback(1) && defined(NO_ICONV)
#    if defined(CODESET)
#    warning "Assuming Unicode if (and only if) CODESET is UTF-8; 7-bit ASCII otherwise."
            return strcmp(nl_langinfo(CODESET), "UTF-8") == 0;
#    else
#    warning "Unicode not available on this platform; only supporting 7-bit ASCII"
            return false;
#    endif
#  else
            return true;
#  endif
        }
    } // end anon. namespace
102 
103 #if fallback(1) && !defined(NO_ICONV)
104 
recode_error(const char * to,const char * from)105     static void recode_error(const char* to, const char* from)
106     {
107         throw std::runtime_error(std::string("iconv -f ") + from + " -t " + to + " not working; recompile with -DNO_ICONV");
108     }
109 
    // Work-around for the ambiguity in the SUSv2 specification of iconv; see
    // https://www.opengroup.org/austin/aardvark/finaltext/xshbug.txt
    // (Even though POSIX.2001 fixed this, it was still an issue in 2015...)
113 
    // Make the system iconv() a candidate next to the shim below, so that an
    // unqualified call to iconv() in this namespace picks between the two.
    using ::iconv;

    // Shim with a "char** inbuf" parameter that forwards to an iconv() taking
    // "const char** inbuf". Being a template, it loses overload resolution to
    // the system function whenever that one already accepts the arguments.
    //
    // cd is the conversion descriptor; the remaining parameters are passed
    // through unchanged apart from the cast on inbuf. Returns whatever the
    // forwarded iconv() call returns.
    template<class T> inline size_t iconv(T cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft)
    {
        // Only instantiated if iconv.h doesn't take a "char** inbuf" argument
        return iconv(cd, (const char**)inbuf, inbytesleft, outbuf, outbytesleft);
    }
120 
recode(char * out,size_t avail,const void * src,size_t len,const char * to,const char * from,bool out_wide,size_t in_step)121     size_t recode(char* out, size_t avail, const void* src, size_t len, const char* to, const char* from, bool out_wide, size_t in_step)
122     {
123         size_t const max_avail = avail;
124         char* in = (char*)src;
125 
126         iconv_t cvt = iconv_open(to, from);
127         if(cvt == (iconv_t)-1)
128             recode_error(to, from);
129 
130         struct guard_t {
131             iconv_t cvt;
132             ~guard_t() { iconv_close(cvt); }
133         } const guard = { cvt };
134 
135         while(len > 0) {
136             size_t result = iconv(cvt, &in, &len, &out, &avail);
137             if(result == (size_t)-1) {
138                 if(errno == E2BIG)
139                     throw std::logic_error("broken iconv");
140                 if(out_wide) {  // emit a placeholder
141                     memcpy(out, wide(0xFFFDu).raw, sizeof(wchar_t));
142                     out  += sizeof(wchar_t);
143                     avail-= sizeof(wchar_t);
144                 } else {
145                     *out++ = '?', avail--;
146                 }
147                 if(len <= in_step)
148                     break;
149                 len -= in_step; // skip some bytes and re-try
150                 in  += in_step;
151             } else if(len != 0)
152                 throw std::logic_error("broken iconv");
153         }
154         return max_avail - avail;
155     }
156 
157 #elif fallback(1)
158 
159     // fallback conversion, 7bit ASCII <-> unicode
160 
decode(const char * s,size_t len)161     template<> conv<>::data conv<_7bit>::decode(const char* s, size_t len)
162     {
163         conv<>::data build;
164         build.reserve(len);
165         for( ; len--; ) {
166             int c = *s++ & 0xFF;
167             build += wide(c < 0x80? c : '?');
168         }
169         return build;
170     }
171 
encode(const void * p,size_t len)172     template<> std::string conv<_7bit>::encode(const void* p, size_t len)
173     {
174         const wchar_t* w = (wchar_t*)p;
175         std::string build;
176         build.reserve(len);
177         for( ; len--; ) {
178             wchar_t c = *w++;
179             build += (c < 0x80)? c : '?';
180         }
181         return build;
182     }
183 
184 #endif
185 
UCS()186     inline static const char* UCS()
187     {
188         union { unsigned short bom; unsigned char byte; } endian_test;
189         endian_test.bom = 0xFFFE;
190         if(sizeof(wchar_t) == 4 && endian_test.byte == 0xFE)
191             return "UCS-4LE";
192         else if(sizeof(wchar_t) == 2 && endian_test.byte == 0xFE)
193             return "UCS-2LE";
194         else if(sizeof(wchar_t) == 4 && endian_test.byte != 0xFE)
195             return "UCS-4BE";
196         else if(sizeof(wchar_t) == 2 && endian_test.byte != 0xFE)
197             return "UCS-2BE";
198         else
199             return "ASCII";
200     }
201 
decode(const char * s,size_t len)202     template<> conv<>::data conv<local>::decode(const char* s, size_t len)
203     {
204 #   if fallback(1) && !defined(NO_ICONV)
205         std::vector<char> build((len+1)*sizeof(wchar_t));
206         wchar_unicode();
207 
208         size_t n = recode(build.data(), build.size(), s, len, UCS(), nl_langinfo(CODESET), true, 1);
209         return conv<>::data((wchar_t*)build.data(), n/sizeof(wchar_t));
210 #   else
211         if(!wchar_unicode())
212             return fallback(conv<_7bit>::decode(s, len));
213 
214         conv<>::data build;
215         build.reserve(len);
216         wchar_t wc;
217         s += len;
218         for(int n; len; len -= n+!n) {
219             n = mbtowc(&wc, s-len, len);
220             if(n < 0) break;
221             build += wide(wc);
222         }
223         return build;
224 #   endif
225     }
226 
encode(const void * p,size_t len)227     template<> std::string conv<local>::encode(const void* p, size_t len)
228     {
229 #   if fallback(1) && !defined(NO_ICONV)
230         std::vector<char> build((len+1)*4);
231         wchar_unicode();
232 
233         size_t n = recode(build.data(), build.size(), p, len*sizeof(wchar_t), nl_langinfo(CODESET), UCS(), false, sizeof(wchar_t));
234         return std::string(build.data(), n);
235 #   else
236         if(!wchar_unicode())
237             return fallback(conv<_7bit>::encode(p, len));
238 
239         const wchar_t* w = (wchar_t*)p;
240         std::string build;
241         build.reserve(len*2);
242 
243         for( ; len--; ) {
244             char buf[MB_LEN_MAX];
245             int n = wctomb(buf, *w++);
246             if(n >= 0) build.append(buf, n);
247             else       build += '?';
248         }
249         return build;
250 #   endif
251     }
252 
253 #elif defined(__DJGPP__)
254 
255  // mess-dos codepages (hardcoded, one-on-one relationship to unicode)
256 
257     namespace {
        // Upper half of one DOS code page: the Unicode value for each of the
        // 128 bytes 0x80..0xFF, indexed by (byte & 0x7F).
        typedef wchar_t charmap[128];
259 
260  // Codepage 437, possible alternatives: (those active marked with +)
261  //
 // I'd rather have a lunate epsilon or "element of" for 0xEE (epsilon), but it's not WGL
 //
 //   LATIN SMALL L. SHARP S (00DF) -> GREEK SMALL L. BETA (03B2)
265  // + GREEK SMALL L. EPSILON (03B5) -> EURO SIGN  (20AC)
266  //   GREEK SMALL L. EPSILON (03B5) -> ELEMENT OF (2208)
267  //   GREEK SMALL L. EPSILON (03B5) -> GREEK LUNATE EPSILON S. (03F5)
268  //   GREEK SMALL L. PHI     (03C6) -> LATIN SMALL L. O SLASH (00F8)
269  //   GREEK SMALL L. PHI     (03C6) -> LATIN SMALL L. PHI (0278)
270  //   GREEK SMALL L. PHI     (03C6) -> GREEK PHI S.       (03D5)
271  // + BULLET OPERATOR        (2219) -> BULLET (2022)
272 
        // Code page 437: Unicode values for bytes 0x80..0xFF, eight per row
        // (first row is 0x80..0x87, last row is 0xF8..0xFF). Two entries
        // differ from the plain CP437 mapping: byte 0xEE is mapped to
        // EURO SIGN (20AC) and byte 0xF9 to BULLET (2022).
        const charmap cp437 = {
            0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
            0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
            0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
            0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192,
            0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
            0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
            0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
            0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
            0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F,
            0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
            0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B,
            0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
            0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4,
            0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x20AC, 0x2229,
            0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248,
            0x00B0, 0x2022, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0,
        };
291 
        // Code page 850: Unicode values for bytes 0x80..0xFF, eight per row
        // (first row is 0x80..0x87, last row is 0xF8..0xFF).
        const charmap cp850 = {
            0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
            0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
            0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
            0x00FF, 0x00D6, 0x00DC, 0x00F8, 0x00A3, 0x00D8, 0x00D7, 0x0192,
            0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
            0x00BF, 0x00AE, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
            0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00C1, 0x00C2, 0x00C0,
            0x00A9, 0x2563, 0x2551, 0x2557, 0x255D, 0x00A2, 0x00A5, 0x2510,
            0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x00E3, 0x00C3,
            0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x00A4,
            0x00F0, 0x00D0, 0x00CA, 0x00CB, 0x00C8, 0x0131, 0x00CD, 0x00CE,
            0x00CF, 0x2518, 0x250C, 0x2588, 0x2584, 0x00A6, 0x00CC, 0x2580,
            0x00D3, 0x00DF, 0x00D4, 0x00D2, 0x00F5, 0x00D5, 0x00B5, 0x00FE,
            0x00DE, 0x00DA, 0x00DB, 0x00D9, 0x00FD, 0x00DD, 0x00AF, 0x00B4,
            0x00AD, 0x00B1, 0x2017, 0x00BE, 0x00B6, 0x00A7, 0x00F7, 0x00B8,
            0x00B0, 0x00A8, 0x00B7, 0x00B9, 0x00B3, 0x00B2, 0x25A0, 0x00A0,
        };
310 
codepage()311         unsigned codepage()
312         {
313             REGS cpu;
314             cpu.w.ax = 0x6601;    // int21h - ax 6601h - get global code page table
315             intdos(&cpu, &cpu);   // -> bx: active code page, dx: system codepage
316             return cpu.w.cflag? 0 : cpu.w.bx;  // carry set on error
317         }
318 
dos_to_uni()319         const wchar_t* dos_to_uni()
320         {
321             switch(codepage()) {
322             case 850: return cp850;
323             case 437: return cp437;
324             default : return 0;
325             }
326         }
327 
328         struct uni_to_dos : map<wchar_t, char> {     // crude! reverse table
uni_to_doscharset::__anon3d0641ca0411::uni_to_dos329             uni_to_dos()
330             {
331                 if(const wchar_t* cmap = dos_to_uni())
332                     for(int n = 0; n < 128; ++n) {
333                         insert(value_type(cmap[n], n|0x80));
334                     }
335             }
operator []charset::__anon3d0641ca0411::uni_to_dos336             char& operator[](wchar_t uc)
337             {
338                 return insert(value_type(uc, '?')).first->second;
339             }
340         };
341     }
342 
decode(const char * s,size_t len)343     template<> conv<>::data conv<local>::decode(const char* s, size_t len)
344     {
345         static const wchar_t* const map = dos_to_uni();
346         conv<>::data build;
347         build.reserve(len);
348         for( ; len--; s++) {
349             wide w = (*s & 0x80)? map[*s & 0x7F] : (*s & 0xFF);
350             build += w;
351         }
352         return build;
353     }
354 
encode(const void * p,size_t len)355     template<> std::string conv<local>::encode(const void* p, size_t len)
356     {
357         const wchar_t* w = (wchar_t*)p;
358         static uni_to_dos rmap;
359         std::string build;
360         build.reserve(len);
361         for( ; len--; ) {
362             wchar_t c = *w++;
363             build += (c < 0x80)? c : rmap[c];
364         }
365         return build;
366     }
367 
368 #endif
369 
370 } // end of namespace
371 
372  /*
373    Notes:
374 
375    __STDC_ISO_10646__ is a C99 constant. If defined, wchar_t is
376    guaranteed to be a coded representation of the Unicode set in all
377    locales. This is bliss. glibc2.2 defines it, so this covers Linux.
378 
379 
380    On Windows, you need to fight the jargon first;
381 
382    "Unicode" = UCS2 16bit chars
383    "ANSI"    = "Windows codepage" (such as CP1252)
384    "OEM"     = "DOS codepage"     (such as CP437, CP850, CP858)
385 
386    In true Microsoft fashion, ANSI and OEM are two different beasts, and so
387    there are always *two* codepages active! You CAN use ANSI codepages on the
388    Win32 commandline in NT/2K/XP, and also UTF8 ("codepage 65001"), but these
389    will only display properly with a TrueType font.
390 
391    Commandlines are apparently converted to "ANSI" codepage before being
392    passed. If you want the "Unicode" version, there's GetCommandLineW in
393    windows.h.
394 
   So; the arguments a program gets will be in the "correct" ANSI codepage, but
   I/O (e.g., pipes, console output) will not be. Console output should be in
   OEM but file output should (probably) be in ANSI. File routines can be
   either OEM or ANSI style.
399 
   Windows also has two locales for converting multibyte chars. The ISO C mb
   functions from stdlib.h and wchar.h listen to setlocale(), but most MS
   runtime functions listen to _setmbcp.
403 
404    Second, setlocale(LC_CTYPE, "") might get the active ANSI or OEM codepage!
405    MinGW does the former, Borland C++ the latter. Forcing this with ".ACP" or
406    ".OCP", doesn't work on Borland (apparently).
407 
408    So the problem is not converting to Unicode - mbtowc does this! - but to
409    actually select the proper locale.
410 
411    Related routines, without stupid MS typedefs;
412     wchar_t* GetCommandLineW(void)
413     char*    GetCommandLineA(void)
414     bool     AreFileApisANSI(void)
415     void     SetFileApisToOEM(void)
416     void     SetFileApisToANSI(void)
417     unsigned GetACP(void)
418     unsigned GetOEMACP(void)
419     unsigned GetConsoleOutputCP(void)  // why two ?
420     unsigned GetConsoleCP(void)
421     unsigned SetConsoleOutputCP(void)  // NT only! since 9x has no console
422     unsigned SetConsoleCP(void)        // NT only! since 9x has no console
423 
424   */
425 
426