1 #include <cstdlib> 2 #include <cstring> 3 #include <climits> 4 #if !defined(FORCE_ICONV) && defined(__STDC_ISO_10646__) || defined(_WIN32) 5 # include <wchar.h> 6 # define fallback(call) (0) 7 #elif defined(__DJGPP__) 8 # include <map> 9 # include <dos.h> 10 #else 11 # include <stdexcept> 12 # include <langinfo.h> 13 # if !defined(NO_ICONV) 14 # include <cerrno> 15 # include <vector> 16 # include <iconv.h> 17 # endif 18 # define fallback(call) (call) 19 #endif 20 #include "charconv.h" 21 22 /* 23 24 character conversion (user locale -> latin1) (portable) 25 26 copyright (c) 2005, 2006, 2015 squell <squell@alumina.nl> 27 28 use, modification, copying and distribution of this software is permitted 29 under the conditions described in the file 'COPYING'. 30 31 */ 32 33 namespace charset { 34 using namespace std; 35 36 namespace { 37 union wide { // accomodate wstring and string wide(wchar_t wc)38 wide(wchar_t wc) : code(wc) { } 39 wchar_t code; 40 char raw[sizeof(wchar_t)]; 41 }; 42 43 template<class T> inline operator +=(std::basic_string<T> & str,const wide w)44 std::basic_string<T>& operator+=(std::basic_string<T>& str, const wide w) 45 { 46 return str += w.code; 47 } 48 operator +=(std::string & str,const wide w)49 inline std::string& operator+=(std::string& str, const wide w) 50 { 51 return str.append(w.raw, sizeof w.raw); 52 } 53 } 54 55 // latin1 <-> unicode interconversion 56 decode(const char * s,size_t len)57 template<> conv<>::data conv<latin1>::decode(const char* s, size_t len) 58 { 59 conv<>::data build; 60 build.reserve(len); 61 for( ; len--; ) { 62 build += wide(*s++ & 0xFF); 63 } 64 return build; 65 } 66 encode(const void * p,size_t len)67 template<> std::string conv<latin1>::encode(const void* p, size_t len) 68 { 69 const wchar_t* w = (wchar_t*)p; 70 std::string build; 71 build.reserve(len); 72 for( ; len--; ) { 73 wchar_t c = *w++; 74 build += (c < 0x100)? c : '?'; 75 } 76 return build; 77 } 78 79 #if !defined(__DJGPP__) 80 81 // locale <-> unicode interconversion 82 // a bit touchy when changing locales 83 84 namespace { 85 struct _7bit; 86 wchar_unicode()87 static bool wchar_unicode() 88 { 89 # if fallback(1) && defined(NO_ICONV) 90 # if defined(CODESET) 91 # warning "Assuming Unicode if (and only if) CODESET is UTF-8; 7-bit ASCII otherwise." 92 return strcmp(nl_langinfo(CODESET), "UTF-8") == 0; 93 # else 94 # warning "Unicode not available on this platform; only supporting 7-bit ASCII" 95 return false; 96 # endif 97 # else 98 return true; 99 # endif 100 } 101 } // end anon. namespace 102 103 #if fallback(1) && !defined(NO_ICONV) 104 recode_error(const char * to,const char * from)105 static void recode_error(const char* to, const char* from) 106 { 107 throw std::runtime_error(std::string("iconv -f ") + from + " -t " + to + " not working; recompile with -DNO_ICONV"); 108 } 109 110 // Work-around for the ambiguity in SUSv2 specification if iconv; see 111 // https://www.opengroup.org/austin/aardvark/finaltext/xshbug.txt 112 // (Even though POSIX.2001 fixed this, still an issue in 2015...) 113 114 using ::iconv; iconv(T cd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)115 template<class T> inline size_t iconv(T cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) 116 { 117 // Only instantiated if iconv.h doesn't take a "char** inbuf" argument 118 return iconv(cd, (const char**)inbuf, inbytesleft, outbuf, outbytesleft); 119 } 120 recode(char * out,size_t avail,const void * src,size_t len,const char * to,const char * from,bool out_wide,size_t in_step)121 size_t recode(char* out, size_t avail, const void* src, size_t len, const char* to, const char* from, bool out_wide, size_t in_step) 122 { 123 size_t const max_avail = avail; 124 char* in = (char*)src; 125 126 iconv_t cvt = iconv_open(to, from); 127 if(cvt == (iconv_t)-1) 128 recode_error(to, from); 129 130 struct guard_t { 131 iconv_t cvt; 132 ~guard_t() { iconv_close(cvt); } 133 } const guard = { cvt }; 134 135 while(len > 0) { 136 size_t result = iconv(cvt, &in, &len, &out, &avail); 137 if(result == (size_t)-1) { 138 if(errno == E2BIG) 139 throw std::logic_error("broken iconv"); 140 if(out_wide) { // emit a placeholder 141 memcpy(out, wide(0xFFFDu).raw, sizeof(wchar_t)); 142 out += sizeof(wchar_t); 143 avail-= sizeof(wchar_t); 144 } else { 145 *out++ = '?', avail--; 146 } 147 if(len <= in_step) 148 break; 149 len -= in_step; // skip some bytes and re-try 150 in += in_step; 151 } else if(len != 0) 152 throw std::logic_error("broken iconv"); 153 } 154 return max_avail - avail; 155 } 156 157 #elif fallback(1) 158 159 // fallback conversion, 7bit ASCII <-> unicode 160 decode(const char * s,size_t len)161 template<> conv<>::data conv<_7bit>::decode(const char* s, size_t len) 162 { 163 conv<>::data build; 164 build.reserve(len); 165 for( ; len--; ) { 166 int c = *s++ & 0xFF; 167 build += wide(c < 0x80? c : '?'); 168 } 169 return build; 170 } 171 encode(const void * p,size_t len)172 template<> std::string conv<_7bit>::encode(const void* p, size_t len) 173 { 174 const wchar_t* w = (wchar_t*)p; 175 std::string build; 176 build.reserve(len); 177 for( ; len--; ) { 178 wchar_t c = *w++; 179 build += (c < 0x80)? c : '?'; 180 } 181 return build; 182 } 183 184 #endif 185 UCS()186 inline static const char* UCS() 187 { 188 union { unsigned short bom; unsigned char byte; } endian_test; 189 endian_test.bom = 0xFFFE; 190 if(sizeof(wchar_t) == 4 && endian_test.byte == 0xFE) 191 return "UCS-4LE"; 192 else if(sizeof(wchar_t) == 2 && endian_test.byte == 0xFE) 193 return "UCS-2LE"; 194 else if(sizeof(wchar_t) == 4 && endian_test.byte != 0xFE) 195 return "UCS-4BE"; 196 else if(sizeof(wchar_t) == 2 && endian_test.byte != 0xFE) 197 return "UCS-2BE"; 198 else 199 return "ASCII"; 200 } 201 decode(const char * s,size_t len)202 template<> conv<>::data conv<local>::decode(const char* s, size_t len) 203 { 204 # if fallback(1) && !defined(NO_ICONV) 205 std::vector<char> build((len+1)*sizeof(wchar_t)); 206 wchar_unicode(); 207 208 size_t n = recode(build.data(), build.size(), s, len, UCS(), nl_langinfo(CODESET), true, 1); 209 return conv<>::data((wchar_t*)build.data(), n/sizeof(wchar_t)); 210 # else 211 if(!wchar_unicode()) 212 return fallback(conv<_7bit>::decode(s, len)); 213 214 conv<>::data build; 215 build.reserve(len); 216 wchar_t wc; 217 s += len; 218 for(int n; len; len -= n+!n) { 219 n = mbtowc(&wc, s-len, len); 220 if(n < 0) break; 221 build += wide(wc); 222 } 223 return build; 224 # endif 225 } 226 encode(const void * p,size_t len)227 template<> std::string conv<local>::encode(const void* p, size_t len) 228 { 229 # if fallback(1) && !defined(NO_ICONV) 230 std::vector<char> build((len+1)*4); 231 wchar_unicode(); 232 233 size_t n = recode(build.data(), build.size(), p, len*sizeof(wchar_t), nl_langinfo(CODESET), UCS(), false, sizeof(wchar_t)); 234 return std::string(build.data(), n); 235 # else 236 if(!wchar_unicode()) 237 return fallback(conv<_7bit>::encode(p, len)); 238 239 const wchar_t* w = (wchar_t*)p; 240 std::string build; 241 build.reserve(len*2); 242 243 for( ; len--; ) { 244 char buf[MB_LEN_MAX]; 245 int n = wctomb(buf, *w++); 246 if(n >= 0) build.append(buf, n); 247 else build += '?'; 248 } 249 return build; 250 # endif 251 } 252 253 #elif defined(__DJGPP__) 254 255 // mess-dos codepages (hardcoded, one-on-one relationship to unicode) 256 257 namespace { 258 typedef wchar_t charmap[128]; 259 260 // Codepage 437, possible alternatives: (those active marked with +) 261 // 262 // I'd rather have a lunate epsilon or element of for '�', but it's not WGL 263 // 264 // LATIN SMALL L. SHAPR S (00DF) -> GREEK SMALL L. BETA (03B2) 265 // + GREEK SMALL L. EPSILON (03B5) -> EURO SIGN (20AC) 266 // GREEK SMALL L. EPSILON (03B5) -> ELEMENT OF (2208) 267 // GREEK SMALL L. EPSILON (03B5) -> GREEK LUNATE EPSILON S. (03F5) 268 // GREEK SMALL L. PHI (03C6) -> LATIN SMALL L. O SLASH (00F8) 269 // GREEK SMALL L. PHI (03C6) -> LATIN SMALL L. PHI (0278) 270 // GREEK SMALL L. PHI (03C6) -> GREEK PHI S. (03D5) 271 // + BULLET OPERATOR (2219) -> BULLET (2022) 272 273 const charmap cp437 = { 274 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7, 275 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5, 276 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9, 277 0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192, 278 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA, 279 0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB, 280 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 281 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510, 282 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F, 283 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567, 284 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B, 285 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580, 286 0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4, 287 0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x20AC, 0x2229, 288 0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248, 289 0x00B0, 0x2022, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0, 290 }; 291 292 const charmap cp850 = { 293 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7, 294 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5, 295 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9, 296 0x00FF, 0x00D6, 0x00DC, 0x00F8, 0x00A3, 0x00D8, 0x00D7, 0x0192, 297 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA, 298 0x00BF, 0x00AE, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB, 299 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00C1, 0x00C2, 0x00C0, 300 0x00A9, 0x2563, 0x2551, 0x2557, 0x255D, 0x00A2, 0x00A5, 0x2510, 301 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x00E3, 0x00C3, 302 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x00A4, 303 0x00F0, 0x00D0, 0x00CA, 0x00CB, 0x00C8, 0x0131, 0x00CD, 0x00CE, 304 0x00CF, 0x2518, 0x250C, 0x2588, 0x2584, 0x00A6, 0x00CC, 0x2580, 305 0x00D3, 0x00DF, 0x00D4, 0x00D2, 0x00F5, 0x00D5, 0x00B5, 0x00FE, 306 0x00DE, 0x00DA, 0x00DB, 0x00D9, 0x00FD, 0x00DD, 0x00AF, 0x00B4, 307 0x00AD, 0x00B1, 0x2017, 0x00BE, 0x00B6, 0x00A7, 0x00F7, 0x00B8, 308 0x00B0, 0x00A8, 0x00B7, 0x00B9, 0x00B3, 0x00B2, 0x25A0, 0x00A0, 309 }; 310 codepage()311 unsigned codepage() 312 { 313 REGS cpu; 314 cpu.w.ax = 0x6601; // int21h - ax 6601h - get global code page table 315 intdos(&cpu, &cpu); // -> bx: active code page, dx: system codepage 316 return cpu.w.cflag? 0 : cpu.w.bx; // carry set on error 317 } 318 dos_to_uni()319 const wchar_t* dos_to_uni() 320 { 321 switch(codepage()) { 322 case 850: return cp850; 323 case 437: return cp437; 324 default : return 0; 325 } 326 } 327 328 struct uni_to_dos : map<wchar_t, char> { // crude! reverse table uni_to_doscharset::__anon3d0641ca0411::uni_to_dos329 uni_to_dos() 330 { 331 if(const wchar_t* cmap = dos_to_uni()) 332 for(int n = 0; n < 128; ++n) { 333 insert(value_type(cmap[n], n|0x80)); 334 } 335 } operator []charset::__anon3d0641ca0411::uni_to_dos336 char& operator[](wchar_t uc) 337 { 338 return insert(value_type(uc, '?')).first->second; 339 } 340 }; 341 } 342 decode(const char * s,size_t len)343 template<> conv<>::data conv<local>::decode(const char* s, size_t len) 344 { 345 static const wchar_t* const map = dos_to_uni(); 346 conv<>::data build; 347 build.reserve(len); 348 for( ; len--; s++) { 349 wide w = (*s & 0x80)? map[*s & 0x7F] : (*s & 0xFF); 350 build += w; 351 } 352 return build; 353 } 354 encode(const void * p,size_t len)355 template<> std::string conv<local>::encode(const void* p, size_t len) 356 { 357 const wchar_t* w = (wchar_t*)p; 358 static uni_to_dos rmap; 359 std::string build; 360 build.reserve(len); 361 for( ; len--; ) { 362 wchar_t c = *w++; 363 build += (c < 0x80)? c : rmap[c]; 364 } 365 return build; 366 } 367 368 #endif 369 370 } // end of namespace 371 372 /* 373 Notes: 374 375 __STDC_ISO_10646__ is a C99 constant. If defined, wchar_t is 376 guaranteed to be a coded representation of the Unicode set in all 377 locales. This is bliss. glibc2.2 defines it, so this covers Linux. 378 379 380 On Windows, you need to fight the jargon first; 381 382 "Unicode" = UCS2 16bit chars 383 "ANSI" = "Windows codepage" (such as CP1252) 384 "OEM" = "DOS codepage" (such as CP437, CP850, CP858) 385 386 In true Microsoft fashion, ANSI and OEM are two different beasts, and so 387 there are always *two* codepages active! You CAN use ANSI codepages on the 388 Win32 commandline in NT/2K/XP, and also UTF8 ("codepage 65001"), but these 389 will only display properly with a TrueType font. 390 391 Commandlines are apparently converted to "ANSI" codepage before being 392 passed. If you want the "Unicode" version, there's GetCommandLineW in 393 windows.h. 394 395 So; arguments a program get will be in "correct" ANSI codepage, but I/O 396 (e.g., pipes, console output) will not be. Console output should be in OEM 397 but file output should (probably) be in ANSI. File routines can be either 398 OEM or ANSI style. 399 400 Windows also has two locales for converting multibyte chars. The ISO C mb 401 functions from stdlib.h and wchar.h listens to setlocale(), but most MS 402 runtime functions listen to _setmbcp. 403 404 Second, setlocale(LC_CTYPE, "") might get the active ANSI or OEM codepage! 405 MinGW does the former, Borland C++ the latter. Forcing this with ".ACP" or 406 ".OCP", doesn't work on Borland (apparently). 407 408 So the problem is not converting to Unicode - mbtowc does this! - but to 409 actually select the proper locale. 410 411 Related routines, without stupid MS typedefs; 412 wchar_t* GetCommandLineW(void) 413 char* GetCommandLineA(void) 414 bool AreFileApisANSI(void) 415 void SetFileApisToOEM(void) 416 void SetFileApisToANSI(void) 417 unsigned GetACP(void) 418 unsigned GetOEMACP(void) 419 unsigned GetConsoleOutputCP(void) // why two ? 420 unsigned GetConsoleCP(void) 421 unsigned SetConsoleOutputCP(void) // NT only! since 9x has no console 422 unsigned SetConsoleCP(void) // NT only! since 9x has no console 423 424 */ 425 426