1// 2// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) 3// 4// Distributed under the Boost Software License, Version 1.0. (See 5// accompanying file LICENSE_1_0.txt or copy at 6// http://www.boost.org/LICENSE_1_0.txt) 7// 8 9#ifndef BOOST_LOCALE_IMPL_WCONV_CODEPAGE_HPP 10#define BOOST_LOCALE_IMPL_WCONV_CODEPAGE_HPP 11 12 13#include <boost/locale/encoding.hpp> 14#include <algorithm> 15#include <cstring> 16#include <string> 17#include "conv.hpp" 18 19#ifndef NOMINMAX 20# define NOMINMAX 21#endif 22#include <windows.h> 23#include <vector> 24 25 26namespace boost { 27namespace locale { 28namespace conv { 29namespace impl { 30 31 struct windows_encoding { 32 char const *name; 33 unsigned codepage; 34 unsigned was_tested; 35 }; 36 37 bool operator<(windows_encoding const &l,windows_encoding const &r) 38 { 39 return strcmp(l.name,r.name) < 0; 40 } 41 42 windows_encoding all_windows_encodings[] = { 43 { "big5", 950, 0 }, 44 { "cp1250", 1250, 0 }, 45 { "cp1251", 1251, 0 }, 46 { "cp1252", 1252, 0 }, 47 { "cp1253", 1253, 0 }, 48 { "cp1254", 1254, 0 }, 49 { "cp1255", 1255, 0 }, 50 { "cp1256", 1256, 0 }, 51 { "cp1257", 1257, 0 }, 52 { "cp874", 874, 0 }, 53 { "cp932", 932, 0 }, 54 { "cp936", 936, 0 }, 55 { "eucjp", 20932, 0 }, 56 { "euckr", 51949, 0 }, 57 { "gb18030", 54936, 0 }, 58 { "gb2312", 20936, 0 }, 59 { "gbk", 936, 0 }, 60 { "iso2022jp", 50220, 0 }, 61 { "iso2022kr", 50225, 0 }, 62 { "iso88591", 28591, 0 }, 63 { "iso885913", 28603, 0 }, 64 { "iso885915", 28605, 0 }, 65 { "iso88592", 28592, 0 }, 66 { "iso88593", 28593, 0 }, 67 { "iso88594", 28594, 0 }, 68 { "iso88595", 28595, 0 }, 69 { "iso88596", 28596, 0 }, 70 { "iso88597", 28597, 0 }, 71 { "iso88598", 28598, 0 }, 72 { "iso88599", 28599, 0 }, 73 { "koi8r", 20866, 0 }, 74 { "koi8u", 21866, 0 }, 75 { "ms936", 936, 0 }, 76 { "shiftjis", 932, 0 }, 77 { "sjis", 932, 0 }, 78 { "usascii", 20127, 0 }, 79 { "utf8", 65001, 0 }, 80 { "windows1250", 1250, 0 }, 81 { "windows1251", 1251, 0 }, 82 { "windows1252", 1252, 0 }, 83 { "windows1253", 1253, 0 }, 84 { "windows1254", 1254, 0 }, 85 { "windows1255", 1255, 0 }, 86 { "windows1256", 1256, 0 }, 87 { "windows1257", 1257, 0 }, 88 { "windows874", 874, 0 }, 89 { "windows932", 932, 0 }, 90 { "windows936", 936, 0 }, 91 }; 92 93 size_t remove_substitutions(std::vector<char> &v) 94 { 95 if(std::find(v.begin(),v.end(),0) == v.end()) { 96 return v.size(); 97 } 98 std::vector<char> v2; 99 v2.reserve(v.size()); 100 for(unsigned i=0;i<v.size();i++) { 101 if(v[i]!=0) 102 v2.push_back(v[i]); 103 } 104 v.swap(v2); 105 return v.size(); 106 } 107 108 void multibyte_to_wide_one_by_one(int codepage,char const *begin,char const *end,std::vector<wchar_t> &buf) 109 { 110 buf.reserve(end-begin); 111 while(begin!=end) { 112 wchar_t wide_buf[4]; 113 int n = 0; 114 int len = IsDBCSLeadByteEx(codepage,*begin) ? 2 : 1; 115 if(len == 2 && begin+1==end) 116 return; 117 n = MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,len,wide_buf,4); 118 for(int i=0;i<n;i++) 119 buf.push_back(wide_buf[i]); 120 begin+=len; 121 } 122 } 123 124 125 void multibyte_to_wide(int codepage,char const *begin,char const *end,bool do_skip,std::vector<wchar_t> &buf) 126 { 127 if(begin==end) 128 return; 129 int n = MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,end-begin,0,0); 130 if(n == 0) { 131 if(do_skip) { 132 multibyte_to_wide_one_by_one(codepage,begin,end,buf); 133 return; 134 } 135 throw conversion_error(); 136 } 137 138 buf.resize(n,0); 139 if(MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,end-begin,&buf.front(),buf.size())==0) 140 throw conversion_error(); 141 } 142 143 void wide_to_multibyte_non_zero(int codepage,wchar_t const *begin,wchar_t const *end,bool do_skip,std::vector<char> &buf) 144 { 145 if(begin==end) 146 return; 147 BOOL substitute = FALSE; 148 BOOL *substitute_ptr = codepage == 65001 || codepage == 65000 ? 0 : &substitute; 149 char subst_char = 0; 150 char *subst_char_ptr = codepage == 65001 || codepage == 65000 ? 0 : &subst_char; 151 152 int n = WideCharToMultiByte(codepage,0,begin,end-begin,0,0,subst_char_ptr,substitute_ptr); 153 buf.resize(n); 154 155 if(WideCharToMultiByte(codepage,0,begin,end-begin,&buf[0],n,subst_char_ptr,substitute_ptr)==0) 156 throw conversion_error(); 157 if(substitute) { 158 if(do_skip) 159 remove_substitutions(buf); 160 else 161 throw conversion_error(); 162 } 163 } 164 165 void wide_to_multibyte(int codepage,wchar_t const *begin,wchar_t const *end,bool do_skip,std::vector<char> &buf) 166 { 167 if(begin==end) 168 return; 169 buf.reserve(end-begin); 170 wchar_t const *e = std::find(begin,end,L'\0'); 171 wchar_t const *b = begin; 172 for(;;) { 173 std::vector<char> tmp; 174 wide_to_multibyte_non_zero(codepage,b,e,do_skip,tmp); 175 size_t osize = buf.size(); 176 buf.resize(osize+tmp.size()); 177 std::copy(tmp.begin(),tmp.end(),buf.begin()+osize); 178 if(e!=end) { 179 buf.push_back('\0'); 180 b=e+1; 181 e=std::find(b,end,L'0'); 182 } 183 else 184 break; 185 } 186 } 187 188 189 int encoding_to_windows_codepage(char const *ccharset) 190 { 191 std::string charset = normalize_encoding(ccharset); 192 windows_encoding ref; 193 ref.name = charset.c_str(); 194 size_t n = sizeof(all_windows_encodings)/sizeof(all_windows_encodings[0]); 195 windows_encoding *begin = all_windows_encodings; 196 windows_encoding *end = all_windows_encodings + n; 197 windows_encoding *ptr = std::lower_bound(begin,end,ref); 198 if(ptr!=end && strcmp(ptr->name,charset.c_str())==0) { 199 if(ptr->was_tested) { 200 return ptr->codepage; 201 } 202 else if(IsValidCodePage(ptr->codepage)) { 203 // the thread safety is not an issue, maximum 204 // it would be checked more then once 205 ptr->was_tested=1; 206 return ptr->codepage; 207 } 208 else { 209 return -1; 210 } 211 } 212 return -1; 213 214 } 215 216 template<typename CharType> 217 bool validate_utf16(CharType const *str,unsigned len) 218 { 219 CharType const *begin = str; 220 CharType const *end = str+len; 221 while(begin!=end) { 222 utf::code_point c = utf::utf_traits<CharType,2>::template decode<CharType const *>(begin,end); 223 if(c==utf::illegal || c==utf::incomplete) 224 return false; 225 } 226 return true; 227 } 228 229 template<typename CharType,typename OutChar> 230 void clean_invalid_utf16(CharType const *str,unsigned len,std::vector<OutChar> &out) 231 { 232 out.reserve(len); 233 for(unsigned i=0;i<len;i++) { 234 uint16_t c = static_cast<uint16_t>(str[i]); 235 236 if(0xD800 <= c && c<= 0xDBFF) { 237 i++; 238 if(i>=len) 239 return; 240 uint16_t c2=static_cast<uint16_t>(str[i]); 241 if(0xDC00 <= c2 && c2 <= 0xDFFF) { 242 out.push_back(static_cast<OutChar>(c)); 243 out.push_back(static_cast<OutChar>(c2)); 244 } 245 } 246 else if(0xDC00 <= c && c <=0xDFFF) 247 continue; 248 else 249 out.push_back(static_cast<OutChar>(c)); 250 } 251 } 252 253 254 class wconv_between : public converter_between { 255 public: 256 wconv_between() : 257 how_(skip), 258 to_code_page_ (-1), 259 from_code_page_ ( -1) 260 { 261 } 262 bool open(char const *to_charset,char const *from_charset,method_type how) 263 { 264 how_ = how; 265 to_code_page_ = encoding_to_windows_codepage(to_charset); 266 from_code_page_ = encoding_to_windows_codepage(from_charset); 267 if(to_code_page_ == -1 || from_code_page_ == -1) 268 return false; 269 return true; 270 } 271 virtual std::string convert(char const *begin,char const *end) 272 { 273 if(to_code_page_ == 65001 && from_code_page_ == 65001) 274 return utf_to_utf<char>(begin,end,how_); 275 276 std::string res; 277 278 std::vector<wchar_t> tmp; // buffer for mb2w 279 std::wstring tmps; // buffer for utf_to_utf 280 wchar_t const *wbegin=0; 281 wchar_t const *wend=0; 282 283 if(from_code_page_ == 65001) { 284 tmps = utf_to_utf<wchar_t>(begin,end,how_); 285 if(tmps.empty()) 286 return res; 287 wbegin = tmps.c_str(); 288 wend = wbegin + tmps.size(); 289 } 290 else { 291 multibyte_to_wide(from_code_page_,begin,end,how_ == skip,tmp); 292 if(tmp.empty()) 293 return res; 294 wbegin = &tmp[0]; 295 wend = wbegin + tmp.size(); 296 } 297 298 if(to_code_page_ == 65001) { 299 return utf_to_utf<char>(wbegin,wend,how_); 300 } 301 302 std::vector<char> ctmp; 303 wide_to_multibyte(to_code_page_,wbegin,wend,how_ == skip,ctmp); 304 if(ctmp.empty()) 305 return res; 306 res.assign(&ctmp.front(),ctmp.size()); 307 return res; 308 } 309 private: 310 method_type how_; 311 int to_code_page_; 312 int from_code_page_; 313 }; 314 315 template<typename CharType,int size = sizeof(CharType) > 316 class wconv_to_utf; 317 318 template<typename CharType,int size = sizeof(CharType) > 319 class wconv_from_utf; 320 321 template<> 322 class wconv_to_utf<char,1> : public converter_to_utf<char> , public wconv_between { 323 public: 324 virtual bool open(char const *cs,method_type how) 325 { 326 return wconv_between::open("UTF-8",cs,how); 327 } 328 virtual std::string convert(char const *begin,char const *end) 329 { 330 return wconv_between::convert(begin,end); 331 } 332 }; 333 334 template<> 335 class wconv_from_utf<char,1> : public converter_from_utf<char> , public wconv_between { 336 public: 337 virtual bool open(char const *cs,method_type how) 338 { 339 return wconv_between::open(cs,"UTF-8",how); 340 } 341 virtual std::string convert(char const *begin,char const *end) 342 { 343 return wconv_between::convert(begin,end); 344 } 345 }; 346 347 template<typename CharType> 348 class wconv_to_utf<CharType,2> : public converter_to_utf<CharType> { 349 public: 350 typedef CharType char_type; 351 352 typedef std::basic_string<char_type> string_type; 353 354 wconv_to_utf() : 355 how_(skip), 356 code_page_(-1) 357 { 358 } 359 360 virtual bool open(char const *charset,method_type how) 361 { 362 how_ = how; 363 code_page_ = encoding_to_windows_codepage(charset); 364 return code_page_ != -1; 365 } 366 367 virtual string_type convert(char const *begin,char const *end) 368 { 369 if(code_page_ == 65001) { 370 return utf_to_utf<char_type>(begin,end,how_); 371 } 372 std::vector<wchar_t> tmp; 373 multibyte_to_wide(code_page_,begin,end,how_ == skip,tmp); 374 string_type res; 375 if(!tmp.empty()) 376 res.assign(reinterpret_cast<char_type *>(&tmp.front()),tmp.size()); 377 return res; 378 } 379 380 private: 381 method_type how_; 382 int code_page_; 383 }; 384 385 template<typename CharType> 386 class wconv_from_utf<CharType,2> : public converter_from_utf<CharType> { 387 public: 388 typedef CharType char_type; 389 390 typedef std::basic_string<char_type> string_type; 391 392 wconv_from_utf() : 393 how_(skip), 394 code_page_(-1) 395 { 396 } 397 398 virtual bool open(char const *charset,method_type how) 399 { 400 how_ = how; 401 code_page_ = encoding_to_windows_codepage(charset); 402 return code_page_ != -1; 403 } 404 405 virtual std::string convert(CharType const *begin,CharType const *end) 406 { 407 if(code_page_ == 65001) { 408 return utf_to_utf<char>(begin,end,how_); 409 } 410 wchar_t const *wbegin = 0; 411 wchar_t const *wend = 0; 412 std::vector<wchar_t> buffer; // if needed 413 if(begin==end) 414 return std::string(); 415 if(validate_utf16(begin,end-begin)) { 416 wbegin = reinterpret_cast<wchar_t const *>(begin); 417 wend = reinterpret_cast<wchar_t const *>(end); 418 } 419 else { 420 if(how_ == stop) { 421 throw conversion_error(); 422 } 423 else { 424 clean_invalid_utf16(begin,end-begin,buffer); 425 if(!buffer.empty()) { 426 wbegin = &buffer[0]; 427 wend = wbegin + buffer.size(); 428 } 429 } 430 } 431 std::string res; 432 if(wbegin==wend) 433 return res; 434 std::vector<char> ctmp; 435 wide_to_multibyte(code_page_,wbegin,wend,how_ == skip,ctmp); 436 if(ctmp.empty()) 437 return res; 438 res.assign(&ctmp.front(),ctmp.size()); 439 return res; 440 } 441 442 private: 443 method_type how_; 444 int code_page_; 445 }; 446 447 448 449 template<typename CharType> 450 class wconv_to_utf<CharType,4> : public converter_to_utf<CharType> { 451 public: 452 typedef CharType char_type; 453 454 typedef std::basic_string<char_type> string_type; 455 456 wconv_to_utf() : 457 how_(skip), 458 code_page_(-1) 459 { 460 } 461 462 virtual bool open(char const *charset,method_type how) 463 { 464 how_ = how; 465 code_page_ = encoding_to_windows_codepage(charset); 466 return code_page_ != -1; 467 } 468 469 virtual string_type convert(char const *begin,char const *end) 470 { 471 if(code_page_ == 65001) { 472 return utf_to_utf<char_type>(begin,end,how_); 473 } 474 std::vector<wchar_t> buf; 475 multibyte_to_wide(code_page_,begin,end,how_ == skip,buf); 476 477 if(buf.empty()) 478 return string_type(); 479 480 return utf_to_utf<CharType>(&buf[0],&buf[0]+buf.size(),how_); 481 } 482 private: 483 method_type how_; 484 int code_page_; 485 }; 486 487 template<typename CharType> 488 class wconv_from_utf<CharType,4> : public converter_from_utf<CharType> { 489 public: 490 typedef CharType char_type; 491 492 typedef std::basic_string<char_type> string_type; 493 494 wconv_from_utf() : 495 how_(skip), 496 code_page_(-1) 497 { 498 } 499 500 virtual bool open(char const *charset,method_type how) 501 { 502 how_ = how; 503 code_page_ = encoding_to_windows_codepage(charset); 504 return code_page_ != -1; 505 } 506 507 virtual std::string convert(CharType const *begin,CharType const *end) 508 { 509 if(code_page_ == 65001) { 510 return utf_to_utf<char>(begin,end,how_); 511 } 512 std::wstring tmp = utf_to_utf<wchar_t>(begin,end,how_); 513 514 std::vector<char> ctmp; 515 wide_to_multibyte(code_page_,tmp.c_str(),tmp.c_str()+tmp.size(),how_ == skip,ctmp); 516 std::string res; 517 if(ctmp.empty()) 518 return res; 519 res.assign(&ctmp.front(),ctmp.size()); 520 return res; 521 522 } 523 524 private: 525 method_type how_; 526 int code_page_; 527 }; 528 529 530 531 532 533} // impl 534} // conv 535} // locale 536} // boost 537 538#endif 539// vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 540