1 // 2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) 3 // 4 // Distributed under the Boost Software License, Version 1.0. (See 5 // accompanying file LICENSE_1_0.txt or copy at 6 // http://www.boost.org/LICENSE_1_0.txt) 7 // 8 #define BOOST_LOCALE_SOURCE 9 #include <boost/locale/generator.hpp> 10 #include <boost/locale/encoding.hpp> 11 12 #include "../encoding/conv.hpp" 13 14 #include <boost/locale/util.hpp> 15 16 #ifdef BOOST_MSVC 17 # pragma warning(disable : 4244 4996) // loose data 18 #endif 19 20 #include <cstddef> 21 #include <string.h> 22 #include <vector> 23 #include <algorithm> 24 25 //#define DEBUG_CODECVT 26 27 #ifdef DEBUG_CODECVT 28 #include <iostream> 29 #endif 30 31 namespace boost { 32 namespace locale { 33 namespace util { 34 35 class utf8_converter : public base_converter { 36 public: max_len() const37 virtual int max_len() const 38 { 39 return 4; 40 } 41 clone() const42 virtual utf8_converter *clone() const 43 { 44 return new utf8_converter(); 45 } 46 is_thread_safe() const47 bool is_thread_safe() const 48 { 49 return true; 50 } 51 to_unicode(char const * & begin,char const * end)52 virtual uint32_t to_unicode(char const *&begin,char const *end) 53 { 54 char const *p=begin; 55 56 utf::code_point c = utf::utf_traits<char>::decode(p,end); 57 58 if(c==utf::illegal) 59 return illegal; 60 61 if(c==utf::incomplete) 62 return incomplete; 63 64 begin = p; 65 return c; 66 } 67 from_unicode(uint32_t u,char * begin,char const * end)68 virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end) 69 { 70 if(!utf::is_valid_codepoint(u)) 71 return illegal; 72 int width = utf::utf_traits<char>::width(u); 73 std::ptrdiff_t d=end-begin; 74 if(d < width) 75 return incomplete; 76 utf::utf_traits<char>::encode(u,begin); 77 return width; 78 } 79 }; // utf8_converter 80 81 class simple_converter : public base_converter { 82 public: 83 ~simple_converter()84 virtual ~simple_converter() 85 { 86 } 87 simple_converter(std::string const & encoding)88 simple_converter(std::string const &encoding) 89 { 90 for(unsigned i=0;i<128;i++) 91 to_unicode_tbl_[i]=i; 92 for(unsigned i=128;i<256;i++) { 93 char buf[2] = { char(i) , 0 }; 94 try { 95 std::wstring const tmp = conv::to_utf<wchar_t>(buf,buf+1,encoding,conv::stop); 96 if(tmp.size() == 1) { 97 to_unicode_tbl_[i] = tmp[0]; 98 } 99 else { 100 to_unicode_tbl_[i] = illegal; 101 } 102 } 103 catch(conv::conversion_error const &/*e*/) { 104 to_unicode_tbl_[i] = illegal; 105 } 106 } 107 from_unicode_tbl_.resize(256); 108 for(unsigned i=0;i<256;i++) { 109 from_unicode_tbl_[to_unicode_tbl_[i] & 0xFF].push_back(i); 110 } 111 } 112 max_len() const113 virtual int max_len() const 114 { 115 return 1; 116 } 117 is_thread_safe() const118 virtual bool is_thread_safe() const 119 { 120 return true; 121 } clone() const122 virtual base_converter *clone() const 123 { 124 return new simple_converter(*this); 125 } to_unicode(char const * & begin,char const * end)126 virtual uint32_t to_unicode(char const *&begin,char const *end) 127 { 128 if(begin==end) 129 return incomplete; 130 unsigned char c = *begin++; 131 return to_unicode_tbl_[c]; 132 } from_unicode(uint32_t u,char * begin,char const * end)133 virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end) 134 { 135 if(begin==end) 136 return incomplete; 137 std::vector<unsigned char> const &tbl = from_unicode_tbl_[u & 0xFF]; 138 for(std::vector<unsigned char>::const_iterator p=tbl.begin();p!=tbl.end();++p) { 139 if(to_unicode_tbl_[*p]==u) { 140 *begin++ = *p; 141 return 1; 142 } 143 } 144 return illegal; 145 } 146 private: 147 uint32_t to_unicode_tbl_[256]; 148 std::vector<std::vector<unsigned char> > from_unicode_tbl_; 149 }; 150 151 namespace { 152 char const *simple_encoding_table[] = { 153 "cp1250", 154 "cp1251", 155 "cp1252", 156 "cp1253", 157 "cp1254", 158 "cp1255", 159 "cp1256", 160 "cp1257", 161 "iso88591", 162 "iso885913", 163 "iso885915", 164 "iso88592", 165 "iso88593", 166 "iso88594", 167 "iso88595", 168 "iso88596", 169 "iso88597", 170 "iso88598", 171 "iso88599", 172 "koi8r", 173 "koi8u", 174 "usascii", 175 "windows1250", 176 "windows1251", 177 "windows1252", 178 "windows1253", 179 "windows1254", 180 "windows1255", 181 "windows1256", 182 "windows1257" 183 }; 184 compare_strings(char const * l,char const * r)185 bool compare_strings(char const *l,char const *r) 186 { 187 return strcmp(l,r) < 0; 188 } 189 } 190 191 create_simple_converter(std::string const & encoding)192 std::auto_ptr<base_converter> create_simple_converter(std::string const &encoding) 193 { 194 std::auto_ptr<base_converter> res; 195 std::string norm = conv::impl::normalize_encoding(encoding.c_str()); 196 if(std::binary_search<char const **>( simple_encoding_table, 197 simple_encoding_table + sizeof(simple_encoding_table)/sizeof(char const *), 198 norm.c_str(), 199 compare_strings)) 200 { 201 res.reset(new simple_converter(encoding)); 202 } 203 return res; 204 } 205 206 207 create_utf8_converter()208 std::auto_ptr<base_converter> create_utf8_converter() 209 { 210 std::auto_ptr<base_converter> res(new utf8_converter()); 211 return res; 212 } 213 214 // 215 // Traits for sizeof char 216 // 217 template<typename CharType,int n=sizeof(CharType)> 218 struct uchar_traits; 219 220 template<typename CharType> 221 struct uchar_traits<CharType,2> { 222 typedef uint16_t uint_type; 223 }; 224 template<typename CharType> 225 struct uchar_traits<CharType,4> { 226 typedef uint32_t uint_type; 227 }; 228 229 // Real codecvt 230 231 template<typename CharType> 232 class code_converter : public std::codecvt<CharType,char,std::mbstate_t> 233 { 234 public: code_converter(std::auto_ptr<base_converter> cvt,size_t refs=0)235 code_converter(std::auto_ptr<base_converter> cvt,size_t refs = 0) : 236 std::codecvt<CharType,char,std::mbstate_t>(refs), 237 cvt_(cvt) 238 { 239 max_len_ = cvt_->max_len(); 240 } 241 protected: 242 243 typedef CharType uchar; 244 do_unshift(std::mbstate_t & s,char * from,char *,char * & next) const245 virtual std::codecvt_base::result do_unshift(std::mbstate_t &s,char *from,char * /*to*/,char *&next) const 246 { 247 uint16_t &state = *reinterpret_cast<uint16_t *>(&s); 248 #ifdef DEBUG_CODECVT 249 std::cout << "Entering unshift " << std::hex << state << std::dec << std::endl; 250 #endif 251 if(state != 0) 252 return std::codecvt_base::error; 253 next=from; 254 return std::codecvt_base::ok; 255 } do_encoding() const256 virtual int do_encoding() const throw() 257 { 258 return 0; 259 } do_max_length() const260 virtual int do_max_length() const throw() 261 { 262 return max_len_; 263 } do_always_noconv() const264 virtual bool do_always_noconv() const throw() 265 { 266 return false; 267 } 268 269 virtual std::codecvt_base::result do_in(std::mbstate_t & state,char const * from,char const * from_end,char const * & from_next,uchar * uto,uchar * uto_end,uchar * & uto_next) const270 do_in( std::mbstate_t &state, 271 char const *from, 272 char const *from_end, 273 char const *&from_next, 274 uchar *uto, 275 uchar *uto_end, 276 uchar *&uto_next) const 277 { 278 typedef typename uchar_traits<uchar>::uint_type uint_type; 279 uint_type *to=reinterpret_cast<uint_type *>(uto); 280 uint_type *to_end=reinterpret_cast<uint_type *>(uto_end); 281 uint_type *&to_next=reinterpret_cast<uint_type *&>(uto_next); 282 return do_real_in(state,from,from_end,from_next,to,to_end,to_next); 283 } 284 285 virtual int do_length(std::mbstate_t & state,char const * from,char const * from_end,size_t max) const286 do_length( std::mbstate_t &state, 287 char const *from, 288 char const *from_end, 289 size_t max) const 290 { 291 char const *from_next=from; 292 std::vector<uchar> chrs(max+1); 293 uchar *to=&chrs.front(); 294 uchar *to_end=to+max; 295 uchar *to_next=to; 296 do_in(state,from,from_end,from_next,to,to_end,to_next); 297 return from_next-from; 298 } 299 300 virtual std::codecvt_base::result do_out(std::mbstate_t & state,uchar const * ufrom,uchar const * ufrom_end,uchar const * & ufrom_next,char * to,char * to_end,char * & to_next) const301 do_out( std::mbstate_t &state, 302 uchar const *ufrom, 303 uchar const *ufrom_end, 304 uchar const *&ufrom_next, 305 char *to, 306 char *to_end, 307 char *&to_next) const 308 { 309 typedef typename uchar_traits<uchar>::uint_type uint_type; 310 uint_type const *from=reinterpret_cast<uint_type const *>(ufrom); 311 uint_type const *from_end=reinterpret_cast<uint_type const *>(ufrom_end); 312 uint_type const *&from_next=reinterpret_cast<uint_type const *&>(ufrom_next); 313 return do_real_out(state,from,from_end,from_next,to,to_end,to_next); 314 } 315 316 317 private: 318 319 // 320 // Implementation for UTF-32 321 // 322 std::codecvt_base::result do_real_in(std::mbstate_t &,char const * from,char const * from_end,char const * & from_next,uint32_t * to,uint32_t * to_end,uint32_t * & to_next) const323 do_real_in( std::mbstate_t &/*state*/, 324 char const *from, 325 char const *from_end, 326 char const *&from_next, 327 uint32_t *to, 328 uint32_t *to_end, 329 uint32_t *&to_next) const 330 { 331 std::auto_ptr<base_converter> cvtp; 332 base_converter *cvt = 0; 333 if(cvt_->is_thread_safe()) { 334 cvt = cvt_.get(); 335 } 336 else { 337 cvtp.reset(cvt_->clone()); 338 cvt = cvtp.get(); 339 } 340 std::codecvt_base::result r=std::codecvt_base::ok; 341 while(to < to_end && from < from_end) 342 { 343 uint32_t ch=cvt->to_unicode(from,from_end); 344 if(ch==base_converter::illegal) { 345 r=std::codecvt_base::error; 346 break; 347 } 348 if(ch==base_converter::incomplete) { 349 r=std::codecvt_base::partial; 350 break; 351 } 352 *to++=ch; 353 } 354 from_next=from; 355 to_next=to; 356 if(r!=std::codecvt_base::ok) 357 return r; 358 if(from!=from_end) 359 return std::codecvt_base::partial; 360 return r; 361 } 362 363 // 364 // Implementation for UTF-32 365 // 366 std::codecvt_base::result do_real_out(std::mbstate_t &,uint32_t const * from,uint32_t const * from_end,uint32_t const * & from_next,char * to,char * to_end,char * & to_next) const367 do_real_out(std::mbstate_t &/*state*/, // state is not used there 368 uint32_t const *from, 369 uint32_t const *from_end, 370 uint32_t const *&from_next, 371 char *to, 372 char *to_end, 373 char *&to_next) const 374 { 375 std::auto_ptr<base_converter> cvtp; 376 base_converter *cvt = 0; 377 if(cvt_->is_thread_safe()) { 378 cvt = cvt_.get(); 379 } 380 else { 381 cvtp.reset(cvt_->clone()); 382 cvt = cvtp.get(); 383 } 384 385 std::codecvt_base::result r=std::codecvt_base::ok; 386 while(to < to_end && from < from_end) 387 { 388 uint32_t len=cvt->from_unicode(*from,to,to_end); 389 if(len==base_converter::illegal) { 390 r=std::codecvt_base::error; 391 break; 392 } 393 if(len==base_converter::incomplete) { 394 r=std::codecvt_base::partial; 395 break; 396 } 397 from++; 398 to+=len; 399 } 400 from_next=from; 401 to_next=to; 402 if(r!=std::codecvt_base::ok) 403 return r; 404 if(from!=from_end) 405 return std::codecvt_base::partial; 406 return r; 407 } 408 409 // 410 // Implementation for UTF-16 411 // 412 std::codecvt_base::result do_real_in(std::mbstate_t & std_state,char const * from,char const * from_end,char const * & from_next,uint16_t * to,uint16_t * to_end,uint16_t * & to_next) const413 do_real_in( std::mbstate_t &std_state, 414 char const *from, 415 char const *from_end, 416 char const *&from_next, 417 uint16_t *to, 418 uint16_t *to_end, 419 uint16_t *&to_next) const 420 { 421 std::auto_ptr<base_converter> cvtp; 422 base_converter *cvt = 0; 423 if(cvt_->is_thread_safe()) { 424 cvt = cvt_.get(); 425 } 426 else { 427 cvtp.reset(cvt_->clone()); 428 cvt = cvtp.get(); 429 } 430 std::codecvt_base::result r=std::codecvt_base::ok; 431 // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT()) 432 // according to standard. We use it to keed a flag 0/1 for surrogate pair writing 433 // 434 // if 0 no code above >0xFFFF observed, of 1 a code above 0xFFFF observerd 435 // and first pair is written, but no input consumed 436 uint16_t &state = *reinterpret_cast<uint16_t *>(&std_state); 437 while(to < to_end && from < from_end) 438 { 439 #ifdef DEBUG_CODECVT 440 std::cout << "Entering IN--------------" << std::endl; 441 std::cout << "State " << std::hex << state <<std::endl; 442 std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl; 443 #endif 444 char const *from_saved = from; 445 uint32_t ch=cvt->to_unicode(from,from_end); 446 if(ch==base_converter::illegal) { 447 r=std::codecvt_base::error; 448 break; 449 } 450 if(ch==base_converter::incomplete) { 451 r=std::codecvt_base::partial; 452 break; 453 } 454 // Normal codepoints go direcly to stream 455 if(ch <= 0xFFFF) { 456 *to++=ch; 457 } 458 else { 459 // for other codepoints we do following 460 // 461 // 1. We can't consume our input as we may find ourselfs 462 // in state where all input consumed but not all output written,i.e. only 463 // 1st pair is written 464 // 2. We only write first pair and mark this in the state, we also revert back 465 // the from pointer in order to make sure this codepoint would be read 466 // once again and then we would consume our input together with writing 467 // second surrogate pair 468 ch-=0x10000; 469 uint16_t vh = ch >> 10; 470 uint16_t vl = ch & 0x3FF; 471 uint16_t w1 = vh + 0xD800; 472 uint16_t w2 = vl + 0xDC00; 473 if(state == 0) { 474 from = from_saved; 475 *to++ = w1; 476 state = 1; 477 } 478 else { 479 *to++ = w2; 480 state = 0; 481 } 482 } 483 } 484 from_next=from; 485 to_next=to; 486 if(r == std::codecvt_base::ok && (from!=from_end || state!=0)) 487 r = std::codecvt_base::partial; 488 #ifdef DEBUG_CODECVT 489 std::cout << "Returning "; 490 switch(r) { 491 case std::codecvt_base::ok: 492 std::cout << "ok" << std::endl; 493 break; 494 case std::codecvt_base::partial: 495 std::cout << "partial" << std::endl; 496 break; 497 case std::codecvt_base::error: 498 std::cout << "error" << std::endl; 499 break; 500 default: 501 std::cout << "other" << std::endl; 502 break; 503 } 504 std::cout << "State " << std::hex << state <<std::endl; 505 std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl; 506 #endif 507 return r; 508 } 509 510 //encoding// Implementation for UTF-16 511 // 512 std::codecvt_base::result do_real_out(std::mbstate_t & std_state,uint16_t const * from,uint16_t const * from_end,uint16_t const * & from_next,char * to,char * to_end,char * & to_next) const513 do_real_out(std::mbstate_t &std_state, 514 uint16_t const *from, 515 uint16_t const *from_end, 516 uint16_t const *&from_next, 517 char *to, 518 char *to_end, 519 char *&to_next) const 520 { 521 std::auto_ptr<base_converter> cvtp; 522 base_converter *cvt = 0; 523 if(cvt_->is_thread_safe()) { 524 cvt = cvt_.get(); 525 } 526 else { 527 cvtp.reset(cvt_->clone()); 528 cvt = cvtp.get(); 529 } 530 std::codecvt_base::result r=std::codecvt_base::ok; 531 // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT()) 532 // according to standard. We assume that sizeof(mbstate_t) >=2 in order 533 // to be able to store first observerd surrogate pair 534 // 535 // State: state!=0 - a first surrogate pair was observerd (state = first pair), 536 // we expect the second one to come and then zero the state 537 /// 538 uint16_t &state = *reinterpret_cast<uint16_t *>(&std_state); 539 while(to < to_end && from < from_end) 540 { 541 #ifdef DEBUG_CODECVT 542 std::cout << "Entering OUT --------------" << std::endl; 543 std::cout << "State " << std::hex << state <<std::endl; 544 std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl; 545 #endif 546 uint32_t ch=0; 547 if(state != 0) { 548 // if the state idecates that 1st surrogate pair was written 549 // we should make sure that the second one that comes is actually 550 // second surrogate 551 uint16_t w1 = state; 552 uint16_t w2 = *from; 553 // we don't forward from as writing may fail to incomplete or 554 // partial conversion 555 if(0xDC00 <= w2 && w2<=0xDFFF) { 556 uint16_t vh = w1 - 0xD800; 557 uint16_t vl = w2 - 0xDC00; 558 ch=((uint32_t(vh) << 10) | vl) + 0x10000; 559 } 560 else { 561 // Invalid surrogate 562 r=std::codecvt_base::error; 563 break; 564 } 565 } 566 else { 567 ch = *from; 568 if(0xD800 <= ch && ch<=0xDBFF) { 569 // if this is a first surrogate pair we put 570 // it into the state and consume it, note we don't 571 // go forward as it should be illegal so we increase 572 // the from pointer manually 573 state = ch; 574 from++; 575 continue; 576 } 577 else if(0xDC00 <= ch && ch<=0xDFFF) { 578 // if we observe second surrogate pair and 579 // first only may be expected we should break from the loop with error 580 // as it is illegal input 581 r=std::codecvt_base::error; 582 break; 583 } 584 } 585 586 uint32_t len=cvt->from_unicode(ch,to,to_end); 587 if(len==base_converter::illegal) { 588 r=std::codecvt_base::error; 589 break; 590 } 591 if(len==base_converter::incomplete) { 592 r=std::codecvt_base::partial; 593 break; 594 } 595 state = 0; 596 to+=len; 597 from++; 598 } 599 from_next=from; 600 to_next=to; 601 if(r==std::codecvt_base::ok && from!=from_end) 602 r = std::codecvt_base::partial; 603 #ifdef DEBUG_CODECVT 604 std::cout << "Returning "; 605 switch(r) { 606 case std::codecvt_base::ok: 607 std::cout << "ok" << std::endl; 608 break; 609 case std::codecvt_base::partial: 610 std::cout << "partial" << std::endl; 611 break; 612 case std::codecvt_base::error: 613 std::cout << "error" << std::endl; 614 break; 615 default: 616 std::cout << "other" << std::endl; 617 break; 618 } 619 std::cout << "State " << std::hex << state <<std::endl; 620 std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl; 621 #endif 622 return r; 623 } 624 625 int max_len_; 626 std::auto_ptr<base_converter> cvt_; 627 628 }; 629 630 static const char ensure_mbstate_size_is_at_least_2[sizeof(std::mbstate_t) >= 2 ? 1 : -1] = {0}; 631 632 template<> 633 class code_converter<char> : public std::codecvt<char,char,std::mbstate_t> 634 { 635 public: code_converter(std::auto_ptr<base_converter>,size_t refs=0)636 code_converter(std::auto_ptr<base_converter> /*cvt*/,size_t refs = 0) : 637 std::codecvt<char,char,std::mbstate_t>(refs) 638 { 639 } 640 }; 641 642 create_codecvt(std::locale const & in,std::auto_ptr<base_converter> cvt,character_facet_type type)643 std::locale create_codecvt(std::locale const &in,std::auto_ptr<base_converter> cvt,character_facet_type type) 644 { 645 if(!cvt.get()) 646 cvt.reset(new base_converter()); 647 switch(type) { 648 case char_facet: 649 return std::locale(in,new code_converter<char>(cvt)); 650 case wchar_t_facet: 651 return std::locale(in,new code_converter<wchar_t>(cvt)); 652 #if defined(BOOST_HAS_CHAR16_T) && !defined(BOOST_NO_CHAR16_T_CODECVT) 653 case char16_t_facet: 654 return std::locale(in,new code_converter<char16_t>(cvt)); 655 #endif 656 #if defined(BOOST_HAS_CHAR32_T) && !defined(BOOST_NO_CHAR32_T_CODECVT) 657 case char32_t_facet: 658 return std::locale(in,new code_converter<char32_t>(cvt)); 659 #endif 660 default: 661 return in; 662 } 663 } 664 665 666 } // util 667 } // locale 668 } // boost 669 670 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 671