// Locale support (codecvt) -*- C++ -*-

// Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
// 2008, 2009, 2010
// Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

//
// ISO C++ 14882: 22.2.1.5 Template class codecvt
//

// Written by Benjamin Kosnik <bkoz@redhat.com>

/** @file ext/codecvt_specializations.h
 *  This file is a GNU extension to the Standard C++ Library.
 */

#ifndef _EXT_CODECVT_SPECIALIZATIONS_H
#define _EXT_CODECVT_SPECIALIZATIONS_H 1

#include <bits/c++config.h>
#include <cerrno>
#include <locale>
#include <iconv.h>

namespace __gnu_cxx _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION

  /// Extension to use iconv for dealing with character encodings.
  // This includes conversions and comparisons between various character
  // sets.  This object encapsulates data that may need to be shared between
  // char_traits, codecvt and ctype.
  //
  // An object owns up to two iconv conversion descriptors (external ->
  // internal and internal -> external).  They are opened by init() from
  // the two encoding names and closed by destroy().
  class encoding_state
  {
  public:
    // Types:
    // NB: A conversion descriptor subsumes and enhances the
    // functionality of a simple state type such as mbstate_t.
    typedef iconv_t     descriptor_type;

  protected:
    // Name of internal character set encoding.
    std::string         _M_int_enc;

    // Name of external character set encoding.
    std::string         _M_ext_enc;

    // Conversion descriptor between external encoding to internal encoding.
    descriptor_type     _M_in_desc;

    // Conversion descriptor between internal encoding to external encoding.
    descriptor_type     _M_out_desc;

    // The byte-order marker for the external encoding, if necessary.
    int                 _M_ext_bom;

    // The byte-order marker for the internal encoding, if necessary.
    int                 _M_int_bom;

    // Number of external bytes needed to construct one complete
    // character in the internal encoding.
    // NB: -1 indicates variable, or stateful, encodings.
    int                 _M_bytes;

  public:
    // Creates a state with no encodings and no descriptors; good() is
    // false for such an object.
    explicit
    encoding_state()
    : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0)
    { }

    // Creates a state converting between the internal encoding __int and
    // the external encoding __ext and opens both descriptors.
    // Throws (via std::__throw_runtime_error) if iconv_open fails.
    explicit
    encoding_state(const char* __int, const char* __ext,
                   int __ibom = 0, int __ebom = 0, int __bytes = 1)
    : _M_int_enc(__int), _M_ext_enc(__ext), _M_in_desc(0), _M_out_desc(0),
      _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes)
    { init(); }

    // 21.1.2 traits typedefs
    // p4
    // typedef STATE_T state_type
    // requires: state_type shall meet the requirements of
    // CopyConstructible types (20.1.3)
    // NB: This does not preserve the actual state of the conversion
    // descriptor member, but it does duplicate the encoding
    // information.
    encoding_state(const encoding_state& __obj) : _M_in_desc(0), _M_out_desc(0)
    { construct(__obj); }

    // Need assignment operator as well.
    encoding_state&
    operator=(const encoding_state& __obj)
    {
      // Self-assignment must not tear down and reopen our own descriptors.
      if (this != &__obj)
        construct(__obj);
      return *this;
    }

    ~encoding_state()
    { destroy(); }

    // True only when both descriptors are open, i.e. neither is null nor
    // the iconv error value.
    bool
    good() const throw()
    {
      const descriptor_type __err = (iconv_t)(-1);
      bool __test = _M_in_desc && _M_in_desc != __err;
      __test &= _M_out_desc && _M_out_desc != __err;
      return __test;
    }

    int
    character_ratio() const
    { return _M_bytes; }

    const std::string
    internal_encoding() const
    { return _M_int_enc; }

    int
    internal_bom() const
    { return _M_int_bom; }

    const std::string
    external_encoding() const
    { return _M_ext_enc; }

    int
    external_bom() const
    { return _M_ext_bom; }

    const descriptor_type&
    in_descriptor() const
    { return _M_in_desc; }

    const descriptor_type&
    out_descriptor() const
    { return _M_out_desc; }

  protected:
    // Opens whichever descriptor is still null, provided both encoding
    // names are non-empty.  Throws on iconv_open failure.
    void
    init()
    {
      const descriptor_type __err = (iconv_t)(-1);
      const bool __have_encodings = _M_int_enc.size() && _M_ext_enc.size();
      if (!_M_in_desc && __have_encodings)
        {
          _M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str());
          if (_M_in_desc == __err)
            std::__throw_runtime_error(__N("encoding_state::_M_init "
                                    "creating iconv input descriptor failed"));
        }
      if (!_M_out_desc && __have_encodings)
        {
          _M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str());
          if (_M_out_desc == __err)
            {
              // When called from a constructor the destructor will not
              // run, so release the input descriptor here rather than
              // leaking it.
              destroy();
              std::__throw_runtime_error(__N("encoding_state::_M_init "
                                  "creating iconv output descriptor failed"));
            }
        }
    }

    // Replaces this object's encoding information with a copy of __obj's
    // and opens fresh descriptors for it.
    void
    construct(const encoding_state& __obj)
    {
      destroy();
      _M_int_enc = __obj._M_int_enc;
      _M_ext_enc = __obj._M_ext_enc;
      _M_ext_bom = __obj._M_ext_bom;
      _M_int_bom = __obj._M_int_bom;
      _M_bytes = __obj._M_bytes;
      init();
    }

    // Closes and nulls every descriptor that is currently open.
    void
    destroy() throw()
    {
      const descriptor_type __err = (iconv_t)(-1);
      if (_M_in_desc && _M_in_desc != __err)
        {
          iconv_close(_M_in_desc);
          _M_in_desc = 0;
        }
      if (_M_out_desc && _M_out_desc != __err)
        {
          iconv_close(_M_out_desc);
          _M_out_desc = 0;
        }
    }
  };

  /// encoding_char_traits
  // Custom traits type with encoding_state for the state type, and the
  // associated fpos<encoding_state> for the position type, all other
  // bits equivalent to the required char_traits instantiations.
  template<typename _CharT>
    struct encoding_char_traits : public std::char_traits<_CharT>
    {
      typedef encoding_state                            state_type;
      typedef typename std::fpos<state_type>            pos_type;
    };

_GLIBCXX_END_NAMESPACE_VERSION
} // namespace


namespace std _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION

  using __gnu_cxx::encoding_state;

  /// codecvt<InternT, _ExternT, encoding_state> specialization.
  // This partial specialization takes advantage of iconv to provide
  // code conversions between a large number of character encodings.
  template<typename _InternT, typename _ExternT>
    class codecvt<_InternT, _ExternT, encoding_state>
    : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state>
    {
    public:
      // Types:
      typedef codecvt_base::result                      result;
      typedef _InternT                                  intern_type;
      typedef _ExternT                                  extern_type;
      typedef __gnu_cxx::encoding_state                 state_type;
      typedef state_type::descriptor_type               descriptor_type;

      // Data Members:
      static locale::id                 id;

      explicit
      codecvt(size_t __refs = 0)
      : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
      { }

      // NB: __enc is accepted for interface compatibility only; it is
      // not stored or read.
      explicit
      codecvt(state_type& __enc, size_t __refs = 0)
      : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
      { }

     protected:
      virtual
      ~codecvt() { }

      virtual result
      do_out(state_type& __state, const intern_type* __from,
             const intern_type* __from_end, const intern_type*& __from_next,
             extern_type* __to, extern_type* __to_end,
             extern_type*& __to_next) const;

      virtual result
      do_unshift(state_type& __state, extern_type* __to,
                 extern_type* __to_end, extern_type*& __to_next) const;

      virtual result
      do_in(state_type& __state, const extern_type* __from,
            const extern_type* __from_end, const extern_type*& __from_next,
            intern_type* __to, intern_type* __to_end,
            intern_type*& __to_next) const;

      virtual int
      do_encoding() const throw();

      virtual bool
      do_always_noconv() const throw();

      virtual int
      do_length(state_type&, const extern_type* __from,
                const extern_type* __end, size_t __max) const;

      virtual int
      do_max_length() const throw();
    };

  template<typename _InternT, typename _ExternT>
    locale::id
    codecvt<_InternT, _ExternT, encoding_state>::id;

  // This adaptor works around the signature problems of the second
  // argument to iconv():  SUSv2 and others use 'const char**', but glibc 2.2
  // uses 'char**', which matches the POSIX 1003.1-2001 standard.
  // Using this adaptor, g++ will do the work for us.
  template<typename _Tp>
    inline size_t
    __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*),
                    iconv_t __cd, char** __inbuf, size_t* __inbytes,
                    char** __outbuf, size_t* __outbytes)
    { return __func(__cd, (_Tp)__inbuf, __inbytes, __outbuf, __outbytes); }

  // Converts [__from, __from_end) from the internal to the external
  // encoding using the state's output descriptor.
  // Returns ok when iconv succeeds, partial when iconv fails after
  // consuming some input, and error otherwise (including a state that is
  // not good()).  __from_next and __to_next always point into the
  // caller's ranges, one past the last element consumed / produced.
  template<typename _InternT, typename _ExternT>
    codecvt_base::result
    codecvt<_InternT, _ExternT, encoding_state>::
    do_out(state_type& __state, const intern_type* __from,
           const intern_type* __from_end, const intern_type*& __from_next,
           extern_type* __to, extern_type* __to_end,
           extern_type*& __to_next) const
    {
      // Until proven otherwise nothing has been consumed or produced.
      __from_next = __from;
      __to_next = __to;

      result __ret = codecvt_base::error;
      if (__state.good())
        {
          const descriptor_type& __desc = __state.out_descriptor();
          const size_t __fmultiple = sizeof(intern_type);
          const size_t __tmultiple = sizeof(extern_type);
          const size_t __size = __from_end - __from;
          size_t __fbytes = __fmultiple * __size;
          size_t __tbytes = __tmultiple * (__to_end - __to);

          // Argument list for iconv specifies a byte sequence.  Thus,
          // all to/from arrays must be brutally casted to char*.
          char* __cto = reinterpret_cast<char*>(__to);
          char* __cfrom
            = reinterpret_cast<char*>(const_cast<intern_type*>(__from));

          // Some encodings need a byte order marker as the first item
          // in the byte stream, to designate endian-ness.  The default
          // value for the byte order marker is NULL, so if this is
          // the case, it's not necessary and we can just go on our
          // merry way.
          // NOTE(review): the marker is prepended on every call, not
          // only the first one for a stream -- confirm this is intended.
          const int __int_bom = __state.internal_bom();
          if (__int_bom)
            {
              intern_type* __cfixed = static_cast<intern_type*>
                (__builtin_alloca(sizeof(intern_type) * (__size + 1)));
              __cfixed[0] = static_cast<intern_type>(__int_bom);
              char_traits<intern_type>::copy(__cfixed + 1, __from, __size);
              __cfrom = reinterpret_cast<char*>(__cfixed);
              // The scratch buffer is one element longer than the input.
              __fbytes += __fmultiple;
            }

          const char* const __cbegin = __cfrom;
          const size_t __ftotal = __fbytes;
          const size_t __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
                                                &__fbytes, &__cto, &__tbytes);

          const bool __done = __conv != size_t(-1);
          if (__done || __fbytes < __ftotal)
            {
              if (__int_bom)
                {
                  // Map the position in the scratch buffer back onto the
                  // caller's range, discounting the marker element.
                  const size_t __used
                    = static_cast<size_t>(__cfrom - __cbegin) / __fmultiple;
                  __from_next = __from + (__used ? __used - 1 : 0);
                }
              else
                __from_next = reinterpret_cast<const intern_type*>(__cfrom);
              __to_next = reinterpret_cast<extern_type*>(__cto);
              __ret = __done ? codecvt_base::ok : codecvt_base::partial;
            }
        }
      return __ret;
    }

  // Writes whatever the output conversion needs in order to return to
  // its initial shift state into [__to, __to_end).
  // Returns noconv when nothing had to be written, ok when the sequence
  // was written, partial when the buffer was too small (E2BIG) and error
  // otherwise (including a state that is not good()).
  template<typename _InternT, typename _ExternT>
    codecvt_base::result
    codecvt<_InternT, _ExternT, encoding_state>::
    do_unshift(state_type& __state, extern_type* __to,
               extern_type* __to_end, extern_type*& __to_next) const
    {
      __to_next = __to;

      result __ret = codecvt_base::error;
      if (__state.good())
        {
          // The shift state to terminate is the one accumulated by
          // do_out, and the destination holds extern_type elements.
          const descriptor_type& __desc = __state.out_descriptor();
          const size_t __tmultiple = sizeof(extern_type);
          const size_t __ttotal = __tmultiple * (__to_end - __to);
          size_t __tlen = __ttotal;

          // Argument list for iconv specifies a byte sequence.  Thus,
          // all to/from arrays must be brutally casted to char*.
          char* __cto = reinterpret_cast<char*>(__to);
          const size_t __conv = __iconv_adaptor(iconv, __desc, 0, 0,
                                                &__cto, &__tlen);

          if (__conv != size_t(-1))
            {
              __to_next = reinterpret_cast<extern_type*>(__cto);
              if (__tlen == __ttotal)
                __ret = codecvt_base::noconv;
              else
                __ret = codecvt_base::ok;
            }
          else if (errno == E2BIG)
            __ret = codecvt_base::partial;
        }
      return __ret;
    }

  // Converts [__from, __from_end) from the external to the internal
  // encoding using the state's input descriptor.
  // Returns ok when iconv succeeds, partial when iconv fails after
  // consuming some input, and error otherwise (including a state that is
  // not good()).  __from_next and __to_next always point into the
  // caller's ranges, one past the last element consumed / produced.
  template<typename _InternT, typename _ExternT>
    codecvt_base::result
    codecvt<_InternT, _ExternT, encoding_state>::
    do_in(state_type& __state, const extern_type* __from,
          const extern_type* __from_end, const extern_type*& __from_next,
          intern_type* __to, intern_type* __to_end,
          intern_type*& __to_next) const
    {
      // Until proven otherwise nothing has been consumed or produced.
      __from_next = __from;
      __to_next = __to;

      result __ret = codecvt_base::error;
      if (__state.good())
        {
          const descriptor_type& __desc = __state.in_descriptor();
          const size_t __fmultiple = sizeof(extern_type);
          const size_t __tmultiple = sizeof(intern_type);
          const size_t __size = __from_end - __from;
          size_t __flen = __fmultiple * __size;
          size_t __tlen = __tmultiple * (__to_end - __to);

          // Argument list for iconv specifies a byte sequence.  Thus,
          // all to/from arrays must be brutally casted to char*.
          char* __cto = reinterpret_cast<char*>(__to);
          char* __cfrom
            = reinterpret_cast<char*>(const_cast<extern_type*>(__from));

          // Some encodings need a byte order marker as the first item
          // in the byte stream, to designate endian-ness.  The default
          // value for the byte order marker is NULL, so if this is
          // the case, it's not necessary and we can just go on our
          // merry way.
          // NOTE(review): the marker is prepended on every call, not
          // only the first one for a stream -- confirm this is intended.
          const int __ext_bom = __state.external_bom();
          if (__ext_bom)
            {
              extern_type* __cfixed = static_cast<extern_type*>
                (__builtin_alloca(sizeof(extern_type) * (__size + 1)));
              __cfixed[0] = static_cast<extern_type>(__ext_bom);
              char_traits<extern_type>::copy(__cfixed + 1, __from, __size);
              __cfrom = reinterpret_cast<char*>(__cfixed);
              // The scratch buffer is one element longer than the input.
              __flen += __fmultiple;
            }

          const char* const __cbegin = __cfrom;
          const size_t __ftotal = __flen;
          const size_t __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
                                                &__flen, &__cto, &__tlen);

          // Progress is measured in bytes on both sides of the
          // comparison, exactly as in do_out.
          const bool __done = __conv != size_t(-1);
          if (__done || __flen < __ftotal)
            {
              if (__ext_bom)
                {
                  // Map the position in the scratch buffer back onto the
                  // caller's range, discounting the marker element.
                  const size_t __used
                    = static_cast<size_t>(__cfrom - __cbegin) / __fmultiple;
                  __from_next = __from + (__used ? __used - 1 : 0);
                }
              else
                __from_next = reinterpret_cast<const extern_type*>(__cfrom);
              __to_next = reinterpret_cast<intern_type*>(__cto);
              __ret = __done ? codecvt_base::ok : codecvt_base::partial;
            }
        }
      return __ret;
    }

  // Returns sizeof(_InternT) / sizeof(_ExternT) when the external type
  // is no wider than the internal one, and 0 otherwise.
  template<typename _InternT, typename _ExternT>
    int
    codecvt<_InternT, _ExternT, encoding_state>::
    do_encoding() const throw()
    {
      int __ret = 0;
      if (sizeof(_ExternT) <= sizeof(_InternT))
        __ret = sizeof(_InternT) / sizeof(_ExternT);
      return __ret;
    }

  template<typename _InternT, typename _ExternT>
    bool
    codecvt<_InternT, _ExternT, encoding_state>::
    do_always_noconv() const throw()
    { return false; }

  // Returns the smaller of __max and the number of external elements in
  // [__from, __end); the state argument is not consulted.
  template<typename _InternT, typename _ExternT>
    int
    codecvt<_InternT, _ExternT, encoding_state>::
    do_length(state_type&, const extern_type* __from,
              const extern_type* __end, size_t __max) const
    { return std::min(__max, static_cast<size_t>(__end - __from)); }

  // _GLIBCXX_RESOLVE_LIB_DEFECTS
  // 74.  Garbled text for codecvt::do_max_length
  template<typename _InternT, typename _ExternT>
    int
    codecvt<_InternT, _ExternT, encoding_state>::
    do_max_length() const throw()
    { return 1; }

_GLIBCXX_END_NAMESPACE_VERSION
} // namespace

#endif