1 // Locale support (codecvt) -*- C++ -*-
2 
3 // Copyright (C) 2000-2013 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library.  This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 //
26 // ISO C++ 14882: 22.2.1.5 Template class codecvt
27 //
28 
29 // Written by Benjamin Kosnik <bkoz@redhat.com>
30 
31 /** @file ext/codecvt_specializations.h
32  *  This file is a GNU extension to the Standard C++ Library.
33  */
34 
35 #ifndef _EXT_CODECVT_SPECIALIZATIONS_H
36 #define _EXT_CODECVT_SPECIALIZATIONS_H 1
37 
38 #include <bits/c++config.h>
39 #include <locale>
40 #include <iconv.h>
41 
_GLIBCXX_VISIBILITY(default)42 namespace __gnu_cxx _GLIBCXX_VISIBILITY(default)
43 {
44 _GLIBCXX_BEGIN_NAMESPACE_VERSION
45 
46   /// Extension to use iconv for dealing with character encodings.
47   // This includes conversions and comparisons between various character
48   // sets.  This object encapsulates data that may need to be shared between
49   // char_traits, codecvt and ctype.
50   class encoding_state
51   {
52   public:
53     // Types:
54     // NB: A conversion descriptor subsumes and enhances the
55     // functionality of a simple state type such as mbstate_t.
56     typedef iconv_t	descriptor_type;
57 
58   protected:
59     // Name of internal character set encoding.
60     std::string	       	_M_int_enc;
61 
62     // Name of external character set encoding.
63     std::string  	_M_ext_enc;
64 
65     // Conversion descriptor between external encoding to internal encoding.
66     descriptor_type	_M_in_desc;
67 
68     // Conversion descriptor between internal encoding to external encoding.
69     descriptor_type	_M_out_desc;
70 
71     // The byte-order marker for the external encoding, if necessary.
72     int			_M_ext_bom;
73 
74     // The byte-order marker for the internal encoding, if necessary.
75     int			_M_int_bom;
76 
77     // Number of external bytes needed to construct one complete
78     // character in the internal encoding.
79     // NB: -1 indicates variable, or stateful, encodings.
80     int 		_M_bytes;
81 
82   public:
83     explicit
84     encoding_state()
85     : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0)
86     { }
87 
88     explicit
89     encoding_state(const char* __int, const char* __ext,
90 		   int __ibom = 0, int __ebom = 0, int __bytes = 1)
91     : _M_int_enc(__int), _M_ext_enc(__ext), _M_in_desc(0), _M_out_desc(0),
92       _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes)
93     { init(); }
94 
95     // 21.1.2 traits typedefs
96     // p4
97     // typedef STATE_T state_type
98     // requires: state_type shall meet the requirements of
99     // CopyConstructible types (20.1.3)
100     // NB: This does not preserve the actual state of the conversion
101     // descriptor member, but it does duplicate the encoding
102     // information.
103     encoding_state(const encoding_state& __obj) : _M_in_desc(0), _M_out_desc(0)
104     { construct(__obj); }
105 
106     // Need assignment operator as well.
107     encoding_state&
108     operator=(const encoding_state& __obj)
109     {
110       construct(__obj);
111       return *this;
112     }
113 
114     ~encoding_state()
115     { destroy(); }
116 
117     bool
118     good() const throw()
119     {
120       const descriptor_type __err = (iconv_t)(-1);
121       bool __test = _M_in_desc && _M_in_desc != __err;
122       __test &=  _M_out_desc && _M_out_desc != __err;
123       return __test;
124     }
125 
126     int
127     character_ratio() const
128     { return _M_bytes; }
129 
130     const std::string
131     internal_encoding() const
132     { return _M_int_enc; }
133 
134     int
135     internal_bom() const
136     { return _M_int_bom; }
137 
138     const std::string
139     external_encoding() const
140     { return _M_ext_enc; }
141 
142     int
143     external_bom() const
144     { return _M_ext_bom; }
145 
146     const descriptor_type&
147     in_descriptor() const
148     { return _M_in_desc; }
149 
150     const descriptor_type&
151     out_descriptor() const
152     { return _M_out_desc; }
153 
154   protected:
155     void
156     init()
157     {
158       const descriptor_type __err = (iconv_t)(-1);
159       const bool __have_encodings = _M_int_enc.size() && _M_ext_enc.size();
160       if (!_M_in_desc && __have_encodings)
161 	{
162 	  _M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str());
163 	  if (_M_in_desc == __err)
164 	    std::__throw_runtime_error(__N("encoding_state::_M_init "
165 				    "creating iconv input descriptor failed"));
166 	}
167       if (!_M_out_desc && __have_encodings)
168 	{
169 	  _M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str());
170 	  if (_M_out_desc == __err)
171 	    std::__throw_runtime_error(__N("encoding_state::_M_init "
172 				  "creating iconv output descriptor failed"));
173 	}
174     }
175 
176     void
177     construct(const encoding_state& __obj)
178     {
179       destroy();
180       _M_int_enc = __obj._M_int_enc;
181       _M_ext_enc = __obj._M_ext_enc;
182       _M_ext_bom = __obj._M_ext_bom;
183       _M_int_bom = __obj._M_int_bom;
184       _M_bytes = __obj._M_bytes;
185       init();
186     }
187 
188     void
189     destroy() throw()
190     {
191       const descriptor_type __err = (iconv_t)(-1);
192       if (_M_in_desc && _M_in_desc != __err)
193 	{
194 	  iconv_close(_M_in_desc);
195 	  _M_in_desc = 0;
196 	}
197       if (_M_out_desc && _M_out_desc != __err)
198 	{
199 	  iconv_close(_M_out_desc);
200 	  _M_out_desc = 0;
201 	}
202     }
203   };
204 
205   /// encoding_char_traits
206   // Custom traits type with encoding_state for the state type, and the
207   // associated fpos<encoding_state> for the position type, all other
208   // bits equivalent to the required char_traits instantiations.
209   template<typename _CharT>
210     struct encoding_char_traits : public std::char_traits<_CharT>
211     {
212       typedef encoding_state				state_type;
213       typedef typename std::fpos<state_type>		pos_type;
214     };
215 
216 _GLIBCXX_END_NAMESPACE_VERSION
217 } // namespace
218 
219 
_GLIBCXX_VISIBILITY(default)220 namespace std _GLIBCXX_VISIBILITY(default)
221 {
222 _GLIBCXX_BEGIN_NAMESPACE_VERSION
223 
224   using __gnu_cxx::encoding_state;
225 
226   /// codecvt<InternT, _ExternT, encoding_state> specialization.
227   // This partial specialization takes advantage of iconv to provide
228   // code conversions between a large number of character encodings.
229   template<typename _InternT, typename _ExternT>
230     class codecvt<_InternT, _ExternT, encoding_state>
231     : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state>
232     {
233     public:
234       // Types:
235       typedef codecvt_base::result			result;
236       typedef _InternT 					intern_type;
237       typedef _ExternT 					extern_type;
238       typedef __gnu_cxx::encoding_state 		state_type;
239       typedef state_type::descriptor_type 		descriptor_type;
240 
241       // Data Members:
242       static locale::id 		id;
243 
244       explicit
245       codecvt(size_t __refs = 0)
246       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
247       { }
248 
249       explicit
250       codecvt(state_type& __enc, size_t __refs = 0)
251       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
252       { }
253 
254      protected:
255       virtual
256       ~codecvt() { }
257 
258       virtual result
259       do_out(state_type& __state, const intern_type* __from,
260 	     const intern_type* __from_end, const intern_type*& __from_next,
261 	     extern_type* __to, extern_type* __to_end,
262 	     extern_type*& __to_next) const;
263 
264       virtual result
265       do_unshift(state_type& __state, extern_type* __to,
266 		 extern_type* __to_end, extern_type*& __to_next) const;
267 
268       virtual result
269       do_in(state_type& __state, const extern_type* __from,
270 	    const extern_type* __from_end, const extern_type*& __from_next,
271 	    intern_type* __to, intern_type* __to_end,
272 	    intern_type*& __to_next) const;
273 
274       virtual int
275       do_encoding() const throw();
276 
277       virtual bool
278       do_always_noconv() const throw();
279 
280       virtual int
281       do_length(state_type&, const extern_type* __from,
282 		const extern_type* __end, size_t __max) const;
283 
284       virtual int
285       do_max_length() const throw();
286     };
287 
288   template<typename _InternT, typename _ExternT>
289     locale::id
290     codecvt<_InternT, _ExternT, encoding_state>::id;
291 
292   // This adaptor works around the signature problems of the second
293   // argument to iconv():  SUSv2 and others use 'const char**', but glibc 2.2
294   // uses 'char**', which matches the POSIX 1003.1-2001 standard.
295   // Using this adaptor, g++ will do the work for us.
296   template<typename _Tp>
297     inline size_t
298     __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*),
299                     iconv_t __cd, char** __inbuf, size_t* __inbytes,
300                     char** __outbuf, size_t* __outbytes)
301     { return __func(__cd, (_Tp)__inbuf, __inbytes, __outbuf, __outbytes); }
302 
303   template<typename _InternT, typename _ExternT>
304     codecvt_base::result
305     codecvt<_InternT, _ExternT, encoding_state>::
306     do_out(state_type& __state, const intern_type* __from,
307 	   const intern_type* __from_end, const intern_type*& __from_next,
308 	   extern_type* __to, extern_type* __to_end,
309 	   extern_type*& __to_next) const
310     {
311       result __ret = codecvt_base::error;
312       if (__state.good())
313 	{
314 	  const descriptor_type& __desc = __state.out_descriptor();
315 	  const size_t __fmultiple = sizeof(intern_type);
316 	  size_t __fbytes = __fmultiple * (__from_end - __from);
317 	  const size_t __tmultiple = sizeof(extern_type);
318 	  size_t __tbytes = __tmultiple * (__to_end - __to);
319 
320 	  // Argument list for iconv specifies a byte sequence. Thus,
321 	  // all to/from arrays must be brutally casted to char*.
322 	  char* __cto = reinterpret_cast<char*>(__to);
323 	  char* __cfrom;
324 	  size_t __conv;
325 
326 	  // Some encodings need a byte order marker as the first item
327 	  // in the byte stream, to designate endian-ness. The default
328 	  // value for the byte order marker is NULL, so if this is
329 	  // the case, it's not necessary and we can just go on our
330 	  // merry way.
331 	  int __int_bom = __state.internal_bom();
332 	  if (__int_bom)
333 	    {
334 	      size_t __size = __from_end - __from;
335 	      intern_type* __cfixed = static_cast<intern_type*>
336 		(__builtin_alloca(sizeof(intern_type) * (__size + 1)));
337 	      __cfixed[0] = static_cast<intern_type>(__int_bom);
338 	      char_traits<intern_type>::copy(__cfixed + 1, __from, __size);
339 	      __cfrom = reinterpret_cast<char*>(__cfixed);
340 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
341                                         &__fbytes, &__cto, &__tbytes);
342 	    }
343 	  else
344 	    {
345 	      intern_type* __cfixed = const_cast<intern_type*>(__from);
346 	      __cfrom = reinterpret_cast<char*>(__cfixed);
347 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom, &__fbytes,
348 				       &__cto, &__tbytes);
349 	    }
350 
351 	  if (__conv != size_t(-1))
352 	    {
353 	      __from_next = reinterpret_cast<const intern_type*>(__cfrom);
354 	      __to_next = reinterpret_cast<extern_type*>(__cto);
355 	      __ret = codecvt_base::ok;
356 	    }
357 	  else
358 	    {
359 	      if (__fbytes < __fmultiple * (__from_end - __from))
360 		{
361 		  __from_next = reinterpret_cast<const intern_type*>(__cfrom);
362 		  __to_next = reinterpret_cast<extern_type*>(__cto);
363 		  __ret = codecvt_base::partial;
364 		}
365 	      else
366 		__ret = codecvt_base::error;
367 	    }
368 	}
369       return __ret;
370     }
371 
372   template<typename _InternT, typename _ExternT>
373     codecvt_base::result
374     codecvt<_InternT, _ExternT, encoding_state>::
375     do_unshift(state_type& __state, extern_type* __to,
376 	       extern_type* __to_end, extern_type*& __to_next) const
377     {
378       result __ret = codecvt_base::error;
379       if (__state.good())
380 	{
381 	  const descriptor_type& __desc = __state.in_descriptor();
382 	  const size_t __tmultiple = sizeof(intern_type);
383 	  size_t __tlen = __tmultiple * (__to_end - __to);
384 
385 	  // Argument list for iconv specifies a byte sequence. Thus,
386 	  // all to/from arrays must be brutally casted to char*.
387 	  char* __cto = reinterpret_cast<char*>(__to);
388 	  size_t __conv = __iconv_adaptor(iconv,__desc, 0, 0,
389                                           &__cto, &__tlen);
390 
391 	  if (__conv != size_t(-1))
392 	    {
393 	      __to_next = reinterpret_cast<extern_type*>(__cto);
394 	      if (__tlen == __tmultiple * (__to_end - __to))
395 		__ret = codecvt_base::noconv;
396 	      else if (__tlen == 0)
397 		__ret = codecvt_base::ok;
398 	      else
399 		__ret = codecvt_base::partial;
400 	    }
401 	  else
402 	    __ret = codecvt_base::error;
403 	}
404       return __ret;
405     }
406 
407   template<typename _InternT, typename _ExternT>
408     codecvt_base::result
409     codecvt<_InternT, _ExternT, encoding_state>::
410     do_in(state_type& __state, const extern_type* __from,
411 	  const extern_type* __from_end, const extern_type*& __from_next,
412 	  intern_type* __to, intern_type* __to_end,
413 	  intern_type*& __to_next) const
414     {
415       result __ret = codecvt_base::error;
416       if (__state.good())
417 	{
418 	  const descriptor_type& __desc = __state.in_descriptor();
419 	  const size_t __fmultiple = sizeof(extern_type);
420 	  size_t __flen = __fmultiple * (__from_end - __from);
421 	  const size_t __tmultiple = sizeof(intern_type);
422 	  size_t __tlen = __tmultiple * (__to_end - __to);
423 
424 	  // Argument list for iconv specifies a byte sequence. Thus,
425 	  // all to/from arrays must be brutally casted to char*.
426 	  char* __cto = reinterpret_cast<char*>(__to);
427 	  char* __cfrom;
428 	  size_t __conv;
429 
430 	  // Some encodings need a byte order marker as the first item
431 	  // in the byte stream, to designate endian-ness. The default
432 	  // value for the byte order marker is NULL, so if this is
433 	  // the case, it's not necessary and we can just go on our
434 	  // merry way.
435 	  int __ext_bom = __state.external_bom();
436 	  if (__ext_bom)
437 	    {
438 	      size_t __size = __from_end - __from;
439 	      extern_type* __cfixed =  static_cast<extern_type*>
440 		(__builtin_alloca(sizeof(extern_type) * (__size + 1)));
441 	      __cfixed[0] = static_cast<extern_type>(__ext_bom);
442 	      char_traits<extern_type>::copy(__cfixed + 1, __from, __size);
443 	      __cfrom = reinterpret_cast<char*>(__cfixed);
444 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
445                                        &__flen, &__cto, &__tlen);
446 	    }
447 	  else
448 	    {
449 	      extern_type* __cfixed = const_cast<extern_type*>(__from);
450 	      __cfrom = reinterpret_cast<char*>(__cfixed);
451 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
452                                        &__flen, &__cto, &__tlen);
453 	    }
454 
455 
456 	  if (__conv != size_t(-1))
457 	    {
458 	      __from_next = reinterpret_cast<const extern_type*>(__cfrom);
459 	      __to_next = reinterpret_cast<intern_type*>(__cto);
460 	      __ret = codecvt_base::ok;
461 	    }
462 	  else
463 	    {
464 	      if (__flen < static_cast<size_t>(__from_end - __from))
465 		{
466 		  __from_next = reinterpret_cast<const extern_type*>(__cfrom);
467 		  __to_next = reinterpret_cast<intern_type*>(__cto);
468 		  __ret = codecvt_base::partial;
469 		}
470 	      else
471 		__ret = codecvt_base::error;
472 	    }
473 	}
474       return __ret;
475     }
476 
477   template<typename _InternT, typename _ExternT>
478     int
479     codecvt<_InternT, _ExternT, encoding_state>::
480     do_encoding() const throw()
481     {
482       int __ret = 0;
483       if (sizeof(_ExternT) <= sizeof(_InternT))
484 	__ret = sizeof(_InternT) / sizeof(_ExternT);
485       return __ret;
486     }
487 
488   template<typename _InternT, typename _ExternT>
489     bool
490     codecvt<_InternT, _ExternT, encoding_state>::
491     do_always_noconv() const throw()
492     { return false; }
493 
494   template<typename _InternT, typename _ExternT>
495     int
496     codecvt<_InternT, _ExternT, encoding_state>::
497     do_length(state_type&, const extern_type* __from,
498 	      const extern_type* __end, size_t __max) const
499     { return std::min(__max, static_cast<size_t>(__end - __from)); }
500 
501   // _GLIBCXX_RESOLVE_LIB_DEFECTS
502   // 74.  Garbled text for codecvt::do_max_length
503   template<typename _InternT, typename _ExternT>
504     int
505     codecvt<_InternT, _ExternT, encoding_state>::
506     do_max_length() const throw()
507     { return 1; }
508 
509 _GLIBCXX_END_NAMESPACE_VERSION
510 } // namespace
511 
512 #endif
513