1 // Locale support (codecvt) -*- C++ -*-
2 
3 // Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
4 // 2008, 2009, 2010
5 // Free Software Foundation, Inc.
6 //
7 // This file is part of the GNU ISO C++ Library.  This library is free
8 // software; you can redistribute it and/or modify it under the
9 // terms of the GNU General Public License as published by the
10 // Free Software Foundation; either version 3, or (at your option)
11 // any later version.
12 
13 // This library is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 // GNU General Public License for more details.
17 
18 // Under Section 7 of GPL version 3, you are granted additional
19 // permissions described in the GCC Runtime Library Exception, version
20 // 3.1, as published by the Free Software Foundation.
21 
22 // You should have received a copy of the GNU General Public License and
23 // a copy of the GCC Runtime Library Exception along with this program;
24 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
25 // <http://www.gnu.org/licenses/>.
26 
27 //
28 // ISO C++ 14882: 22.2.1.5 Template class codecvt
29 //
30 
31 // Written by Benjamin Kosnik <bkoz@redhat.com>
32 
33 /** @file ext/codecvt_specializations.h
34  *  This file is a GNU extension to the Standard C++ Library.
35  */
36 
37 #ifndef _EXT_CODECVT_SPECIALIZATIONS_H
38 #define _EXT_CODECVT_SPECIALIZATIONS_H 1
39 
40 #include <bits/c++config.h>
41 #include <locale>
42 #include <iconv.h>
43 
44 namespace __gnu_cxx _GLIBCXX_VISIBILITY(default)
45 {
46 _GLIBCXX_BEGIN_NAMESPACE_VERSION
47 
48   /// Extension to use iconv for dealing with character encodings.
49   // This includes conversions and comparisons between various character
50   // sets.  This object encapsulates data that may need to be shared between
51   // char_traits, codecvt and ctype.
52   class encoding_state
53   {
54   public:
55     // Types:
56     // NB: A conversion descriptor subsumes and enhances the
57     // functionality of a simple state type such as mbstate_t.
58     typedef iconv_t	descriptor_type;
59 
60   protected:
61     // Name of internal character set encoding.
62     std::string	       	_M_int_enc;
63 
64     // Name of external character set encoding.
65     std::string  	_M_ext_enc;
66 
67     // Conversion descriptor between external encoding to internal encoding.
68     descriptor_type	_M_in_desc;
69 
70     // Conversion descriptor between internal encoding to external encoding.
71     descriptor_type	_M_out_desc;
72 
73     // The byte-order marker for the external encoding, if necessary.
74     int			_M_ext_bom;
75 
76     // The byte-order marker for the internal encoding, if necessary.
77     int			_M_int_bom;
78 
79     // Number of external bytes needed to construct one complete
80     // character in the internal encoding.
81     // NB: -1 indicates variable, or stateful, encodings.
82     int 		_M_bytes;
83 
84   public:
85     explicit
86     encoding_state()
87     : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0)
88     { }
89 
90     explicit
91     encoding_state(const char* __int, const char* __ext,
92 		   int __ibom = 0, int __ebom = 0, int __bytes = 1)
93     : _M_int_enc(__int), _M_ext_enc(__ext), _M_in_desc(0), _M_out_desc(0),
94       _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes)
95     { init(); }
96 
97     // 21.1.2 traits typedefs
98     // p4
99     // typedef STATE_T state_type
100     // requires: state_type shall meet the requirements of
101     // CopyConstructible types (20.1.3)
102     // NB: This does not preserve the actual state of the conversion
103     // descriptor member, but it does duplicate the encoding
104     // information.
105     encoding_state(const encoding_state& __obj) : _M_in_desc(0), _M_out_desc(0)
106     { construct(__obj); }
107 
108     // Need assignment operator as well.
109     encoding_state&
110     operator=(const encoding_state& __obj)
111     {
112       construct(__obj);
113       return *this;
114     }
115 
116     ~encoding_state()
117     { destroy(); }
118 
119     bool
120     good() const throw()
121     {
122       const descriptor_type __err = (iconv_t)(-1);
123       bool __test = _M_in_desc && _M_in_desc != __err;
124       __test &=  _M_out_desc && _M_out_desc != __err;
125       return __test;
126     }
127 
128     int
129     character_ratio() const
130     { return _M_bytes; }
131 
132     const std::string
133     internal_encoding() const
134     { return _M_int_enc; }
135 
136     int
137     internal_bom() const
138     { return _M_int_bom; }
139 
140     const std::string
141     external_encoding() const
142     { return _M_ext_enc; }
143 
144     int
145     external_bom() const
146     { return _M_ext_bom; }
147 
148     const descriptor_type&
149     in_descriptor() const
150     { return _M_in_desc; }
151 
152     const descriptor_type&
153     out_descriptor() const
154     { return _M_out_desc; }
155 
156   protected:
157     void
158     init()
159     {
160       const descriptor_type __err = (iconv_t)(-1);
161       const bool __have_encodings = _M_int_enc.size() && _M_ext_enc.size();
162       if (!_M_in_desc && __have_encodings)
163 	{
164 	  _M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str());
165 	  if (_M_in_desc == __err)
166 	    std::__throw_runtime_error(__N("encoding_state::_M_init "
167 				    "creating iconv input descriptor failed"));
168 	}
169       if (!_M_out_desc && __have_encodings)
170 	{
171 	  _M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str());
172 	  if (_M_out_desc == __err)
173 	    std::__throw_runtime_error(__N("encoding_state::_M_init "
174 				  "creating iconv output descriptor failed"));
175 	}
176     }
177 
178     void
179     construct(const encoding_state& __obj)
180     {
181       destroy();
182       _M_int_enc = __obj._M_int_enc;
183       _M_ext_enc = __obj._M_ext_enc;
184       _M_ext_bom = __obj._M_ext_bom;
185       _M_int_bom = __obj._M_int_bom;
186       _M_bytes = __obj._M_bytes;
187       init();
188     }
189 
190     void
191     destroy() throw()
192     {
193       const descriptor_type __err = (iconv_t)(-1);
194       if (_M_in_desc && _M_in_desc != __err)
195 	{
196 	  iconv_close(_M_in_desc);
197 	  _M_in_desc = 0;
198 	}
199       if (_M_out_desc && _M_out_desc != __err)
200 	{
201 	  iconv_close(_M_out_desc);
202 	  _M_out_desc = 0;
203 	}
204     }
205   };
206 
207   /// encoding_char_traits
208   // Custom traits type with encoding_state for the state type, and the
209   // associated fpos<encoding_state> for the position type, all other
210   // bits equivalent to the required char_traits instantiations.
211   template<typename _CharT>
212     struct encoding_char_traits : public std::char_traits<_CharT>
213     {
214       typedef encoding_state				state_type;
215       typedef typename std::fpos<state_type>		pos_type;
216     };
217 
218 _GLIBCXX_END_NAMESPACE_VERSION
219 } // namespace
220 
221 
222 namespace std _GLIBCXX_VISIBILITY(default)
223 {
224 _GLIBCXX_BEGIN_NAMESPACE_VERSION
225 
226   using __gnu_cxx::encoding_state;
227 
228   /// codecvt<InternT, _ExternT, encoding_state> specialization.
229   // This partial specialization takes advantage of iconv to provide
230   // code conversions between a large number of character encodings.
231   template<typename _InternT, typename _ExternT>
232     class codecvt<_InternT, _ExternT, encoding_state>
233     : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state>
234     {
235     public:
236       // Types:
237       typedef codecvt_base::result			result;
238       typedef _InternT 					intern_type;
239       typedef _ExternT 					extern_type;
240       typedef __gnu_cxx::encoding_state 		state_type;
241       typedef state_type::descriptor_type 		descriptor_type;
242 
243       // Data Members:
244       static locale::id 		id;
245 
246       explicit
247       codecvt(size_t __refs = 0)
248       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
249       { }
250 
251       explicit
252       codecvt(state_type& __enc, size_t __refs = 0)
253       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
254       { }
255 
256      protected:
257       virtual
258       ~codecvt() { }
259 
260       virtual result
261       do_out(state_type& __state, const intern_type* __from,
262 	     const intern_type* __from_end, const intern_type*& __from_next,
263 	     extern_type* __to, extern_type* __to_end,
264 	     extern_type*& __to_next) const;
265 
266       virtual result
267       do_unshift(state_type& __state, extern_type* __to,
268 		 extern_type* __to_end, extern_type*& __to_next) const;
269 
270       virtual result
271       do_in(state_type& __state, const extern_type* __from,
272 	    const extern_type* __from_end, const extern_type*& __from_next,
273 	    intern_type* __to, intern_type* __to_end,
274 	    intern_type*& __to_next) const;
275 
276       virtual int
277       do_encoding() const throw();
278 
279       virtual bool
280       do_always_noconv() const throw();
281 
282       virtual int
283       do_length(state_type&, const extern_type* __from,
284 		const extern_type* __end, size_t __max) const;
285 
286       virtual int
287       do_max_length() const throw();
288     };
289 
  // Out-of-class definition of the static locale::id member declared in
  // the codecvt<_InternT, _ExternT, encoding_state> specialization.
  template<typename _InternT, typename _ExternT>
    locale::id
    codecvt<_InternT, _ExternT, encoding_state>::id;
293 
294   // This adaptor works around the signature problems of the second
295   // argument to iconv():  SUSv2 and others use 'const char**', but glibc 2.2
296   // uses 'char**', which matches the POSIX 1003.1-2001 standard.
297   // Using this adaptor, g++ will do the work for us.
298   template<typename _Tp>
299     inline size_t
300     __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*),
301                     iconv_t __cd, char** __inbuf, size_t* __inbytes,
302                     char** __outbuf, size_t* __outbytes)
303     { return __func(__cd, (_Tp)__inbuf, __inbytes, __outbuf, __outbytes); }
304 
305   template<typename _InternT, typename _ExternT>
306     codecvt_base::result
307     codecvt<_InternT, _ExternT, encoding_state>::
308     do_out(state_type& __state, const intern_type* __from,
309 	   const intern_type* __from_end, const intern_type*& __from_next,
310 	   extern_type* __to, extern_type* __to_end,
311 	   extern_type*& __to_next) const
312     {
313       result __ret = codecvt_base::error;
314       if (__state.good())
315 	{
316 	  const descriptor_type& __desc = __state.out_descriptor();
317 	  const size_t __fmultiple = sizeof(intern_type);
318 	  size_t __fbytes = __fmultiple * (__from_end - __from);
319 	  const size_t __tmultiple = sizeof(extern_type);
320 	  size_t __tbytes = __tmultiple * (__to_end - __to);
321 
322 	  // Argument list for iconv specifies a byte sequence. Thus,
323 	  // all to/from arrays must be brutally casted to char*.
324 	  char* __cto = reinterpret_cast<char*>(__to);
325 	  char* __cfrom;
326 	  size_t __conv;
327 
328 	  // Some encodings need a byte order marker as the first item
329 	  // in the byte stream, to designate endian-ness. The default
330 	  // value for the byte order marker is NULL, so if this is
331 	  // the case, it's not necessary and we can just go on our
332 	  // merry way.
333 	  int __int_bom = __state.internal_bom();
334 	  if (__int_bom)
335 	    {
336 	      size_t __size = __from_end - __from;
337 	      intern_type* __cfixed = static_cast<intern_type*>
338 		(__builtin_alloca(sizeof(intern_type) * (__size + 1)));
339 	      __cfixed[0] = static_cast<intern_type>(__int_bom);
340 	      char_traits<intern_type>::copy(__cfixed + 1, __from, __size);
341 	      __cfrom = reinterpret_cast<char*>(__cfixed);
342 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
343                                         &__fbytes, &__cto, &__tbytes);
344 	    }
345 	  else
346 	    {
347 	      intern_type* __cfixed = const_cast<intern_type*>(__from);
348 	      __cfrom = reinterpret_cast<char*>(__cfixed);
349 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom, &__fbytes,
350 				       &__cto, &__tbytes);
351 	    }
352 
353 	  if (__conv != size_t(-1))
354 	    {
355 	      __from_next = reinterpret_cast<const intern_type*>(__cfrom);
356 	      __to_next = reinterpret_cast<extern_type*>(__cto);
357 	      __ret = codecvt_base::ok;
358 	    }
359 	  else
360 	    {
361 	      if (__fbytes < __fmultiple * (__from_end - __from))
362 		{
363 		  __from_next = reinterpret_cast<const intern_type*>(__cfrom);
364 		  __to_next = reinterpret_cast<extern_type*>(__cto);
365 		  __ret = codecvt_base::partial;
366 		}
367 	      else
368 		__ret = codecvt_base::error;
369 	    }
370 	}
371       return __ret;
372     }
373 
374   template<typename _InternT, typename _ExternT>
375     codecvt_base::result
376     codecvt<_InternT, _ExternT, encoding_state>::
377     do_unshift(state_type& __state, extern_type* __to,
378 	       extern_type* __to_end, extern_type*& __to_next) const
379     {
380       result __ret = codecvt_base::error;
381       if (__state.good())
382 	{
383 	  const descriptor_type& __desc = __state.in_descriptor();
384 	  const size_t __tmultiple = sizeof(intern_type);
385 	  size_t __tlen = __tmultiple * (__to_end - __to);
386 
387 	  // Argument list for iconv specifies a byte sequence. Thus,
388 	  // all to/from arrays must be brutally casted to char*.
389 	  char* __cto = reinterpret_cast<char*>(__to);
390 	  size_t __conv = __iconv_adaptor(iconv,__desc, 0, 0,
391                                           &__cto, &__tlen);
392 
393 	  if (__conv != size_t(-1))
394 	    {
395 	      __to_next = reinterpret_cast<extern_type*>(__cto);
396 	      if (__tlen == __tmultiple * (__to_end - __to))
397 		__ret = codecvt_base::noconv;
398 	      else if (__tlen == 0)
399 		__ret = codecvt_base::ok;
400 	      else
401 		__ret = codecvt_base::partial;
402 	    }
403 	  else
404 	    __ret = codecvt_base::error;
405 	}
406       return __ret;
407     }
408 
409   template<typename _InternT, typename _ExternT>
410     codecvt_base::result
411     codecvt<_InternT, _ExternT, encoding_state>::
412     do_in(state_type& __state, const extern_type* __from,
413 	  const extern_type* __from_end, const extern_type*& __from_next,
414 	  intern_type* __to, intern_type* __to_end,
415 	  intern_type*& __to_next) const
416     {
417       result __ret = codecvt_base::error;
418       if (__state.good())
419 	{
420 	  const descriptor_type& __desc = __state.in_descriptor();
421 	  const size_t __fmultiple = sizeof(extern_type);
422 	  size_t __flen = __fmultiple * (__from_end - __from);
423 	  const size_t __tmultiple = sizeof(intern_type);
424 	  size_t __tlen = __tmultiple * (__to_end - __to);
425 
426 	  // Argument list for iconv specifies a byte sequence. Thus,
427 	  // all to/from arrays must be brutally casted to char*.
428 	  char* __cto = reinterpret_cast<char*>(__to);
429 	  char* __cfrom;
430 	  size_t __conv;
431 
432 	  // Some encodings need a byte order marker as the first item
433 	  // in the byte stream, to designate endian-ness. The default
434 	  // value for the byte order marker is NULL, so if this is
435 	  // the case, it's not necessary and we can just go on our
436 	  // merry way.
437 	  int __ext_bom = __state.external_bom();
438 	  if (__ext_bom)
439 	    {
440 	      size_t __size = __from_end - __from;
441 	      extern_type* __cfixed =  static_cast<extern_type*>
442 		(__builtin_alloca(sizeof(extern_type) * (__size + 1)));
443 	      __cfixed[0] = static_cast<extern_type>(__ext_bom);
444 	      char_traits<extern_type>::copy(__cfixed + 1, __from, __size);
445 	      __cfrom = reinterpret_cast<char*>(__cfixed);
446 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
447                                        &__flen, &__cto, &__tlen);
448 	    }
449 	  else
450 	    {
451 	      extern_type* __cfixed = const_cast<extern_type*>(__from);
452 	      __cfrom = reinterpret_cast<char*>(__cfixed);
453 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
454                                        &__flen, &__cto, &__tlen);
455 	    }
456 
457 
458 	  if (__conv != size_t(-1))
459 	    {
460 	      __from_next = reinterpret_cast<const extern_type*>(__cfrom);
461 	      __to_next = reinterpret_cast<intern_type*>(__cto);
462 	      __ret = codecvt_base::ok;
463 	    }
464 	  else
465 	    {
466 	      if (__flen < static_cast<size_t>(__from_end - __from))
467 		{
468 		  __from_next = reinterpret_cast<const extern_type*>(__cfrom);
469 		  __to_next = reinterpret_cast<intern_type*>(__cto);
470 		  __ret = codecvt_base::partial;
471 		}
472 	      else
473 		__ret = codecvt_base::error;
474 	    }
475 	}
476       return __ret;
477     }
478 
479   template<typename _InternT, typename _ExternT>
480     int
481     codecvt<_InternT, _ExternT, encoding_state>::
482     do_encoding() const throw()
483     {
484       int __ret = 0;
485       if (sizeof(_ExternT) <= sizeof(_InternT))
486 	__ret = sizeof(_InternT) / sizeof(_ExternT);
487       return __ret;
488     }
489 
490   template<typename _InternT, typename _ExternT>
491     bool
492     codecvt<_InternT, _ExternT, encoding_state>::
493     do_always_noconv() const throw()
494     { return false; }
495 
496   template<typename _InternT, typename _ExternT>
497     int
498     codecvt<_InternT, _ExternT, encoding_state>::
499     do_length(state_type&, const extern_type* __from,
500 	      const extern_type* __end, size_t __max) const
501     { return std::min(__max, static_cast<size_t>(__end - __from)); }
502 
503   // _GLIBCXX_RESOLVE_LIB_DEFECTS
504   // 74.  Garbled text for codecvt::do_max_length
505   template<typename _InternT, typename _ExternT>
506     int
507     codecvt<_InternT, _ExternT, encoding_state>::
508     do_max_length() const throw()
509     { return 1; }
510 
511 _GLIBCXX_END_NAMESPACE_VERSION
512 } // namespace
513 
514 #endif
515