1 // Locale support (codecvt) -*- C++ -*-
2 
3 // Copyright (C) 2000-2018 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library.  This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 //
26 // ISO C++ 14882: 22.2.1.5 Template class codecvt
27 //
28 
29 // Written by Benjamin Kosnik <bkoz@redhat.com>
30 
31 /** @file ext/codecvt_specializations.h
32  *  This file is a GNU extension to the Standard C++ Library.
33  */
34 
35 #ifndef _EXT_CODECVT_SPECIALIZATIONS_H
36 #define _EXT_CODECVT_SPECIALIZATIONS_H 1
37 
38 #include <bits/c++config.h>
39 #include <locale>
40 #include <iconv.h>
41 
42 namespace __gnu_cxx _GLIBCXX_VISIBILITY(default)
43 {
44 _GLIBCXX_BEGIN_NAMESPACE_VERSION
45 _GLIBCXX_BEGIN_NAMESPACE_CXX11
46 
47   /// Extension to use iconv for dealing with character encodings.
48   // This includes conversions and comparisons between various character
49   // sets.  This object encapsulates data that may need to be shared between
50   // char_traits, codecvt and ctype.
51   class encoding_state
52   {
53   public:
54     // Types:
55     // NB: A conversion descriptor subsumes and enhances the
56     // functionality of a simple state type such as mbstate_t.
57     typedef iconv_t	descriptor_type;
58 
59   protected:
60     // Name of internal character set encoding.
61     std::string	       	_M_int_enc;
62 
63     // Name of external character set encoding.
64     std::string  	_M_ext_enc;
65 
66     // Conversion descriptor between external encoding to internal encoding.
67     descriptor_type	_M_in_desc;
68 
69     // Conversion descriptor between internal encoding to external encoding.
70     descriptor_type	_M_out_desc;
71 
72     // The byte-order marker for the external encoding, if necessary.
73     int			_M_ext_bom;
74 
75     // The byte-order marker for the internal encoding, if necessary.
76     int			_M_int_bom;
77 
78     // Number of external bytes needed to construct one complete
79     // character in the internal encoding.
80     // NB: -1 indicates variable, or stateful, encodings.
81     int 		_M_bytes;
82 
83   public:
84     explicit
85     encoding_state()
86     : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0)
87     { }
88 
89     explicit
90     encoding_state(const char* __int, const char* __ext,
91 		   int __ibom = 0, int __ebom = 0, int __bytes = 1)
92     : _M_int_enc(__int), _M_ext_enc(__ext), _M_in_desc(0), _M_out_desc(0),
93       _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes)
94     { init(); }
95 
96     // 21.1.2 traits typedefs
97     // p4
98     // typedef STATE_T state_type
99     // requires: state_type shall meet the requirements of
100     // CopyConstructible types (20.1.3)
101     // NB: This does not preserve the actual state of the conversion
102     // descriptor member, but it does duplicate the encoding
103     // information.
104     encoding_state(const encoding_state& __obj) : _M_in_desc(0), _M_out_desc(0)
105     { construct(__obj); }
106 
107     // Need assignment operator as well.
108     encoding_state&
109     operator=(const encoding_state& __obj)
110     {
111       construct(__obj);
112       return *this;
113     }
114 
115     ~encoding_state()
116     { destroy(); }
117 
118     bool
119     good() const throw()
120     {
121       const descriptor_type __err = (iconv_t)(-1);
122       bool __test = _M_in_desc && _M_in_desc != __err;
123       __test &=  _M_out_desc && _M_out_desc != __err;
124       return __test;
125     }
126 
127     int
128     character_ratio() const
129     { return _M_bytes; }
130 
131     const std::string
132     internal_encoding() const
133     { return _M_int_enc; }
134 
135     int
136     internal_bom() const
137     { return _M_int_bom; }
138 
139     const std::string
140     external_encoding() const
141     { return _M_ext_enc; }
142 
143     int
144     external_bom() const
145     { return _M_ext_bom; }
146 
147     const descriptor_type&
148     in_descriptor() const
149     { return _M_in_desc; }
150 
151     const descriptor_type&
152     out_descriptor() const
153     { return _M_out_desc; }
154 
155   protected:
156     void
157     init()
158     {
159       const descriptor_type __err = (iconv_t)(-1);
160       const bool __have_encodings = _M_int_enc.size() && _M_ext_enc.size();
161       if (!_M_in_desc && __have_encodings)
162 	{
163 	  _M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str());
164 	  if (_M_in_desc == __err)
165 	    std::__throw_runtime_error(__N("encoding_state::_M_init "
166 				    "creating iconv input descriptor failed"));
167 	}
168       if (!_M_out_desc && __have_encodings)
169 	{
170 	  _M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str());
171 	  if (_M_out_desc == __err)
172 	    std::__throw_runtime_error(__N("encoding_state::_M_init "
173 				  "creating iconv output descriptor failed"));
174 	}
175     }
176 
177     void
178     construct(const encoding_state& __obj)
179     {
180       destroy();
181       _M_int_enc = __obj._M_int_enc;
182       _M_ext_enc = __obj._M_ext_enc;
183       _M_ext_bom = __obj._M_ext_bom;
184       _M_int_bom = __obj._M_int_bom;
185       _M_bytes = __obj._M_bytes;
186       init();
187     }
188 
189     void
190     destroy() throw()
191     {
192       const descriptor_type __err = (iconv_t)(-1);
193       if (_M_in_desc && _M_in_desc != __err)
194 	{
195 	  iconv_close(_M_in_desc);
196 	  _M_in_desc = 0;
197 	}
198       if (_M_out_desc && _M_out_desc != __err)
199 	{
200 	  iconv_close(_M_out_desc);
201 	  _M_out_desc = 0;
202 	}
203     }
204   };
205 
206   /// encoding_char_traits
207   // Custom traits type with encoding_state for the state type, and the
208   // associated fpos<encoding_state> for the position type, all other
209   // bits equivalent to the required char_traits instantiations.
210   template<typename _CharT>
211     struct encoding_char_traits
212     : public std::char_traits<_CharT>
213     {
214       typedef encoding_state				state_type;
215       typedef typename std::fpos<state_type>		pos_type;
216     };
217 
218 _GLIBCXX_END_NAMESPACE_CXX11
219 _GLIBCXX_END_NAMESPACE_VERSION
220 } // namespace
221 
222 
223 namespace std _GLIBCXX_VISIBILITY(default)
224 {
225 _GLIBCXX_BEGIN_NAMESPACE_VERSION
226 
227   using __gnu_cxx::encoding_state;
228 
229   /// codecvt<InternT, _ExternT, encoding_state> specialization.
230   // This partial specialization takes advantage of iconv to provide
231   // code conversions between a large number of character encodings.
232   template<typename _InternT, typename _ExternT>
233     class codecvt<_InternT, _ExternT, encoding_state>
234     : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state>
235     {
236     public:
237       // Types:
238       typedef codecvt_base::result			result;
239       typedef _InternT 					intern_type;
240       typedef _ExternT 					extern_type;
241       typedef __gnu_cxx::encoding_state 		state_type;
242       typedef state_type::descriptor_type 		descriptor_type;
243 
244       // Data Members:
245       static locale::id 		id;
246 
247       explicit
248       codecvt(size_t __refs = 0)
249       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
250       { }
251 
252       explicit
253       codecvt(state_type& __enc, size_t __refs = 0)
254       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
255       { }
256 
257      protected:
258       virtual
259       ~codecvt() { }
260 
261       virtual result
262       do_out(state_type& __state, const intern_type* __from,
263 	     const intern_type* __from_end, const intern_type*& __from_next,
264 	     extern_type* __to, extern_type* __to_end,
265 	     extern_type*& __to_next) const;
266 
267       virtual result
268       do_unshift(state_type& __state, extern_type* __to,
269 		 extern_type* __to_end, extern_type*& __to_next) const;
270 
271       virtual result
272       do_in(state_type& __state, const extern_type* __from,
273 	    const extern_type* __from_end, const extern_type*& __from_next,
274 	    intern_type* __to, intern_type* __to_end,
275 	    intern_type*& __to_next) const;
276 
277       virtual int
278       do_encoding() const throw();
279 
280       virtual bool
281       do_always_noconv() const throw();
282 
283       virtual int
284       do_length(state_type&, const extern_type* __from,
285 		const extern_type* __end, size_t __max) const;
286 
287       virtual int
288       do_max_length() const throw();
289     };
290 
291   template<typename _InternT, typename _ExternT>
292     locale::id
293     codecvt<_InternT, _ExternT, encoding_state>::id;
294 
295   // This adaptor works around the signature problems of the second
296   // argument to iconv():  SUSv2 and others use 'const char**', but glibc 2.2
297   // uses 'char**', which matches the POSIX 1003.1-2001 standard.
298   // Using this adaptor, g++ will do the work for us.
299   template<typename _Tp>
300     inline size_t
301     __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*),
302                     iconv_t __cd, char** __inbuf, size_t* __inbytes,
303                     char** __outbuf, size_t* __outbytes)
304     { return __func(__cd, (_Tp)__inbuf, __inbytes, __outbuf, __outbytes); }
305 
306   template<typename _InternT, typename _ExternT>
307     codecvt_base::result
308     codecvt<_InternT, _ExternT, encoding_state>::
309     do_out(state_type& __state, const intern_type* __from,
310 	   const intern_type* __from_end, const intern_type*& __from_next,
311 	   extern_type* __to, extern_type* __to_end,
312 	   extern_type*& __to_next) const
313     {
314       result __ret = codecvt_base::error;
315       if (__state.good())
316 	{
317 	  const descriptor_type& __desc = __state.out_descriptor();
318 	  const size_t __fmultiple = sizeof(intern_type);
319 	  size_t __fbytes = __fmultiple * (__from_end - __from);
320 	  const size_t __tmultiple = sizeof(extern_type);
321 	  size_t __tbytes = __tmultiple * (__to_end - __to);
322 
323 	  // Argument list for iconv specifies a byte sequence. Thus,
324 	  // all to/from arrays must be brutally casted to char*.
325 	  char* __cto = reinterpret_cast<char*>(__to);
326 	  char* __cfrom;
327 	  size_t __conv;
328 
329 	  // Some encodings need a byte order marker as the first item
330 	  // in the byte stream, to designate endian-ness. The default
331 	  // value for the byte order marker is NULL, so if this is
332 	  // the case, it's not necessary and we can just go on our
333 	  // merry way.
334 	  int __int_bom = __state.internal_bom();
335 	  if (__int_bom)
336 	    {
337 	      size_t __size = __from_end - __from;
338 	      intern_type* __cfixed = static_cast<intern_type*>
339 		(__builtin_alloca(sizeof(intern_type) * (__size + 1)));
340 	      __cfixed[0] = static_cast<intern_type>(__int_bom);
341 	      char_traits<intern_type>::copy(__cfixed + 1, __from, __size);
342 	      __cfrom = reinterpret_cast<char*>(__cfixed);
343 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
344                                         &__fbytes, &__cto, &__tbytes);
345 	    }
346 	  else
347 	    {
348 	      intern_type* __cfixed = const_cast<intern_type*>(__from);
349 	      __cfrom = reinterpret_cast<char*>(__cfixed);
350 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom, &__fbytes,
351 				       &__cto, &__tbytes);
352 	    }
353 
354 	  if (__conv != size_t(-1))
355 	    {
356 	      __from_next = reinterpret_cast<const intern_type*>(__cfrom);
357 	      __to_next = reinterpret_cast<extern_type*>(__cto);
358 	      __ret = codecvt_base::ok;
359 	    }
360 	  else
361 	    {
362 	      if (__fbytes < __fmultiple * (__from_end - __from))
363 		{
364 		  __from_next = reinterpret_cast<const intern_type*>(__cfrom);
365 		  __to_next = reinterpret_cast<extern_type*>(__cto);
366 		  __ret = codecvt_base::partial;
367 		}
368 	      else
369 		__ret = codecvt_base::error;
370 	    }
371 	}
372       return __ret;
373     }
374 
375   template<typename _InternT, typename _ExternT>
376     codecvt_base::result
377     codecvt<_InternT, _ExternT, encoding_state>::
378     do_unshift(state_type& __state, extern_type* __to,
379 	       extern_type* __to_end, extern_type*& __to_next) const
380     {
381       result __ret = codecvt_base::error;
382       if (__state.good())
383 	{
384 	  const descriptor_type& __desc = __state.in_descriptor();
385 	  const size_t __tmultiple = sizeof(intern_type);
386 	  size_t __tlen = __tmultiple * (__to_end - __to);
387 
388 	  // Argument list for iconv specifies a byte sequence. Thus,
389 	  // all to/from arrays must be brutally casted to char*.
390 	  char* __cto = reinterpret_cast<char*>(__to);
391 	  size_t __conv = __iconv_adaptor(iconv,__desc, 0, 0,
392                                           &__cto, &__tlen);
393 
394 	  if (__conv != size_t(-1))
395 	    {
396 	      __to_next = reinterpret_cast<extern_type*>(__cto);
397 	      if (__tlen == __tmultiple * (__to_end - __to))
398 		__ret = codecvt_base::noconv;
399 	      else if (__tlen == 0)
400 		__ret = codecvt_base::ok;
401 	      else
402 		__ret = codecvt_base::partial;
403 	    }
404 	  else
405 	    __ret = codecvt_base::error;
406 	}
407       return __ret;
408     }
409 
410   template<typename _InternT, typename _ExternT>
411     codecvt_base::result
412     codecvt<_InternT, _ExternT, encoding_state>::
413     do_in(state_type& __state, const extern_type* __from,
414 	  const extern_type* __from_end, const extern_type*& __from_next,
415 	  intern_type* __to, intern_type* __to_end,
416 	  intern_type*& __to_next) const
417     {
418       result __ret = codecvt_base::error;
419       if (__state.good())
420 	{
421 	  const descriptor_type& __desc = __state.in_descriptor();
422 	  const size_t __fmultiple = sizeof(extern_type);
423 	  size_t __flen = __fmultiple * (__from_end - __from);
424 	  const size_t __tmultiple = sizeof(intern_type);
425 	  size_t __tlen = __tmultiple * (__to_end - __to);
426 
427 	  // Argument list for iconv specifies a byte sequence. Thus,
428 	  // all to/from arrays must be brutally casted to char*.
429 	  char* __cto = reinterpret_cast<char*>(__to);
430 	  char* __cfrom;
431 	  size_t __conv;
432 
433 	  // Some encodings need a byte order marker as the first item
434 	  // in the byte stream, to designate endian-ness. The default
435 	  // value for the byte order marker is NULL, so if this is
436 	  // the case, it's not necessary and we can just go on our
437 	  // merry way.
438 	  int __ext_bom = __state.external_bom();
439 	  if (__ext_bom)
440 	    {
441 	      size_t __size = __from_end - __from;
442 	      extern_type* __cfixed =  static_cast<extern_type*>
443 		(__builtin_alloca(sizeof(extern_type) * (__size + 1)));
444 	      __cfixed[0] = static_cast<extern_type>(__ext_bom);
445 	      char_traits<extern_type>::copy(__cfixed + 1, __from, __size);
446 	      __cfrom = reinterpret_cast<char*>(__cfixed);
447 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
448                                        &__flen, &__cto, &__tlen);
449 	    }
450 	  else
451 	    {
452 	      extern_type* __cfixed = const_cast<extern_type*>(__from);
453 	      __cfrom = reinterpret_cast<char*>(__cfixed);
454 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
455                                        &__flen, &__cto, &__tlen);
456 	    }
457 
458 
459 	  if (__conv != size_t(-1))
460 	    {
461 	      __from_next = reinterpret_cast<const extern_type*>(__cfrom);
462 	      __to_next = reinterpret_cast<intern_type*>(__cto);
463 	      __ret = codecvt_base::ok;
464 	    }
465 	  else
466 	    {
467 	      if (__flen < static_cast<size_t>(__from_end - __from))
468 		{
469 		  __from_next = reinterpret_cast<const extern_type*>(__cfrom);
470 		  __to_next = reinterpret_cast<intern_type*>(__cto);
471 		  __ret = codecvt_base::partial;
472 		}
473 	      else
474 		__ret = codecvt_base::error;
475 	    }
476 	}
477       return __ret;
478     }
479 
480   template<typename _InternT, typename _ExternT>
481     int
482     codecvt<_InternT, _ExternT, encoding_state>::
483     do_encoding() const throw()
484     {
485       int __ret = 0;
486       if (sizeof(_ExternT) <= sizeof(_InternT))
487 	__ret = sizeof(_InternT) / sizeof(_ExternT);
488       return __ret;
489     }
490 
491   template<typename _InternT, typename _ExternT>
492     bool
493     codecvt<_InternT, _ExternT, encoding_state>::
494     do_always_noconv() const throw()
495     { return false; }
496 
497   template<typename _InternT, typename _ExternT>
498     int
499     codecvt<_InternT, _ExternT, encoding_state>::
500     do_length(state_type&, const extern_type* __from,
501 	      const extern_type* __end, size_t __max) const
502     { return std::min(__max, static_cast<size_t>(__end - __from)); }
503 
504   // _GLIBCXX_RESOLVE_LIB_DEFECTS
505   // 74.  Garbled text for codecvt::do_max_length
506   template<typename _InternT, typename _ExternT>
507     int
508     codecvt<_InternT, _ExternT, encoding_state>::
509     do_max_length() const throw()
510     { return 1; }
511 
512 _GLIBCXX_END_NAMESPACE_VERSION
513 } // namespace
514 
515 #endif
516