1 //
2 //  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3 //
4 //  Distributed under the Boost Software License, Version 1.0. (See
5 //  accompanying file LICENSE_1_0.txt or copy at
6 //  http://www.boost.org/LICENSE_1_0.txt)
7 //
8 #define BOOST_LOCALE_SOURCE
9 #include <boost/locale/generator.hpp>
10 #include <boost/locale/encoding.hpp>
11 #include <boost/locale/utf8_codecvt.hpp>
12 
13 #include "../encoding/conv.hpp"
14 
15 #include <boost/locale/util.hpp>
16 
17 #ifdef BOOST_MSVC
18 #  pragma warning(disable : 4244 4996) // loose data
19 #endif
20 
21 #include <cstddef>
22 #include <string.h>
23 #include <vector>
24 #include <algorithm>
25 
26 //#define DEBUG_CODECVT
27 
28 #ifdef DEBUG_CODECVT
29 #include <iostream>
30 #endif
31 
32 namespace boost {
33 namespace locale {
34 namespace util {
35 
36     class utf8_converter  : public base_converter {
37     public:
max_len() const38         virtual int max_len() const
39         {
40             return 4;
41         }
42 
clone() const43         virtual utf8_converter *clone() const
44         {
45             return new utf8_converter();
46         }
47 
is_thread_safe() const48         bool is_thread_safe() const
49         {
50             return true;
51         }
52 
to_unicode(char const * & begin,char const * end)53         virtual uint32_t to_unicode(char const *&begin,char const *end)
54         {
55             char const *p=begin;
56 
57             utf::code_point c = utf::utf_traits<char>::decode(p,end);
58 
59             if(c==utf::illegal)
60                 return illegal;
61 
62             if(c==utf::incomplete)
63                 return incomplete;
64 
65             begin = p;
66             return c;
67         }
68 
from_unicode(uint32_t u,char * begin,char const * end)69         virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end)
70         {
71             if(!utf::is_valid_codepoint(u))
72                 return illegal;
73             int width = utf::utf_traits<char>::width(u);
74             std::ptrdiff_t d=end-begin;
75             if(d < width)
76                 return incomplete;
77             utf::utf_traits<char>::encode(u,begin);
78             return width;
79         }
80     }; // utf8_converter
81 
82     class simple_converter_impl {
83     public:
84 
85         static const int hash_table_size = 1024;
86 
simple_converter_impl(std::string const & encoding)87         simple_converter_impl(std::string const &encoding)
88         {
89             for(unsigned i=0;i<128;i++)
90                 to_unicode_tbl_[i]=i;
91             for(unsigned i=128;i<256;i++) {
92                 char buf[2] = { char(i) , 0 };
93                 uint32_t uchar=utf::illegal;
94                 try {
95                     std::wstring const tmp = conv::to_utf<wchar_t>(buf,buf+1,encoding,conv::stop);
96                     if(tmp.size() == 1) {
97                         uchar = tmp[0];
98                     }
99                     else {
100                         uchar = utf::illegal;
101                     }
102                 }
103                 catch(conv::conversion_error const &/*e*/) {
104                     uchar = utf::illegal;
105                 }
106                 to_unicode_tbl_[i]=uchar;
107             }
108             for(int i=0;i<hash_table_size;i++)
109                 from_unicode_tbl_[i]=0;
110             for(unsigned i=1;i<256;i++) {
111                 if(to_unicode_tbl_[i]!=utf::illegal) {
112                     unsigned pos = to_unicode_tbl_[i] % hash_table_size;
113                     while(from_unicode_tbl_[pos]!=0)
114                         pos = (pos + 1) % hash_table_size;
115                     from_unicode_tbl_[pos] = i;
116                 }
117             }
118         }
119 
to_unicode(char const * & begin,char const * end) const120         uint32_t to_unicode(char const *&begin,char const *end) const
121         {
122             if(begin==end)
123                 return utf::incomplete;
124             unsigned char c = *begin++;
125             return to_unicode_tbl_[c];
126         }
from_unicode(uint32_t u,char * begin,char const * end) const127         uint32_t from_unicode(uint32_t u,char *begin,char const *end) const
128         {
129             if(begin==end)
130                 return utf::incomplete;
131             if(u==0) {
132                 *begin = 0;
133                 return 1;
134             }
135             unsigned pos = u % hash_table_size;
136             unsigned char c;
137             while((c=from_unicode_tbl_[pos])!=0 && to_unicode_tbl_[c]!=u)
138                 pos = (pos + 1) % hash_table_size;
139             if(c==0)
140                return utf::illegal;
141             *begin = c;
142             return 1;
143         }
144     private:
145         uint32_t to_unicode_tbl_[256];
146         unsigned char from_unicode_tbl_[hash_table_size];
147     };
148 
149     class simple_converter : public base_converter {
150     public:
151 
~simple_converter()152         virtual ~simple_converter()
153         {
154         }
155 
simple_converter(std::string const & encoding)156         simple_converter(std::string const &encoding) :
157             cvt_(encoding)
158         {
159         }
160 
max_len() const161         virtual int max_len() const
162         {
163             return 1;
164         }
165 
is_thread_safe() const166         virtual bool is_thread_safe() const
167         {
168             return true;
169         }
clone() const170         virtual base_converter *clone() const
171         {
172            return new simple_converter(*this);
173         }
174 
to_unicode(char const * & begin,char const * end)175         virtual uint32_t to_unicode(char const *&begin,char const *end)
176         {
177             return cvt_.to_unicode(begin,end);
178         }
from_unicode(uint32_t u,char * begin,char const * end)179         virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end)
180         {
181             return cvt_.from_unicode(u,begin,end);
182         }
183     private:
184         simple_converter_impl cvt_;
185     };
186 
187     template<typename CharType>
188     class simple_codecvt : public generic_codecvt<CharType,simple_codecvt<CharType> >
189     {
190     public:
191 
simple_codecvt(std::string const & encoding,size_t refs=0)192         simple_codecvt(std::string const &encoding,size_t refs = 0) :
193             generic_codecvt<CharType,simple_codecvt<CharType> >(refs),
194             cvt_(encoding)
195         {
196         }
197 
198         struct state_type {};
initial_state(generic_codecvt_base::initial_convertion_state)199         static state_type initial_state(generic_codecvt_base::initial_convertion_state /* unused */)
200         {
201             return state_type();
202         }
max_encoding_length()203         static int max_encoding_length()
204         {
205             return 1;
206         }
207 
to_unicode(state_type &,char const * & begin,char const * end) const208         utf::code_point to_unicode(state_type &,char const *&begin,char const *end) const
209         {
210             return cvt_.to_unicode(begin,end);
211         }
212 
from_unicode(state_type &,utf::code_point u,char * begin,char const * end) const213         utf::code_point from_unicode(state_type &,utf::code_point u,char *begin,char const *end) const
214         {
215             return cvt_.from_unicode(u,begin,end);
216         }
217     private:
218         simple_converter_impl cvt_;
219 
220     };
221 
222     namespace {
223         char const *simple_encoding_table[] = {
224             "cp1250",
225             "cp1251",
226             "cp1252",
227             "cp1253",
228             "cp1254",
229             "cp1255",
230             "cp1256",
231             "cp1257",
232             "iso88591",
233             "iso885913",
234             "iso885915",
235             "iso88592",
236             "iso88593",
237             "iso88594",
238             "iso88595",
239             "iso88596",
240             "iso88597",
241             "iso88598",
242             "iso88599",
243             "koi8r",
244             "koi8u",
245             "usascii",
246             "windows1250",
247             "windows1251",
248             "windows1252",
249             "windows1253",
250             "windows1254",
251             "windows1255",
252             "windows1256",
253             "windows1257"
254         };
255 
compare_strings(char const * l,char const * r)256         bool compare_strings(char const *l,char const *r)
257         {
258             return strcmp(l,r) < 0;
259         }
260     }
261 
check_is_simple_encoding(std::string const & encoding)262     bool check_is_simple_encoding(std::string const &encoding)
263     {
264         std::string norm = conv::impl::normalize_encoding(encoding.c_str());
265         return std::binary_search<char const **>( simple_encoding_table,
266                         simple_encoding_table + sizeof(simple_encoding_table)/sizeof(char const *),
267                         norm.c_str(),
268                         compare_strings);
269         return 0;
270     }
271 
272     #if !defined(BOOST_LOCALE_HIDE_AUTO_PTR) && !defined(BOOST_NO_AUTO_PTR)
create_utf8_converter()273     std::auto_ptr<base_converter> create_utf8_converter()
274     {
275         std::auto_ptr<base_converter> res(create_utf8_converter_new_ptr());
276         return res;
277     }
create_simple_converter(std::string const & encoding)278     std::auto_ptr<base_converter> create_simple_converter(std::string const &encoding)
279     {
280         std::auto_ptr<base_converter> res(create_simple_converter_new_ptr(encoding));
281         return res;
282     }
create_codecvt(std::locale const & in,std::auto_ptr<base_converter> cvt,character_facet_type type)283     std::locale create_codecvt(std::locale const &in,std::auto_ptr<base_converter> cvt,character_facet_type type)
284     {
285         return create_codecvt_from_pointer(in,cvt.release(),type);
286     }
287     #endif
288     #ifndef BOOST_NO_CXX11_SMART_PTR
create_utf8_converter_unique_ptr()289     std::unique_ptr<base_converter> create_utf8_converter_unique_ptr()
290     {
291         std::unique_ptr<base_converter> res(create_utf8_converter_new_ptr());
292         return res;
293     }
create_simple_converter_unique_ptr(std::string const & encoding)294     std::unique_ptr<base_converter> create_simple_converter_unique_ptr(std::string const &encoding)
295     {
296         std::unique_ptr<base_converter> res(create_simple_converter_new_ptr(encoding));
297         return res;
298     }
create_codecvt(std::locale const & in,std::unique_ptr<base_converter> cvt,character_facet_type type)299     std::locale create_codecvt(std::locale const &in,std::unique_ptr<base_converter> cvt,character_facet_type type)
300     {
301         return create_codecvt_from_pointer(in,cvt.release(),type);
302     }
303     #endif
304 
create_simple_converter_new_ptr(std::string const & encoding)305     base_converter *create_simple_converter_new_ptr(std::string const &encoding)
306     {
307         if(check_is_simple_encoding(encoding))
308             return new simple_converter(encoding);
309         return 0;
310     }
311 
create_utf8_converter_new_ptr()312     base_converter *create_utf8_converter_new_ptr()
313     {
314         return new utf8_converter();
315     }
316 
317     template<typename CharType>
318     class code_converter : public generic_codecvt<CharType,code_converter<CharType> >
319     {
320     public:
321         #ifndef BOOST_NO_CXX11_SMART_PTR
322         typedef std::unique_ptr<base_converter> base_converter_ptr;
323         #define PTR_TRANS(x) std::move((x))
324         #else
325         typedef std::auto_ptr<base_converter> base_converter_ptr;
326         #define PTR_TRANS(x) (x)
327         #endif
328         typedef base_converter_ptr state_type;
329 
code_converter(base_converter_ptr cvt,size_t refs=0)330         code_converter(base_converter_ptr cvt,size_t refs = 0) :
331             generic_codecvt<CharType,code_converter<CharType> >(refs),
332             cvt_(PTR_TRANS(cvt))
333         {
334             max_len_ = cvt_->max_len();
335             thread_safe_ = cvt_->is_thread_safe();
336         }
337 
338 
max_encoding_length() const339         int max_encoding_length() const
340         {
341             return max_len_;
342         }
343 
initial_state(generic_codecvt_base::initial_convertion_state) const344         base_converter_ptr initial_state(generic_codecvt_base::initial_convertion_state /* unused */) const
345         {
346             base_converter_ptr r;
347             if(!thread_safe_)
348                 r.reset(cvt_->clone());
349             return r;
350         }
351 
to_unicode(base_converter_ptr & ptr,char const * & begin,char const * end) const352         utf::code_point to_unicode(base_converter_ptr &ptr,char const *&begin,char const *end) const
353         {
354             if(thread_safe_)
355                 return cvt_->to_unicode(begin,end);
356             else
357                 return ptr->to_unicode(begin,end);
358         }
359 
from_unicode(base_converter_ptr & ptr,utf::code_point u,char * begin,char const * end) const360         utf::code_point from_unicode(base_converter_ptr &ptr,utf::code_point u,char *begin,char const *end) const
361         {
362             if(thread_safe_)
363                 return cvt_->from_unicode(u,begin,end);
364             else
365                 return ptr->from_unicode(u,begin,end);
366         }
367 
368     private:
369         base_converter_ptr cvt_;
370         int max_len_;
371         bool thread_safe_;
372     };
373 
374 
create_codecvt_from_pointer(std::locale const & in,base_converter * pcvt,character_facet_type type)375     std::locale create_codecvt_from_pointer(std::locale const &in,base_converter *pcvt,character_facet_type type)
376     {
377         code_converter<char>::base_converter_ptr cvt(pcvt);
378         if(!cvt.get())
379             cvt.reset(new base_converter());
380         switch(type) {
381         case char_facet:
382             return std::locale(in,new code_converter<char>(PTR_TRANS(cvt)));
383         case wchar_t_facet:
384             return std::locale(in,new code_converter<wchar_t>(PTR_TRANS(cvt)));
385         #if defined(BOOST_LOCALE_ENABLE_CHAR16_T) && !defined(BOOST_NO_CHAR16_T_CODECVT)
386         case char16_t_facet:
387             return std::locale(in,new code_converter<char16_t>(PTR_TRANS(cvt)));
388         #endif
389         #if defined(BOOST_LOCALE_ENABLE_CHAR32_T) && !defined(BOOST_NO_CHAR32_T_CODECVT)
390         case char32_t_facet:
391             return std::locale(in,new code_converter<char32_t>(PTR_TRANS(cvt)));
392         #endif
393         default:
394             return in;
395         }
396     }
397 
398 
399     ///
400     /// Install utf8 codecvt to UTF-16 or UTF-32 into locale \a in and return
401     /// new locale that is based on \a in and uses new facet.
402     ///
create_utf8_codecvt(std::locale const & in,character_facet_type type)403     std::locale create_utf8_codecvt(std::locale const &in,character_facet_type type)
404     {
405         switch(type) {
406         case char_facet:
407             return std::locale(in,new utf8_codecvt<char>());
408         case wchar_t_facet:
409             return std::locale(in,new utf8_codecvt<wchar_t>());
410         #if defined(BOOST_LOCALE_ENABLE_CHAR16_T) && !defined(BOOST_NO_CHAR16_T_CODECVT)
411         case char16_t_facet:
412             return std::locale(in,new utf8_codecvt<char16_t>());
413         #endif
414         #if defined(BOOST_LOCALE_ENABLE_CHAR32_T) && !defined(BOOST_NO_CHAR32_T_CODECVT)
415         case char32_t_facet:
416             return std::locale(in,new utf8_codecvt<char32_t>());
417         #endif
418         default:
419             return in;
420         }
421     }
422 
423     ///
424     /// This function installs codecvt that can be used for conversion between single byte
425     /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points,
426     ///
427     /// Throws invalid_charset_error if the chacater set is not supported or isn't single byte character
428     /// set
create_simple_codecvt(std::locale const & in,std::string const & encoding,character_facet_type type)429     std::locale create_simple_codecvt(std::locale const &in,std::string const &encoding,character_facet_type type)
430     {
431         if(!check_is_simple_encoding(encoding))
432             throw boost::locale::conv::invalid_charset_error("Invalid simple encoding " + encoding);
433 
434         switch(type) {
435         case char_facet:
436             return std::locale(in,new simple_codecvt<char>(encoding));
437         case wchar_t_facet:
438             return std::locale(in,new simple_codecvt<wchar_t>(encoding));
439         #if defined(BOOST_LOCALE_ENABLE_CHAR16_T) && !defined(BOOST_NO_CHAR16_T_CODECVT)
440         case char16_t_facet:
441             return std::locale(in,new simple_codecvt<char16_t>(encoding));
442         #endif
443         #if defined(BOOST_LOCALE_ENABLE_CHAR32_T) && !defined(BOOST_NO_CHAR32_T_CODECVT)
444         case char32_t_facet:
445             return std::locale(in,new simple_codecvt<char32_t>(encoding));
446         #endif
447         default:
448             return in;
449         }
450     }
451 
452 
453 
454 } // util
455 } // locale
456 } // boost
457 
458 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
459