1 //
2 //  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3 //
4 //  Distributed under the Boost Software License, Version 1.0. (See
5 //  accompanying file LICENSE_1_0.txt or copy at
6 //  http://www.boost.org/LICENSE_1_0.txt)
7 //
8 #define BOOST_LOCALE_SOURCE
9 #include <boost/locale/generator.hpp>
10 #include <boost/locale/encoding.hpp>
11 
12 #include "../encoding/conv.hpp"
13 
14 #include <boost/locale/util.hpp>
15 
16 #ifdef BOOST_MSVC
17 #  pragma warning(disable : 4244 4996) // loose data
18 #endif
19 
20 #include <cstddef>
21 #include <string.h>
22 #include <vector>
23 #include <algorithm>
24 
25 //#define DEBUG_CODECVT
26 
27 #ifdef DEBUG_CODECVT
28 #include <iostream>
29 #endif
30 
31 namespace boost {
32 namespace locale {
33 namespace util {
34 
35     class utf8_converter  : public base_converter {
36     public:
max_len() const37         virtual int max_len() const
38         {
39             return 4;
40         }
41 
clone() const42         virtual utf8_converter *clone() const
43         {
44             return new utf8_converter();
45         }
46 
is_thread_safe() const47         bool is_thread_safe() const
48         {
49             return true;
50         }
51 
to_unicode(char const * & begin,char const * end)52         virtual uint32_t to_unicode(char const *&begin,char const *end)
53         {
54             char const *p=begin;
55 
56             utf::code_point c = utf::utf_traits<char>::decode(p,end);
57 
58             if(c==utf::illegal)
59                 return illegal;
60 
61             if(c==utf::incomplete)
62                 return incomplete;
63 
64             begin = p;
65             return c;
66         }
67 
from_unicode(uint32_t u,char * begin,char const * end)68         virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end)
69         {
70             if(!utf::is_valid_codepoint(u))
71                 return illegal;
72             int width = utf::utf_traits<char>::width(u);
73             std::ptrdiff_t d=end-begin;
74             if(d < width)
75                 return incomplete;
76             utf::utf_traits<char>::encode(u,begin);
77             return width;
78         }
79     }; // utf8_converter
80 
81     class simple_converter : public base_converter {
82     public:
83 
~simple_converter()84         virtual ~simple_converter()
85         {
86         }
87 
simple_converter(std::string const & encoding)88         simple_converter(std::string const &encoding)
89         {
90             for(unsigned i=0;i<128;i++)
91                 to_unicode_tbl_[i]=i;
92             for(unsigned i=128;i<256;i++) {
93                 char buf[2] = { char(i) , 0 };
94                 try {
95                     std::wstring const tmp = conv::to_utf<wchar_t>(buf,buf+1,encoding,conv::stop);
96                     if(tmp.size() == 1) {
97                         to_unicode_tbl_[i] = tmp[0];
98                     }
99                     else {
100                         to_unicode_tbl_[i] = illegal;
101                     }
102                 }
103                 catch(conv::conversion_error const &/*e*/) {
104                     to_unicode_tbl_[i] = illegal;
105                 }
106             }
107             from_unicode_tbl_.resize(256);
108             for(unsigned i=0;i<256;i++) {
109                 from_unicode_tbl_[to_unicode_tbl_[i] & 0xFF].push_back(i);
110             }
111         }
112 
max_len() const113         virtual int max_len() const
114         {
115             return 1;
116         }
117 
is_thread_safe() const118         virtual bool is_thread_safe() const
119         {
120             return true;
121         }
clone() const122         virtual base_converter *clone() const
123         {
124            return new simple_converter(*this);
125         }
to_unicode(char const * & begin,char const * end)126         virtual uint32_t to_unicode(char const *&begin,char const *end)
127         {
128             if(begin==end)
129                 return incomplete;
130             unsigned char c = *begin++;
131             return to_unicode_tbl_[c];
132         }
from_unicode(uint32_t u,char * begin,char const * end)133         virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end)
134         {
135             if(begin==end)
136                 return incomplete;
137             std::vector<unsigned char> const &tbl = from_unicode_tbl_[u & 0xFF];
138             for(std::vector<unsigned char>::const_iterator p=tbl.begin();p!=tbl.end();++p) {
139                 if(to_unicode_tbl_[*p]==u) {
140                     *begin++ = *p;
141                     return 1;
142                 }
143             }
144             return illegal;
145         }
146     private:
147         uint32_t to_unicode_tbl_[256];
148         std::vector<std::vector<unsigned char> > from_unicode_tbl_;
149     };
150 
namespace {
    // Normalized names of the single-byte encodings that simple_converter
    // is used for.
    //
    // The entries MUST stay sorted in strcmp() order: the table is searched
    // with std::binary_search using compare_strings below.
    char const *simple_encoding_table[] = {
        // Windows code pages, "cp" spelling
        "cp1250", "cp1251", "cp1252", "cp1253",
        "cp1254", "cp1255", "cp1256", "cp1257",
        // ISO-8859 family (note: "iso885913"/"iso885915" sort before "iso88592")
        "iso88591", "iso885913", "iso885915",
        "iso88592", "iso88593", "iso88594", "iso88595",
        "iso88596", "iso88597", "iso88598", "iso88599",
        // KOI8 and ASCII
        "koi8r", "koi8u",
        "usascii",
        // Windows code pages, "windows" spelling
        "windows1250", "windows1251", "windows1252", "windows1253",
        "windows1254", "windows1255", "windows1256", "windows1257"
    };

    // Strict weak ordering on C strings: true when l sorts before r
    // according to strcmp().
    bool compare_strings(char const *l,char const *r)
    {
        int const order = strcmp(l,r);
        return order < 0;
    }
}
190 
191 
create_simple_converter(std::string const & encoding)192     std::auto_ptr<base_converter> create_simple_converter(std::string const &encoding)
193     {
194         std::auto_ptr<base_converter> res;
195         std::string norm = conv::impl::normalize_encoding(encoding.c_str());
196         if(std::binary_search<char const **>( simple_encoding_table,
197                         simple_encoding_table + sizeof(simple_encoding_table)/sizeof(char const *),
198                         norm.c_str(),
199                         compare_strings))
200         {
201             res.reset(new simple_converter(encoding));
202         }
203         return res;
204     }
205 
206 
207 
create_utf8_converter()208     std::auto_ptr<base_converter> create_utf8_converter()
209     {
210         std::auto_ptr<base_converter> res(new utf8_converter());
211         return res;
212     }
213 
//
// Maps a character type to a fixed-width unsigned integer type chosen by
// the character's size in bytes: 2 -> uint16_t, 4 -> uint32_t.
//
// The primary template is only declared, so any other size fails to
// compile when uint_type is requested.
//
template<typename Char,int Width=sizeof(Char)>
struct uchar_traits;

// 2-byte characters
template<typename Char>
struct uchar_traits<Char,2> {
    typedef uint16_t uint_type;
};

// 4-byte characters
template<typename Char>
struct uchar_traits<Char,4> {
    typedef uint32_t uint_type;
};
228 
229     // Real codecvt
230 
231     template<typename CharType>
232     class code_converter : public std::codecvt<CharType,char,std::mbstate_t>
233     {
234     public:
code_converter(std::auto_ptr<base_converter> cvt,size_t refs=0)235         code_converter(std::auto_ptr<base_converter> cvt,size_t refs = 0) :
236           std::codecvt<CharType,char,std::mbstate_t>(refs),
237             cvt_(cvt)
238         {
239             max_len_ = cvt_->max_len();
240         }
241     protected:
242 
243         typedef CharType uchar;
244 
do_unshift(std::mbstate_t & s,char * from,char *,char * & next) const245         virtual std::codecvt_base::result do_unshift(std::mbstate_t &s,char *from,char * /*to*/,char *&next) const
246         {
247             uint16_t &state = *reinterpret_cast<uint16_t *>(&s);
248 #ifdef DEBUG_CODECVT
249             std::cout << "Entering unshift " << std::hex << state << std::dec << std::endl;
250 #endif
251             if(state != 0)
252                 return std::codecvt_base::error;
253             next=from;
254             return std::codecvt_base::ok;
255         }
do_encoding() const256         virtual int do_encoding() const throw()
257         {
258             return 0;
259         }
do_max_length() const260         virtual int do_max_length() const throw()
261         {
262             return max_len_;
263         }
do_always_noconv() const264         virtual bool do_always_noconv() const throw()
265         {
266             return false;
267         }
268 
269         virtual std::codecvt_base::result
do_in(std::mbstate_t & state,char const * from,char const * from_end,char const * & from_next,uchar * uto,uchar * uto_end,uchar * & uto_next) const270         do_in(  std::mbstate_t &state,
271                 char const *from,
272                 char const *from_end,
273                 char const *&from_next,
274                 uchar *uto,
275                 uchar *uto_end,
276                 uchar *&uto_next) const
277         {
278             typedef typename uchar_traits<uchar>::uint_type uint_type;
279             uint_type *to=reinterpret_cast<uint_type *>(uto);
280             uint_type *to_end=reinterpret_cast<uint_type *>(uto_end);
281             uint_type *&to_next=reinterpret_cast<uint_type *&>(uto_next);
282             return do_real_in(state,from,from_end,from_next,to,to_end,to_next);
283         }
284 
285         virtual int
do_length(std::mbstate_t & state,char const * from,char const * from_end,size_t max) const286         do_length(  std::mbstate_t &state,
287                 char const *from,
288                 char const *from_end,
289                 size_t max) const
290         {
291             char const *from_next=from;
292             std::vector<uchar> chrs(max+1);
293             uchar *to=&chrs.front();
294             uchar *to_end=to+max;
295             uchar *to_next=to;
296             do_in(state,from,from_end,from_next,to,to_end,to_next);
297             return from_next-from;
298         }
299 
300         virtual std::codecvt_base::result
do_out(std::mbstate_t & state,uchar const * ufrom,uchar const * ufrom_end,uchar const * & ufrom_next,char * to,char * to_end,char * & to_next) const301         do_out( std::mbstate_t &state,
302                 uchar const *ufrom,
303                 uchar const *ufrom_end,
304                 uchar const *&ufrom_next,
305                 char *to,
306                 char *to_end,
307                 char *&to_next) const
308         {
309             typedef typename uchar_traits<uchar>::uint_type uint_type;
310             uint_type const *from=reinterpret_cast<uint_type const *>(ufrom);
311             uint_type const *from_end=reinterpret_cast<uint_type const *>(ufrom_end);
312             uint_type const *&from_next=reinterpret_cast<uint_type const *&>(ufrom_next);
313             return do_real_out(state,from,from_end,from_next,to,to_end,to_next);
314         }
315 
316 
317     private:
318 
319         //
320         // Implementation for UTF-32
321         //
322         std::codecvt_base::result
do_real_in(std::mbstate_t &,char const * from,char const * from_end,char const * & from_next,uint32_t * to,uint32_t * to_end,uint32_t * & to_next) const323         do_real_in( std::mbstate_t &/*state*/,
324                     char const *from,
325                     char const *from_end,
326                     char const *&from_next,
327                     uint32_t *to,
328                     uint32_t *to_end,
329                     uint32_t *&to_next) const
330         {
331             std::auto_ptr<base_converter> cvtp;
332             base_converter *cvt = 0;
333             if(cvt_->is_thread_safe()) {
334                 cvt = cvt_.get();
335             }
336             else {
337                 cvtp.reset(cvt_->clone());
338                 cvt = cvtp.get();
339             }
340             std::codecvt_base::result r=std::codecvt_base::ok;
341             while(to < to_end && from < from_end)
342             {
343                 uint32_t ch=cvt->to_unicode(from,from_end);
344                 if(ch==base_converter::illegal) {
345                     r=std::codecvt_base::error;
346                     break;
347                 }
348                 if(ch==base_converter::incomplete) {
349                     r=std::codecvt_base::partial;
350                     break;
351                 }
352                 *to++=ch;
353             }
354             from_next=from;
355             to_next=to;
356             if(r!=std::codecvt_base::ok)
357                 return r;
358             if(from!=from_end)
359                 return std::codecvt_base::partial;
360             return r;
361         }
362 
363         //
364         // Implementation for UTF-32
365         //
366         std::codecvt_base::result
do_real_out(std::mbstate_t &,uint32_t const * from,uint32_t const * from_end,uint32_t const * & from_next,char * to,char * to_end,char * & to_next) const367         do_real_out(std::mbstate_t &/*state*/, // state is not used there
368                     uint32_t const *from,
369                     uint32_t const *from_end,
370                     uint32_t const *&from_next,
371                     char *to,
372                     char *to_end,
373                     char *&to_next) const
374         {
375             std::auto_ptr<base_converter> cvtp;
376             base_converter *cvt = 0;
377             if(cvt_->is_thread_safe()) {
378                 cvt = cvt_.get();
379             }
380             else {
381                 cvtp.reset(cvt_->clone());
382                 cvt = cvtp.get();
383             }
384 
385             std::codecvt_base::result r=std::codecvt_base::ok;
386             while(to < to_end && from < from_end)
387             {
388                 uint32_t len=cvt->from_unicode(*from,to,to_end);
389                 if(len==base_converter::illegal) {
390                     r=std::codecvt_base::error;
391                     break;
392                 }
393                 if(len==base_converter::incomplete) {
394                     r=std::codecvt_base::partial;
395                     break;
396                 }
397                 from++;
398                 to+=len;
399             }
400             from_next=from;
401             to_next=to;
402             if(r!=std::codecvt_base::ok)
403                 return r;
404             if(from!=from_end)
405                 return std::codecvt_base::partial;
406             return r;
407         }
408 
409         //
410         // Implementation for UTF-16
411         //
412         std::codecvt_base::result
do_real_in(std::mbstate_t & std_state,char const * from,char const * from_end,char const * & from_next,uint16_t * to,uint16_t * to_end,uint16_t * & to_next) const413         do_real_in( std::mbstate_t &std_state,
414                     char const *from,
415                     char const *from_end,
416                     char const *&from_next,
417                     uint16_t *to,
418                     uint16_t *to_end,
419                     uint16_t *&to_next) const
420         {
421             std::auto_ptr<base_converter> cvtp;
422             base_converter *cvt = 0;
423             if(cvt_->is_thread_safe()) {
424                 cvt = cvt_.get();
425             }
426             else {
427                 cvtp.reset(cvt_->clone());
428                 cvt = cvtp.get();
429             }
430             std::codecvt_base::result r=std::codecvt_base::ok;
431             // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
432             // according to standard. We use it to keed a flag 0/1 for surrogate pair writing
433             //
434             // if 0 no code above >0xFFFF observed, of 1 a code above 0xFFFF observerd
435             // and first pair is written, but no input consumed
436             uint16_t &state = *reinterpret_cast<uint16_t *>(&std_state);
437             while(to < to_end && from < from_end)
438             {
439 #ifdef DEBUG_CODECVT
440                 std::cout << "Entering IN--------------" << std::endl;
441                 std::cout << "State " << std::hex << state <<std::endl;
442                 std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
443 #endif
444                 char const *from_saved = from;
445                 uint32_t ch=cvt->to_unicode(from,from_end);
446                 if(ch==base_converter::illegal) {
447                     r=std::codecvt_base::error;
448                     break;
449                 }
450                 if(ch==base_converter::incomplete) {
451                     r=std::codecvt_base::partial;
452                     break;
453                 }
454                 // Normal codepoints go direcly to stream
455                 if(ch <= 0xFFFF) {
456                     *to++=ch;
457                 }
458                 else {
459                     // for  other codepoints we do following
460                     //
461                     // 1. We can't consume our input as we may find ourselfs
462                     //    in state where all input consumed but not all output written,i.e. only
463                     //    1st pair is written
464                     // 2. We only write first pair and mark this in the state, we also revert back
465                     //    the from pointer in order to make sure this codepoint would be read
466                     //    once again and then we would consume our input together with writing
467                     //    second surrogate pair
468                     ch-=0x10000;
469                     uint16_t vh = ch >> 10;
470                     uint16_t vl = ch & 0x3FF;
471                     uint16_t w1 = vh + 0xD800;
472                     uint16_t w2 = vl + 0xDC00;
473                     if(state == 0) {
474                         from = from_saved;
475                         *to++ = w1;
476                         state = 1;
477                     }
478                     else {
479                         *to++ = w2;
480                         state = 0;
481                     }
482                 }
483             }
484             from_next=from;
485             to_next=to;
486             if(r == std::codecvt_base::ok && (from!=from_end || state!=0))
487                 r = std::codecvt_base::partial;
488 #ifdef DEBUG_CODECVT
489             std::cout << "Returning ";
490             switch(r) {
491             case std::codecvt_base::ok:
492                 std::cout << "ok" << std::endl;
493                 break;
494             case std::codecvt_base::partial:
495                 std::cout << "partial" << std::endl;
496                 break;
497             case std::codecvt_base::error:
498                 std::cout << "error" << std::endl;
499                 break;
500             default:
501                 std::cout << "other" << std::endl;
502                 break;
503             }
504             std::cout << "State " << std::hex << state <<std::endl;
505             std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
506 #endif
507             return r;
508         }
509 
510         //encoding// Implementation for UTF-16
511         //
512         std::codecvt_base::result
do_real_out(std::mbstate_t & std_state,uint16_t const * from,uint16_t const * from_end,uint16_t const * & from_next,char * to,char * to_end,char * & to_next) const513         do_real_out(std::mbstate_t &std_state,
514                     uint16_t const *from,
515                     uint16_t const *from_end,
516                     uint16_t const *&from_next,
517                     char *to,
518                     char *to_end,
519                     char *&to_next) const
520         {
521             std::auto_ptr<base_converter> cvtp;
522             base_converter *cvt = 0;
523             if(cvt_->is_thread_safe()) {
524                 cvt = cvt_.get();
525             }
526             else {
527                 cvtp.reset(cvt_->clone());
528                 cvt = cvtp.get();
529             }
530             std::codecvt_base::result r=std::codecvt_base::ok;
531             // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
532             // according to standard. We assume that sizeof(mbstate_t) >=2 in order
533             // to be able to store first observerd surrogate pair
534             //
535             // State: state!=0 - a first surrogate pair was observerd (state = first pair),
536             // we expect the second one to come and then zero the state
537             ///
538             uint16_t &state = *reinterpret_cast<uint16_t *>(&std_state);
539             while(to < to_end && from < from_end)
540             {
541 #ifdef DEBUG_CODECVT
542             std::cout << "Entering OUT --------------" << std::endl;
543             std::cout << "State " << std::hex << state <<std::endl;
544             std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
545 #endif
546                 uint32_t ch=0;
547                 if(state != 0) {
548                     // if the state idecates that 1st surrogate pair was written
549                     // we should make sure that the second one that comes is actually
550                     // second surrogate
551                     uint16_t w1 = state;
552                     uint16_t w2 = *from;
553                     // we don't forward from as writing may fail to incomplete or
554                     // partial conversion
555                     if(0xDC00 <= w2 && w2<=0xDFFF) {
556                         uint16_t vh = w1 - 0xD800;
557                         uint16_t vl = w2 - 0xDC00;
558                         ch=((uint32_t(vh) << 10)  | vl) + 0x10000;
559                     }
560                     else {
561                         // Invalid surrogate
562                         r=std::codecvt_base::error;
563                         break;
564                     }
565                 }
566                 else {
567                     ch = *from;
568                     if(0xD800 <= ch && ch<=0xDBFF) {
569                         // if this is a first surrogate pair we put
570                         // it into the state and consume it, note we don't
571                         // go forward as it should be illegal so we increase
572                         // the from pointer manually
573                         state = ch;
574                         from++;
575                         continue;
576                     }
577                     else if(0xDC00 <= ch && ch<=0xDFFF) {
578                         // if we observe second surrogate pair and
579                         // first only may be expected we should break from the loop with error
580                         // as it is illegal input
581                         r=std::codecvt_base::error;
582                         break;
583                     }
584                 }
585 
586                 uint32_t len=cvt->from_unicode(ch,to,to_end);
587                 if(len==base_converter::illegal) {
588                     r=std::codecvt_base::error;
589                     break;
590                 }
591                 if(len==base_converter::incomplete) {
592                     r=std::codecvt_base::partial;
593                     break;
594                 }
595                 state = 0;
596                 to+=len;
597                 from++;
598             }
599             from_next=from;
600             to_next=to;
601             if(r==std::codecvt_base::ok && from!=from_end)
602                 r = std::codecvt_base::partial;
603 #ifdef DEBUG_CODECVT
604             std::cout << "Returning ";
605             switch(r) {
606             case std::codecvt_base::ok:
607                 std::cout << "ok" << std::endl;
608                 break;
609             case std::codecvt_base::partial:
610                 std::cout << "partial" << std::endl;
611                 break;
612             case std::codecvt_base::error:
613                 std::cout << "error" << std::endl;
614                 break;
615             default:
616                 std::cout << "other" << std::endl;
617                 break;
618             }
619             std::cout << "State " << std::hex << state <<std::endl;
620             std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
621 #endif
622             return r;
623         }
624 
625         int max_len_;
626         std::auto_ptr<base_converter> cvt_;
627 
628     };
629 
// Compile-time check: the UTF-16 code paths store a 16-bit value inside
// std::mbstate_t. When mbstate_t is smaller than 2 bytes the array bound
// below becomes negative and the build fails.
static char const ensure_mbstate_size_is_at_least_2[(sizeof(std::mbstate_t) < 2) ? -1 : 1] = { 0 };
631 
632     template<>
633     class code_converter<char> : public std::codecvt<char,char,std::mbstate_t>
634     {
635     public:
code_converter(std::auto_ptr<base_converter>,size_t refs=0)636         code_converter(std::auto_ptr<base_converter> /*cvt*/,size_t refs = 0) :
637           std::codecvt<char,char,std::mbstate_t>(refs)
638         {
639         }
640     };
641 
642 
create_codecvt(std::locale const & in,std::auto_ptr<base_converter> cvt,character_facet_type type)643     std::locale create_codecvt(std::locale const &in,std::auto_ptr<base_converter> cvt,character_facet_type type)
644     {
645         if(!cvt.get())
646             cvt.reset(new base_converter());
647         switch(type) {
648         case char_facet:
649             return std::locale(in,new code_converter<char>(cvt));
650         case wchar_t_facet:
651             return std::locale(in,new code_converter<wchar_t>(cvt));
652         #if defined(BOOST_HAS_CHAR16_T) && !defined(BOOST_NO_CHAR16_T_CODECVT)
653         case char16_t_facet:
654             return std::locale(in,new code_converter<char16_t>(cvt));
655         #endif
656         #if defined(BOOST_HAS_CHAR32_T) && !defined(BOOST_NO_CHAR32_T_CODECVT)
657         case char32_t_facet:
658             return std::locale(in,new code_converter<char32_t>(cvt));
659         #endif
660         default:
661             return in;
662         }
663     }
664 
665 
666 } // util
667 } // locale
668 } // boost
669 
670 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
671