1//
2//  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3//
4//  Distributed under the Boost Software License, Version 1.0. (See
5//  accompanying file LICENSE_1_0.txt or copy at
6//  http://www.boost.org/LICENSE_1_0.txt)
7//
8
9#ifndef BOOST_LOCALE_IMPL_WCONV_CODEPAGE_HPP
10#define BOOST_LOCALE_IMPL_WCONV_CODEPAGE_HPP
11
12
13#include <boost/locale/encoding.hpp>
14#include <algorithm>
15#include <cstring>
16#include <string>
17#include "conv.hpp"
18
19#ifndef NOMINMAX
20# define NOMINMAX
21#endif
22#include <windows.h>
23#include <vector>
24
25
26namespace boost {
27namespace locale {
28namespace conv {
29namespace impl {
30
    // One row of the encoding-name -> Windows code page lookup table.
    struct windows_encoding {
        char const *name;       // encoding name; the key compared with strcmp() by operator<
        unsigned codepage;      // Windows code page identifier associated with that name
        unsigned was_tested;    // set to 1 by encoding_to_windows_codepage() once IsValidCodePage() accepted the code page
    };
36
37    bool operator<(windows_encoding const &l,windows_encoding const &r)
38    {
39        return strcmp(l.name,r.name) < 0;
40    }
41
    // Lookup table mapping encoding names to Windows code page numbers.
    //
    // encoding_to_windows_codepage() searches this array with std::lower_bound
    // using operator< (strcmp on the name), so the rows MUST stay sorted in
    // strcmp order of their names. The names are matched against the result of
    // normalize_encoding(). The third field (was_tested) starts at 0 and is
    // flipped to 1 at run time, which is why the table is not const.
    windows_encoding all_windows_encodings[] = {
        { "big5",       950, 0 },
        { "cp1250",     1250, 0 },
        { "cp1251",     1251, 0 },
        { "cp1252",     1252, 0 },
        { "cp1253",     1253, 0 },
        { "cp1254",     1254, 0 },
        { "cp1255",     1255, 0 },
        { "cp1256",     1256, 0 },
        { "cp1257",     1257, 0 },
        { "cp874",      874, 0 },
        { "cp932",      932, 0 },
        { "cp936",      936, 0 },
        { "eucjp",      20932, 0 },
        { "euckr",      51949, 0 },
        { "gb18030",    54936, 0 },
        { "gb2312",     20936, 0 },
        { "gbk",        936, 0 },
        { "iso2022jp",  50220, 0 },
        { "iso2022kr",  50225, 0 },
        { "iso88591",   28591, 0 },
        { "iso885913",  28603, 0 },
        { "iso885915",  28605, 0 },
        { "iso88592",   28592, 0 },
        { "iso88593",   28593, 0 },
        { "iso88594",   28594, 0 },
        { "iso88595",   28595, 0 },
        { "iso88596",   28596, 0 },
        { "iso88597",   28597, 0 },
        { "iso88598",   28598, 0 },
        { "iso88599",   28599, 0 },
        { "koi8r",      20866, 0 },
        { "koi8u",      21866, 0 },
        { "ms936",      936, 0 },
        { "shiftjis",   932, 0 },
        { "sjis",       932, 0 },
        { "usascii",    20127, 0 },
        { "utf8",       65001, 0 },
        { "windows1250",        1250, 0 },
        { "windows1251",        1251, 0 },
        { "windows1252",        1252, 0 },
        { "windows1253",        1253, 0 },
        { "windows1254",        1254, 0 },
        { "windows1255",        1255, 0 },
        { "windows1256",        1256, 0 },
        { "windows1257",        1257, 0 },
        { "windows874",         874, 0 },
        { "windows932",         932, 0 },
        { "windows936",         936, 0 },
    };
92
93    size_t remove_substitutions(std::vector<char> &v)
94    {
95        if(std::find(v.begin(),v.end(),0) == v.end()) {
96            return v.size();
97        }
98        std::vector<char> v2;
99        v2.reserve(v.size());
100        for(unsigned i=0;i<v.size();i++) {
101            if(v[i]!=0)
102                v2.push_back(v[i]);
103        }
104        v.swap(v2);
105        return v.size();
106    }
107
108    void multibyte_to_wide_one_by_one(int codepage,char const *begin,char const *end,std::vector<wchar_t> &buf)
109    {
110        buf.reserve(end-begin);
111        while(begin!=end) {
112            wchar_t wide_buf[4];
113            int n = 0;
114            int len = IsDBCSLeadByteEx(codepage,*begin) ? 2 : 1;
115            if(len == 2 && begin+1==end)
116                return;
117            n = MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,len,wide_buf,4);
118            for(int i=0;i<n;i++)
119                buf.push_back(wide_buf[i]);
120            begin+=len;
121        }
122    }
123
124
125    void multibyte_to_wide(int codepage,char const *begin,char const *end,bool do_skip,std::vector<wchar_t> &buf)
126    {
127        if(begin==end)
128            return;
129        int n = MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,end-begin,0,0);
130        if(n == 0) {
131            if(do_skip) {
132                multibyte_to_wide_one_by_one(codepage,begin,end,buf);
133                return;
134            }
135            throw conversion_error();
136        }
137
138        buf.resize(n,0);
139        if(MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,end-begin,&buf.front(),buf.size())==0)
140            throw conversion_error();
141    }
142
143    void wide_to_multibyte_non_zero(int codepage,wchar_t const *begin,wchar_t const *end,bool do_skip,std::vector<char> &buf)
144    {
145        if(begin==end)
146            return;
147        BOOL substitute = FALSE;
148        BOOL *substitute_ptr = codepage == 65001 || codepage == 65000 ? 0 : &substitute;
149        char subst_char = 0;
150        char *subst_char_ptr = codepage == 65001 || codepage == 65000 ? 0 : &subst_char;
151
152        int n = WideCharToMultiByte(codepage,0,begin,end-begin,0,0,subst_char_ptr,substitute_ptr);
153        buf.resize(n);
154
155        if(WideCharToMultiByte(codepage,0,begin,end-begin,&buf[0],n,subst_char_ptr,substitute_ptr)==0)
156            throw conversion_error();
157        if(substitute) {
158            if(do_skip)
159                remove_substitutions(buf);
160            else
161                throw conversion_error();
162        }
163    }
164
165    void wide_to_multibyte(int codepage,wchar_t const *begin,wchar_t const *end,bool do_skip,std::vector<char> &buf)
166    {
167        if(begin==end)
168            return;
169        buf.reserve(end-begin);
170        wchar_t const *e = std::find(begin,end,L'\0');
171        wchar_t const *b = begin;
172        for(;;) {
173            std::vector<char> tmp;
174            wide_to_multibyte_non_zero(codepage,b,e,do_skip,tmp);
175            size_t osize = buf.size();
176            buf.resize(osize+tmp.size());
177            std::copy(tmp.begin(),tmp.end(),buf.begin()+osize);
178            if(e!=end) {
179                buf.push_back('\0');
180                b=e+1;
181                e=std::find(b,end,L'0');
182            }
183            else
184                break;
185        }
186    }
187
188
189    int encoding_to_windows_codepage(char const *ccharset)
190    {
191        std::string charset = normalize_encoding(ccharset);
192        windows_encoding ref;
193        ref.name = charset.c_str();
194        size_t n = sizeof(all_windows_encodings)/sizeof(all_windows_encodings[0]);
195        windows_encoding *begin = all_windows_encodings;
196        windows_encoding *end = all_windows_encodings + n;
197        windows_encoding *ptr = std::lower_bound(begin,end,ref);
198        if(ptr!=end && strcmp(ptr->name,charset.c_str())==0) {
199            if(ptr->was_tested) {
200                return ptr->codepage;
201            }
202            else if(IsValidCodePage(ptr->codepage)) {
203                // the thread safety is not an issue, maximum
204                // it would be checked more then once
205                ptr->was_tested=1;
206                return ptr->codepage;
207            }
208            else {
209                return -1;
210            }
211        }
212        return -1;
213
214    }
215
216    template<typename CharType>
217    bool validate_utf16(CharType const *str,unsigned len)
218    {
219        CharType const *begin = str;
220        CharType const *end = str+len;
221        while(begin!=end) {
222            utf::code_point c = utf::utf_traits<CharType,2>::template decode<CharType const *>(begin,end);
223            if(c==utf::illegal || c==utf::incomplete)
224                return false;
225        }
226        return true;
227    }
228
229    template<typename CharType,typename OutChar>
230    void clean_invalid_utf16(CharType const *str,unsigned len,std::vector<OutChar> &out)
231    {
232        out.reserve(len);
233        for(unsigned i=0;i<len;i++) {
234            uint16_t c = static_cast<uint16_t>(str[i]);
235
236            if(0xD800 <= c && c<= 0xDBFF) {
237                i++;
238                if(i>=len)
239                    return;
240                uint16_t c2=static_cast<uint16_t>(str[i]);
241                if(0xDC00 <= c2 && c2 <= 0xDFFF) {
242                    out.push_back(static_cast<OutChar>(c));
243                    out.push_back(static_cast<OutChar>(c2));
244                }
245            }
246            else if(0xDC00 <= c && c <=0xDFFF)
247                continue;
248            else
249                out.push_back(static_cast<OutChar>(c));
250        }
251    }
252
253
254    class wconv_between : public converter_between {
255    public:
256        wconv_between() :
257            how_(skip),
258            to_code_page_ (-1),
259            from_code_page_ ( -1)
260        {
261        }
262        bool open(char const *to_charset,char const *from_charset,method_type how)
263        {
264            how_ = how;
265            to_code_page_ = encoding_to_windows_codepage(to_charset);
266            from_code_page_ = encoding_to_windows_codepage(from_charset);
267            if(to_code_page_ == -1 || from_code_page_ == -1)
268                return false;
269            return true;
270        }
271        virtual std::string convert(char const *begin,char const *end)
272        {
273            if(to_code_page_ == 65001 && from_code_page_ == 65001)
274                return utf_to_utf<char>(begin,end,how_);
275
276            std::string res;
277
278            std::vector<wchar_t> tmp;   // buffer for mb2w
279            std::wstring tmps;          // buffer for utf_to_utf
280            wchar_t const *wbegin=0;
281            wchar_t const *wend=0;
282
283            if(from_code_page_ == 65001) {
284                tmps = utf_to_utf<wchar_t>(begin,end,how_);
285                if(tmps.empty())
286                    return res;
287                wbegin = tmps.c_str();
288                wend = wbegin + tmps.size();
289            }
290            else {
291                multibyte_to_wide(from_code_page_,begin,end,how_ == skip,tmp);
292                if(tmp.empty())
293                    return res;
294                wbegin = &tmp[0];
295                wend = wbegin + tmp.size();
296            }
297
298            if(to_code_page_ == 65001) {
299                return utf_to_utf<char>(wbegin,wend,how_);
300            }
301
302            std::vector<char> ctmp;
303            wide_to_multibyte(to_code_page_,wbegin,wend,how_ == skip,ctmp);
304            if(ctmp.empty())
305                return res;
306            res.assign(&ctmp.front(),ctmp.size());
307            return res;
308        }
309    private:
310        method_type how_;
311        int to_code_page_;
312        int from_code_page_;
313    };
314
    // Converter from a code page encoding to UTF text of CharType. Only declared
    // here; the implementation is selected by specializing on sizeof(CharType)
    // (1, 2 and 4 are provided below).
    template<typename CharType,int size = sizeof(CharType) >
    class wconv_to_utf;

    // Converter from UTF text of CharType to a code page encoding. Only declared
    // here; the implementation is selected by specializing on sizeof(CharType)
    // (1, 2 and 4 are provided below).
    template<typename CharType,int size = sizeof(CharType) >
    class wconv_from_utf;
320
    // char specialization: converting to UTF-8 is a narrow-to-narrow conversion,
    // so all the work is delegated to wconv_between with "UTF-8" as the target.
    template<>
    class wconv_to_utf<char,1> : public  converter_to_utf<char> , public wconv_between {
    public:
        // Opens a conversion from charset cs to UTF-8; false if it cannot be set up.
        virtual bool open(char const *cs,method_type how)
        {
            return wconv_between::open("UTF-8",cs,how);
        }
        // Converts [begin,end) from the opened charset to UTF-8.
        virtual std::string convert(char const *begin,char const *end)
        {
            return wconv_between::convert(begin,end);
        }
    };
333
    // char specialization: converting from UTF-8 is a narrow-to-narrow conversion,
    // so all the work is delegated to wconv_between with "UTF-8" as the source.
    template<>
    class wconv_from_utf<char,1> : public  converter_from_utf<char> , public wconv_between {
    public:
        // Opens a conversion from UTF-8 to charset cs; false if it cannot be set up.
        virtual bool open(char const *cs,method_type how)
        {
            return wconv_between::open(cs,"UTF-8",how);
        }
        // Converts [begin,end) from UTF-8 to the opened charset.
        virtual std::string convert(char const *begin,char const *end)
        {
            return wconv_between::convert(begin,end);
        }
    };
346
347    template<typename CharType>
348    class wconv_to_utf<CharType,2> : public converter_to_utf<CharType> {
349    public:
350        typedef CharType char_type;
351
352        typedef std::basic_string<char_type> string_type;
353
354        wconv_to_utf() :
355            how_(skip),
356            code_page_(-1)
357        {
358        }
359
360        virtual bool open(char const *charset,method_type how)
361        {
362            how_ = how;
363            code_page_ = encoding_to_windows_codepage(charset);
364            return code_page_ != -1;
365        }
366
367        virtual string_type convert(char const *begin,char const *end)
368        {
369            if(code_page_ == 65001) {
370                return utf_to_utf<char_type>(begin,end,how_);
371            }
372            std::vector<wchar_t> tmp;
373            multibyte_to_wide(code_page_,begin,end,how_ == skip,tmp);
374            string_type res;
375            if(!tmp.empty())
376                res.assign(reinterpret_cast<char_type *>(&tmp.front()),tmp.size());
377            return res;
378        }
379
380    private:
381        method_type how_;
382        int code_page_;
383    };
384
385    template<typename CharType>
386    class wconv_from_utf<CharType,2> : public converter_from_utf<CharType> {
387    public:
388        typedef CharType char_type;
389
390        typedef std::basic_string<char_type> string_type;
391
392        wconv_from_utf() :
393            how_(skip),
394            code_page_(-1)
395        {
396        }
397
398        virtual bool open(char const *charset,method_type how)
399        {
400            how_ = how;
401            code_page_ = encoding_to_windows_codepage(charset);
402            return code_page_ != -1;
403        }
404
405        virtual std::string convert(CharType const *begin,CharType const *end)
406        {
407            if(code_page_ == 65001) {
408                return utf_to_utf<char>(begin,end,how_);
409            }
410            wchar_t const *wbegin = 0;
411            wchar_t const *wend = 0;
412            std::vector<wchar_t> buffer; // if needed
413            if(begin==end)
414                return std::string();
415            if(validate_utf16(begin,end-begin)) {
416                wbegin =  reinterpret_cast<wchar_t const *>(begin);
417                wend = reinterpret_cast<wchar_t const *>(end);
418            }
419            else {
420                if(how_ == stop) {
421                        throw conversion_error();
422                }
423                else {
424                    clean_invalid_utf16(begin,end-begin,buffer);
425                    if(!buffer.empty()) {
426                        wbegin = &buffer[0];
427                        wend = wbegin + buffer.size();
428                    }
429                }
430            }
431            std::string res;
432            if(wbegin==wend)
433                return res;
434            std::vector<char> ctmp;
435            wide_to_multibyte(code_page_,wbegin,wend,how_ == skip,ctmp);
436            if(ctmp.empty())
437                return res;
438            res.assign(&ctmp.front(),ctmp.size());
439            return res;
440        }
441
442    private:
443        method_type how_;
444        int code_page_;
445    };
446
447
448
449    template<typename CharType>
450    class wconv_to_utf<CharType,4> : public converter_to_utf<CharType> {
451    public:
452        typedef CharType char_type;
453
454        typedef std::basic_string<char_type> string_type;
455
456        wconv_to_utf() :
457            how_(skip),
458            code_page_(-1)
459        {
460        }
461
462        virtual bool open(char const *charset,method_type how)
463        {
464            how_ = how;
465            code_page_ = encoding_to_windows_codepage(charset);
466            return code_page_ != -1;
467        }
468
469        virtual string_type convert(char const *begin,char const *end)
470        {
471            if(code_page_ == 65001) {
472                return utf_to_utf<char_type>(begin,end,how_);
473            }
474            std::vector<wchar_t> buf;
475            multibyte_to_wide(code_page_,begin,end,how_ == skip,buf);
476
477            if(buf.empty())
478                return string_type();
479
480            return utf_to_utf<CharType>(&buf[0],&buf[0]+buf.size(),how_);
481        }
482    private:
483        method_type how_;
484        int code_page_;
485    };
486
487    template<typename CharType>
488    class wconv_from_utf<CharType,4> : public converter_from_utf<CharType> {
489    public:
490        typedef CharType char_type;
491
492        typedef std::basic_string<char_type> string_type;
493
494        wconv_from_utf() :
495            how_(skip),
496            code_page_(-1)
497        {
498        }
499
500        virtual bool open(char const *charset,method_type how)
501        {
502            how_ = how;
503            code_page_ = encoding_to_windows_codepage(charset);
504            return code_page_ != -1;
505        }
506
507        virtual std::string convert(CharType const *begin,CharType const *end)
508        {
509            if(code_page_ == 65001) {
510                return utf_to_utf<char>(begin,end,how_);
511            }
512            std::wstring tmp = utf_to_utf<wchar_t>(begin,end,how_);
513
514            std::vector<char> ctmp;
515            wide_to_multibyte(code_page_,tmp.c_str(),tmp.c_str()+tmp.size(),how_ == skip,ctmp);
516            std::string res;
517            if(ctmp.empty())
518                return res;
519            res.assign(&ctmp.front(),ctmp.size());
520            return res;
521
522        }
523
524    private:
525        method_type how_;
526        int code_page_;
527    };
528
529
530
531
532
533} // impl
534} // conv
535} // locale
536} // boost
537
538#endif
539// vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
540