1 #ifndef BOOST_UTF8_CODECVT_FACET_HPP
2 #define BOOST_UTF8_CODECVT_FACET_HPP
3 
4 #include <boost/iostreams/detail/config/wide_streams.hpp>
5 #ifdef BOOST_IOSTREAMS_NO_WIDE_STREAMS
6 # error wide streams not supported on this platform
7 #endif
8 
9 // MS compatible compilers support #pragma once
10 #if defined(_MSC_VER) && (_MSC_VER >= 1020)
11 # pragma once
12 #endif
13 
14 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
15 // utf8_codecvt_facet.hpp
16 
17 // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
18 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
19 // Distributed under the Boost Software License, Version 1.0. (See accompany-
20 // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
21 
22 // Note:(Robert Ramey).  I have made the following alterations in the original
23 // code.
24 // a) Rendered utf8_codecvt<wchar_t, char>  with using templates
25 // b) Move longer functions outside class definition to prevent inlining
26 // and make code smaller
27 // c) added on a derived class to permit translation to/from current
28 // locale to utf8
29 
30 //  See http://www.boost.org for updates, documentation, and revision history.
31 
32 // archives stored as text - note these are templated on the basic
33 // stream templates to accommodate wide (and other?) kind of characters
34 //
35 // note the fact that on libraries without wide characters, ostream
36 // is not a specialization of basic_ostream which in fact is not defined
37 // in such cases.   So we can't use basic_ostream<OStream::char_type> but rather
38 // use two template parameters
39 //
40 // utf8_codecvt_facet
41 //   This is an implementation of a std::codecvt facet for translating
42 //   from UTF-8 externally to UCS-4.  Note that this is not tied to
43 //   any specific types in order to allow customization on platforms
44 //   where wchar_t is not big enough.
45 //
46 // NOTES:  The current implementation jumps through some unpleasant hoops in
47 // order to deal with signed character types.  As a result,
48 // it is necessary for the ExternType to be convertible to unsigned char.
49 // I chose not to tie the extern_type explicitly to char. But if any combination
50 // of types other than <wchar_t,char> is used, then std::codecvt must be
51 // specialized on those types for this to work.
52 
53 #include <locale>
54 #include <cstddef> // size_t
55 #include <cwchar>  // mbstate_t
56 #include <boost/integer_traits.hpp>
57 #include <boost/iostreams/detail/config/wide_streams.hpp>
58 #include <boost/iostreams/detail/codecvt_helper.hpp>
59 
60 // maximum length of a multibyte string
61 #define MB_LENGTH_MAX 8
62 
63 struct utf8_codecvt_facet_wchar_t
64     : public boost::iostreams::detail::codecvt_helper<wchar_t, char, std::mbstate_t>
65 {
66 public:
utf8_codecvt_facet_wchar_tutf8_codecvt_facet_wchar_t67     explicit utf8_codecvt_facet_wchar_t(std::size_t no_locale_manage = 0)
68         : boost::iostreams::detail::codecvt_helper<wchar_t, char, std::mbstate_t>
69               (no_locale_manage)
70         { }
71 protected:
72     virtual std::codecvt_base::result do_in(
73         std::mbstate_t& state,
74         const char * from,
75         const char * from_end,
76         const char * & from_next,
77         wchar_t * to,
78         wchar_t * to_end,
79         wchar_t*& to_next
80     ) const;
81 
82     virtual std::codecvt_base::result do_out(
83         std::mbstate_t & state, const wchar_t * from,
84         const wchar_t * from_end, const wchar_t*  & from_next,
85         char * to, char * to_end, char * & to_next
86     ) const;
87 
invalid_continuing_octetutf8_codecvt_facet_wchar_t88     bool invalid_continuing_octet(unsigned char octet_1) const {
89         return (octet_1 < 0x80|| 0xbf< octet_1);
90     }
91 
invalid_leading_octetutf8_codecvt_facet_wchar_t92     bool invalid_leading_octet(unsigned char octet_1)   const {
93         return (0x7f < octet_1 && octet_1 < 0xc0) ||
94             (octet_1 > 0xfd);
95     }
96 
97     // continuing octets = octets except for the leading octet
get_cont_octet_countutf8_codecvt_facet_wchar_t98     static unsigned int get_cont_octet_count(unsigned   char lead_octet) {
99         return get_octet_count(lead_octet) - 1;
100     }
101 
102     static unsigned int get_octet_count(unsigned char   lead_octet);
103 
104     // How many "continuing octets" will be needed for this word
105     // ==   total octets - 1.
106     int get_cont_octet_out_count(wchar_t word) const ;
107 
do_always_noconvutf8_codecvt_facet_wchar_t108     virtual bool do_always_noconv() const throw() { return false; }
109 
110     // UTF-8 isn't really stateful since we rewind on partial conversions
do_unshiftutf8_codecvt_facet_wchar_t111     virtual std::codecvt_base::result do_unshift(
112         std::mbstate_t&,
113         char * from,
114         char * /* to */,
115         char * & next
116     ) const{
117         next = from;
118         return ok;
119     }
120 
do_encodingutf8_codecvt_facet_wchar_t121     virtual int do_encoding() const throw() {
122         const int variable_byte_external_encoding=0;
123         return variable_byte_external_encoding;
124     }
125 
126     // How many char objects can I process to get <= max_limit
127     // wchar_t objects?
128     virtual int do_length(
129         BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &,
130         const char * from,
131         const char * from_end,
132         std::size_t max_limit
133     ) const throw();
134 
135     // Largest possible value do_length(state,from,from_end,1) could return.
do_max_lengthutf8_codecvt_facet_wchar_t136     virtual int do_max_length() const throw () {
137         return 6; // largest UTF-8 encoding of a UCS-4 character
138     }
139 };
140 
141 #if 0 // not used - incorrect in any case
142 // Robert Ramey - use the above to make a code converter from multi-byte
143 // char strings to utf8 encoding
144 struct utf8_codecvt_facet_char : public utf8_codecvt_facet_wchar_t
145 {
146     typedef utf8_codecvt_facet_wchar_t base_class;
147 public:
148     explicit utf8_codecvt_facet_char(std::size_t no_locale_manage=0)
149         : base_class(no_locale_manage)
150     {}
151 protected:
152     virtual std::codecvt_base::result do_in(
153         std::mbstate_t & state,
154         const char * from,
155         const char * from_end,
156         const char * & from_next,
157         char * to,
158         char * to_end,
159         char * & to_next
160     ) const;
161 
162     virtual std::codecvt_base::result do_out(
163         std::mbstate_t & state,
164         const char * from,
165         const char * from_end,
166         const char*  & from_next,
167         char * to,
168         char * to_end,
169         char * & to_next
170     ) const;
171 
172     // How many char objects can I process to get <= max_limit
173     // char objects?
174     virtual int do_length(
175         const std::mbstate_t&,
176         const char * from,
177         const char * from_end,
178         std::size_t max_limit
179     ) const;
180 };
181 #endif
182 
// Primary template: deliberately empty.  It carries no members of its own,
// so an <Internal, External> pairing without a dedicated specialization
// yields a plain empty struct.
template<typename Internal, typename External>
struct utf8_codecvt_facet
{
};
186 
187 template<>
188 struct utf8_codecvt_facet<wchar_t, char>
189     : public utf8_codecvt_facet_wchar_t
190 {};
191 
192 #if 0
193 template<>
194 struct utf8_codecvt_facet<char, char>
195     : public utf8_codecvt_facet_char
196 {};
197 #endif
198 
199 #endif // BOOST_UTF8_CODECVT_FACET_HPP
200 
201