1 #ifndef BOOST_UTF8_CODECVT_FACET_HPP 2 #define BOOST_UTF8_CODECVT_FACET_HPP 3 4 #include <boost/iostreams/detail/config/wide_streams.hpp> 5 #ifdef BOOST_IOSTREAMS_NO_WIDE_STREAMS 6 # error wide streams not supported on this platform 7 #endif 8 9 // MS compatible compilers support #pragma once 10 #if defined(_MSC_VER) && (_MSC_VER >= 1020) 11 # pragma once 12 #endif 13 14 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 15 // utf8_codecvt_facet.hpp 16 17 // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu) 18 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). 19 // Distributed under the Boost Software License, Version 1.0. (See accompany- 20 // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 21 22 // Note:(Robert Ramey). I have made the following alterations in the original 23 // code. 24 // a) Rendered utf8_codecvt<wchar_t, char> with using templates 25 // b) Move longer functions outside class definition to prevent inlining 26 // and make code smaller 27 // c) added on a derived class to permit translation to/from current 28 // locale to utf8 29 30 // See http://www.boost.org for updates, documentation, and revision history. 31 32 // archives stored as text - note these ar templated on the basic 33 // stream templates to accommodate wide (and other?) kind of characters 34 // 35 // note the fact that on libraries without wide characters, ostream is 36 // is not a specialization of basic_ostream which in fact is not defined 37 // in such cases. So we can't use basic_ostream<OStream::char_type> but rather 38 // use two template parameters 39 // 40 // utf8_codecvt_facet 41 // This is an implementation of a std::codecvt facet for translating 42 // from UTF-8 externally to UCS-4. Note that this is not tied to 43 // any specific types in order to allow customization on platforms 44 // where wchar_t is not big enough. 45 // 46 // NOTES: The current implementation jumps through some unpleasant hoops in 47 // order to deal with signed character types. As a std::codecvt_base::result, 48 // it is necessary for the ExternType to be convertible to unsigned char. 49 // I chose not to tie the extern_type explicitly to char. But if any combination 50 // of types other than <wchar_t,char_t> is used, then std::codecvt must be 51 // specialized on those types for this to work. 52 53 #include <locale> 54 #include <cstddef> // size_t 55 #include <cwchar> // mbstate_t 56 #include <boost/integer_traits.hpp> 57 #include <boost/iostreams/detail/config/wide_streams.hpp> 58 #include <boost/iostreams/detail/codecvt_helper.hpp> 59 60 // maximum lenght of a multibyte string 61 #define MB_LENGTH_MAX 8 62 63 struct utf8_codecvt_facet_wchar_t 64 : public boost::iostreams::detail::codecvt_helper<wchar_t, char, std::mbstate_t> 65 { 66 public: utf8_codecvt_facet_wchar_tutf8_codecvt_facet_wchar_t67 explicit utf8_codecvt_facet_wchar_t(std::size_t no_locale_manage = 0) 68 : boost::iostreams::detail::codecvt_helper<wchar_t, char, std::mbstate_t> 69 (no_locale_manage) 70 { } 71 protected: 72 virtual std::codecvt_base::result do_in( 73 std::mbstate_t& state, 74 const char * from, 75 const char * from_end, 76 const char * & from_next, 77 wchar_t * to, 78 wchar_t * to_end, 79 wchar_t*& to_next 80 ) const; 81 82 virtual std::codecvt_base::result do_out( 83 std::mbstate_t & state, const wchar_t * from, 84 const wchar_t * from_end, const wchar_t* & from_next, 85 char * to, char * to_end, char * & to_next 86 ) const; 87 invalid_continuing_octetutf8_codecvt_facet_wchar_t88 bool invalid_continuing_octet(unsigned char octet_1) const { 89 return (octet_1 < 0x80|| 0xbf< octet_1); 90 } 91 invalid_leading_octetutf8_codecvt_facet_wchar_t92 bool invalid_leading_octet(unsigned char octet_1) const { 93 return (0x7f < octet_1 && octet_1 < 0xc0) || 94 (octet_1 > 0xfd); 95 } 96 97 // continuing octets = octets except for the leading octet get_cont_octet_countutf8_codecvt_facet_wchar_t98 static unsigned int get_cont_octet_count(unsigned char lead_octet) { 99 return get_octet_count(lead_octet) - 1; 100 } 101 102 static unsigned int get_octet_count(unsigned char lead_octet); 103 104 // How many "continuing octets" will be needed for this word 105 // == total octets - 1. 106 int get_cont_octet_out_count(wchar_t word) const ; 107 do_always_noconvutf8_codecvt_facet_wchar_t108 virtual bool do_always_noconv() const throw() { return false; } 109 110 // UTF-8 isn't really stateful since we rewind on partial conversions do_unshiftutf8_codecvt_facet_wchar_t111 virtual std::codecvt_base::result do_unshift( 112 std::mbstate_t&, 113 char * from, 114 char * /* to */, 115 char * & next 116 ) const{ 117 next = from; 118 return ok; 119 } 120 do_encodingutf8_codecvt_facet_wchar_t121 virtual int do_encoding() const throw() { 122 const int variable_byte_external_encoding=0; 123 return variable_byte_external_encoding; 124 } 125 126 // How many char objects can I process to get <= max_limit 127 // wchar_t objects? 128 virtual int do_length( 129 BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &, 130 const char * from, 131 const char * from_end, 132 std::size_t max_limit 133 ) const throw(); 134 135 // Largest possible value do_length(state,from,from_end,1) could return. do_max_lengthutf8_codecvt_facet_wchar_t136 virtual int do_max_length() const throw () { 137 return 6; // largest UTF-8 encoding of a UCS-4 character 138 } 139 }; 140 141 #if 0 // not used - incorrect in any case 142 // Robert Ramey - use the above to make a code converter from multi-byte 143 // char strings to utf8 encoding 144 struct utf8_codecvt_facet_char : public utf8_codecvt_facet_wchar_t 145 { 146 typedef utf8_codecvt_facet_wchar_t base_class; 147 public: 148 explicit utf8_codecvt_facet_char(std::size_t no_locale_manage=0) 149 : base_class(no_locale_manage) 150 {} 151 protected: 152 virtual std::codecvt_base::result do_in( 153 std::mbstate_t & state, 154 const char * from, 155 const char * from_end, 156 const char * & from_next, 157 char * to, 158 char * to_end, 159 char * & to_next 160 ) const; 161 162 virtual std::codecvt_base::result do_out( 163 std::mbstate_t & state, 164 const char * from, 165 const char * from_end, 166 const char* & from_next, 167 char * to, 168 char * to_end, 169 char * & to_next 170 ) const; 171 172 // How many char objects can I process to get <= max_limit 173 // char objects? 174 virtual int do_length( 175 const std::mbstate_t&, 176 const char * from, 177 const char * from_end, 178 std::size_t max_limit 179 ) const; 180 }; 181 #endif 182 183 template<class Internal, class External> 184 struct utf8_codecvt_facet 185 {}; 186 187 template<> 188 struct utf8_codecvt_facet<wchar_t, char> 189 : public utf8_codecvt_facet_wchar_t 190 {}; 191 192 #if 0 193 template<> 194 struct utf8_codecvt_facet<char, char> 195 : public utf8_codecvt_facet_char 196 {}; 197 #endif 198 199 #endif // BOOST_UTF8_CODECVT_FACET_HPP 200 201