1/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 2// utf8_codecvt_facet.ipp 3 4// Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu) 5// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). 6// Use, modification and distribution is subject to the Boost Software 7// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at 8// http://www.boost.org/LICENSE_1_0.txt) 9 10// Please see the comments in <boost/detail/utf8_codecvt_facet.hpp> to 11// learn how this file should be used. 12 13#include <boost/detail/utf8_codecvt_facet.hpp> 14 15#include <cstdlib> // for multi-byte converson routines 16#include <cassert> 17 18#include <boost/limits.hpp> 19#include <boost/config.hpp> 20 21// If we don't have wstring, then Unicode support 22// is not available anyway, so we don't need to even 23// compiler this file. This also fixes the problem 24// with mingw, which can compile this file, but will 25// generate link error when building DLL. 26#ifndef BOOST_NO_STD_WSTRING 27 28BOOST_UTF8_BEGIN_NAMESPACE 29 30/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 31// implementation for wchar_t 32 33// Translate incoming UTF-8 into UCS-4 34std::codecvt_base::result utf8_codecvt_facet::do_in( 35 std::mbstate_t& /*state*/, 36 const char * from, 37 const char * from_end, 38 const char * & from_next, 39 wchar_t * to, 40 wchar_t * to_end, 41 wchar_t * & to_next 42) const { 43 // Basic algorithm: The first octet determines how many 44 // octets total make up the UCS-4 character. The remaining 45 // "continuing octets" all begin with "10". To convert, subtract 46 // the amount that specifies the number of octets from the first 47 // octet. Subtract 0x80 (1000 0000) from each continuing octet, 48 // then mash the whole lot together. Note that each continuing 49 // octet only uses 6 bits as unique values, so only shift by 50 // multiples of 6 to combine. 51 while (from != from_end && to != to_end) { 52 53 // Error checking on the first octet 54 if (invalid_leading_octet(*from)){ 55 from_next = from; 56 to_next = to; 57 return std::codecvt_base::error; 58 } 59 60 // The first octet is adjusted by a value dependent upon 61 // the number of "continuing octets" encoding the character 62 const int cont_octet_count = get_cont_octet_count(*from); 63 const wchar_t octet1_modifier_table[] = { 64 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc 65 }; 66 67 // The unsigned char conversion is necessary in case char is 68 // signed (I learned this the hard way) 69 wchar_t ucs_result = 70 (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count]; 71 72 // Invariants : 73 // 1) At the start of the loop, 'i' continuing characters have been 74 // processed 75 // 2) *from points to the next continuing character to be processed. 76 int i = 0; 77 while(i != cont_octet_count && from != from_end) { 78 79 // Error checking on continuing characters 80 if (invalid_continuing_octet(*from)) { 81 from_next = from; 82 to_next = to; 83 return std::codecvt_base::error; 84 } 85 86 ucs_result *= (1 << 6); 87 88 // each continuing character has an extra (10xxxxxx)b attached to 89 // it that must be removed. 90 ucs_result += (unsigned char)(*from++) - 0x80; 91 ++i; 92 } 93 94 // If the buffer ends with an incomplete unicode character... 95 if (from == from_end && i != cont_octet_count) { 96 // rewind "from" to before the current character translation 97 from_next = from - (i+1); 98 to_next = to; 99 return std::codecvt_base::partial; 100 } 101 *to++ = ucs_result; 102 } 103 from_next = from; 104 to_next = to; 105 106 // Were we done converting or did we run out of destination space? 107 if(from == from_end) return std::codecvt_base::ok; 108 else return std::codecvt_base::partial; 109} 110 111std::codecvt_base::result utf8_codecvt_facet::do_out( 112 std::mbstate_t& /*state*/, 113 const wchar_t * from, 114 const wchar_t * from_end, 115 const wchar_t * & from_next, 116 char * to, 117 char * to_end, 118 char * & to_next 119) const 120{ 121 // RG - consider merging this table with the other one 122 const wchar_t octet1_modifier_table[] = { 123 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc 124 }; 125 126 wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)(); 127 while (from != from_end && to != to_end) { 128 129 // Check for invalid UCS-4 character 130 if (*from > max_wchar) { 131 from_next = from; 132 to_next = to; 133 return std::codecvt_base::error; 134 } 135 136 int cont_octet_count = get_cont_octet_out_count(*from); 137 138 // RG - comment this formula better 139 int shift_exponent = (cont_octet_count) * 6; 140 141 // Process the first character 142 *to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] + 143 (unsigned char)(*from / (1 << shift_exponent))); 144 145 // Process the continuation characters 146 // Invariants: At the start of the loop: 147 // 1) 'i' continuing octets have been generated 148 // 2) '*to' points to the next location to place an octet 149 // 3) shift_exponent is 6 more than needed for the next octet 150 int i = 0; 151 while (i != cont_octet_count && to != to_end) { 152 shift_exponent -= 6; 153 *to++ = static_cast<char>(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6))); 154 ++i; 155 } 156 // If we filled up the out buffer before encoding the character 157 if(to == to_end && i != cont_octet_count) { 158 from_next = from; 159 to_next = to - (i+1); 160 return std::codecvt_base::partial; 161 } 162 ++from; 163 } 164 from_next = from; 165 to_next = to; 166 // Were we done or did we run out of destination space 167 if(from == from_end) return std::codecvt_base::ok; 168 else return std::codecvt_base::partial; 169} 170 171// How many char objects can I process to get <= max_limit 172// wchar_t objects? 173int utf8_codecvt_facet::do_length( 174 const std::mbstate_t &, 175 const char * from, 176 const char * from_end, 177 std::size_t max_limit 178) const 179#if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) 180 throw() 181#endif 182{ 183 // RG - this code is confusing! I need a better way to express it. 184 // and test cases. 185 186 // Invariants: 187 // 1) last_octet_count has the size of the last measured character 188 // 2) char_count holds the number of characters shown to fit 189 // within the bounds so far (no greater than max_limit) 190 // 3) from_next points to the octet 'last_octet_count' before the 191 // last measured character. 192 int last_octet_count=0; 193 std::size_t char_count = 0; 194 const char* from_next = from; 195 // Use "<" because the buffer may represent incomplete characters 196 while (from_next+last_octet_count <= from_end && char_count <= max_limit) { 197 from_next += last_octet_count; 198 last_octet_count = (get_octet_count(*from_next)); 199 ++char_count; 200 } 201 return static_cast<int>(from_next-from_end); 202} 203 204unsigned int utf8_codecvt_facet::get_octet_count( 205 unsigned char lead_octet 206){ 207 // if the 0-bit (MSB) is 0, then 1 character 208 if (lead_octet <= 0x7f) return 1; 209 210 // Otherwise the count number of consecutive 1 bits starting at MSB 211// assert(0xc0 <= lead_octet && lead_octet <= 0xfd); 212 213 if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2; 214 else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3; 215 else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4; 216 else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5; 217 else return 6; 218} 219 220namespace detail { 221 222template<std::size_t s> 223int get_cont_octet_out_count_impl(wchar_t word){ 224 if (word < 0x80) { 225 return 0; 226 } 227 if (word < 0x800) { 228 return 1; 229 } 230 return 2; 231} 232 233template<> 234int get_cont_octet_out_count_impl<4>(wchar_t word){ 235 if (word < 0x80) { 236 return 0; 237 } 238 if (word < 0x800) { 239 return 1; 240 } 241 242 // Note that the following code will generate warnings on some platforms 243 // where wchar_t is defined as UCS2. The warnings are superfluous as the 244 // specialization is never instantitiated with such compilers, but this 245 // can cause problems if warnings are being treated as errors, so we guard 246 // against that. Including <boost/detail/utf8_codecvt_facet.hpp> as we do 247 // should be enough to get WCHAR_MAX defined. 248#if !defined(WCHAR_MAX) 249# error WCHAR_MAX not defined! 250#endif 251 // cope with VC++ 7.1 or earlier having invalid WCHAR_MAX 252#if defined(_MSC_VER) && _MSC_VER <= 1310 // 7.1 or earlier 253 return 2; 254#elif WCHAR_MAX > 0x10000 255 256 if (word < 0x10000) { 257 return 2; 258 } 259 if (word < 0x200000) { 260 return 3; 261 } 262 if (word < 0x4000000) { 263 return 4; 264 } 265 return 5; 266 267#else 268 return 2; 269#endif 270} 271 272} // namespace detail 273 274// How many "continuing octets" will be needed for this word 275// == total octets - 1. 276int utf8_codecvt_facet::get_cont_octet_out_count( 277 wchar_t word 278) const { 279 return detail::get_cont_octet_out_count_impl<sizeof(wchar_t)>(word); 280} 281BOOST_UTF8_END_NAMESPACE 282 283#endif 284