1 // 2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) 3 // 4 // Distributed under the Boost Software License, Version 1.0. (See 5 // accompanying file LICENSE_1_0.txt or copy at 6 // http://www.boost.org/LICENSE_1_0.txt) 7 // 8 #ifndef BOOST_LOCALE_UTF_HPP_INCLUDED 9 #define BOOST_LOCALE_UTF_HPP_INCLUDED 10 11 #include <boost/cstdint.hpp> 12 13 namespace boost { 14 namespace locale { 15 /// 16 /// \brief Namespace that holds basic operations on UTF encoded sequences 17 /// 18 /// All functions defined in this namespace do not require linking with Boost.Locale library 19 /// 20 namespace utf { 21 /// \cond INTERNAL 22 #ifdef __GNUC__ 23 # define BOOST_LOCALE_LIKELY(x) __builtin_expect((x),1) 24 # define BOOST_LOCALE_UNLIKELY(x) __builtin_expect((x),0) 25 #else 26 # define BOOST_LOCALE_LIKELY(x) (x) 27 # define BOOST_LOCALE_UNLIKELY(x) (x) 28 #endif 29 /// \endcond 30 31 /// 32 /// \brief The integral type that can hold a Unicode code point 33 /// 34 typedef uint32_t code_point; 35 36 /// 37 /// \brief Special constant that defines illegal code point 38 /// 39 static const code_point illegal = 0xFFFFFFFFu; 40 41 /// 42 /// \brief Special constant that defines incomplete code point 43 /// 44 static const code_point incomplete = 0xFFFFFFFEu; 45 46 /// 47 /// \brief the function checks if \a v is a valid code point 48 /// is_valid_codepoint(code_point v)49 inline bool is_valid_codepoint(code_point v) 50 { 51 if(v>0x10FFFF) 52 return false; 53 if(0xD800 <=v && v<= 0xDFFF) // surragates 54 return false; 55 return true; 56 } 57 58 #ifdef BOOST_LOCALE_DOXYGEN 59 /// 60 /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points 61 /// 62 template<typename CharType,int size=sizeof(CharType)> 63 struct utf_traits { 64 /// 65 /// The type of the character 66 /// 67 typedef CharType char_type; 68 /// 69 /// Read one code point from the range [p,e) and return it. 70 /// 71 /// - If the sequence that was read is incomplete sequence returns \ref incomplete, 72 /// - If illegal sequence detected returns \ref illegal 73 /// 74 /// Requirements 75 /// 76 /// - Iterator is valid input iterator 77 /// 78 /// Postconditions 79 /// 80 /// - p points to the last consumed character 81 /// 82 template<typename Iterator> 83 static code_point decode(Iterator &p,Iterator e); 84 85 /// 86 /// Maximal width of valid sequence in the code units: 87 /// 88 /// - UTF-8 - 4 89 /// - UTF-16 - 2 90 /// - UTF-32 - 1 91 /// 92 static const int max_width; 93 /// 94 /// The width of specific code point in the code units. 95 /// 96 /// Requirement: value is a valid Unicode code point 97 /// Returns value in range [1..max_width] 98 /// 99 static int width(code_point value); 100 101 /// 102 /// Get the size of the trail part of variable length encoded sequence. 103 /// 104 /// Returns -1 if C is not valid lead character 105 /// 106 static int trail_length(char_type c); 107 /// 108 /// Returns true if c is trail code unit, always false for UTF-32 109 /// 110 static bool is_trail(char_type c); 111 /// 112 /// Returns true if c is lead code unit, always true of UTF-32 113 /// 114 static bool is_lead(char_type c); 115 116 /// 117 /// Convert valid Unicode code point \a value to the UTF sequence. 118 /// 119 /// Requirements: 120 /// 121 /// - \a value is valid code point 122 /// - \a out is an output iterator should be able to accept at least width(value) units 123 /// 124 /// Returns the iterator past the last written code unit. 125 /// 126 template<typename Iterator> 127 static Iterator encode(code_point value,Iterator out); 128 /// 129 /// Decodes valid UTF sequence that is pointed by p into code point. 130 /// 131 /// If the sequence is invalid or points to end the behavior is undefined 132 /// 133 template<typename Iterator> 134 static code_point decode_valid(Iterator &p); 135 }; 136 137 #else 138 139 template<typename CharType,int size=sizeof(CharType)> 140 struct utf_traits; 141 142 template<typename CharType> 143 struct utf_traits<CharType,1> { 144 145 typedef CharType char_type; 146 trail_lengthboost::locale::utf::utf_traits147 static int trail_length(char_type ci) 148 { 149 unsigned char c = ci; 150 if(c < 128) 151 return 0; 152 if(BOOST_LOCALE_UNLIKELY(c < 194)) 153 return -1; 154 if(c < 224) 155 return 1; 156 if(c < 240) 157 return 2; 158 if(BOOST_LOCALE_LIKELY(c <=244)) 159 return 3; 160 return -1; 161 } 162 163 static const int max_width = 4; 164 widthboost::locale::utf::utf_traits165 static int width(code_point value) 166 { 167 if(value <=0x7F) { 168 return 1; 169 } 170 else if(value <=0x7FF) { 171 return 2; 172 } 173 else if(BOOST_LOCALE_LIKELY(value <=0xFFFF)) { 174 return 3; 175 } 176 else { 177 return 4; 178 } 179 } 180 is_trailboost::locale::utf::utf_traits181 static bool is_trail(char_type ci) 182 { 183 unsigned char c=ci; 184 return (c & 0xC0)==0x80; 185 } 186 is_leadboost::locale::utf::utf_traits187 static bool is_lead(char_type ci) 188 { 189 return !is_trail(ci); 190 } 191 192 template<typename Iterator> decodeboost::locale::utf::utf_traits193 static code_point decode(Iterator &p,Iterator e) 194 { 195 if(BOOST_LOCALE_UNLIKELY(p==e)) 196 return incomplete; 197 198 unsigned char lead = *p++; 199 200 // First byte is fully validated here 201 int trail_size = trail_length(lead); 202 203 if(BOOST_LOCALE_UNLIKELY(trail_size < 0)) 204 return illegal; 205 206 // 207 // Ok as only ASCII may be of size = 0 208 // also optimize for ASCII text 209 // 210 if(trail_size == 0) 211 return lead; 212 213 code_point c = lead & ((1<<(6-trail_size))-1); 214 215 // Read the rest 216 unsigned char tmp; 217 switch(trail_size) { 218 case 3: 219 if(BOOST_LOCALE_UNLIKELY(p==e)) 220 return incomplete; 221 tmp = *p++; 222 if (!is_trail(tmp)) 223 return illegal; 224 c = (c << 6) | ( tmp & 0x3F); 225 case 2: 226 if(BOOST_LOCALE_UNLIKELY(p==e)) 227 return incomplete; 228 tmp = *p++; 229 if (!is_trail(tmp)) 230 return illegal; 231 c = (c << 6) | ( tmp & 0x3F); 232 case 1: 233 if(BOOST_LOCALE_UNLIKELY(p==e)) 234 return incomplete; 235 tmp = *p++; 236 if (!is_trail(tmp)) 237 return illegal; 238 c = (c << 6) | ( tmp & 0x3F); 239 } 240 241 // Check code point validity: no surrogates and 242 // valid range 243 if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c))) 244 return illegal; 245 246 // make sure it is the most compact representation 247 if(BOOST_LOCALE_UNLIKELY(width(c)!=trail_size + 1)) 248 return illegal; 249 250 return c; 251 252 } 253 254 template<typename Iterator> decode_validboost::locale::utf::utf_traits255 static code_point decode_valid(Iterator &p) 256 { 257 unsigned char lead = *p++; 258 if(lead < 192) 259 return lead; 260 261 int trail_size; 262 263 if(lead < 224) 264 trail_size = 1; 265 else if(BOOST_LOCALE_LIKELY(lead < 240)) // non-BMP rare 266 trail_size = 2; 267 else 268 trail_size = 3; 269 270 code_point c = lead & ((1<<(6-trail_size))-1); 271 272 switch(trail_size) { 273 case 3: 274 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F); 275 case 2: 276 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F); 277 case 1: 278 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F); 279 } 280 281 return c; 282 } 283 284 285 286 template<typename Iterator> encodeboost::locale::utf::utf_traits287 static Iterator encode(code_point value,Iterator out) 288 { 289 if(value <= 0x7F) { 290 *out++ = static_cast<char_type>(value); 291 } 292 else if(value <= 0x7FF) { 293 *out++ = static_cast<char_type>((value >> 6) | 0xC0); 294 *out++ = static_cast<char_type>((value & 0x3F) | 0x80); 295 } 296 else if(BOOST_LOCALE_LIKELY(value <= 0xFFFF)) { 297 *out++ = static_cast<char_type>((value >> 12) | 0xE0); 298 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80); 299 *out++ = static_cast<char_type>((value & 0x3F) | 0x80); 300 } 301 else { 302 *out++ = static_cast<char_type>((value >> 18) | 0xF0); 303 *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80); 304 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80); 305 *out++ = static_cast<char_type>((value & 0x3F) | 0x80); 306 } 307 return out; 308 } 309 }; // utf8 310 311 template<typename CharType> 312 struct utf_traits<CharType,2> { 313 typedef CharType char_type; 314 315 // See RFC 2781 is_first_surrogateboost::locale::utf::utf_traits316 static bool is_first_surrogate(uint16_t x) 317 { 318 return 0xD800 <=x && x<= 0xDBFF; 319 } is_second_surrogateboost::locale::utf::utf_traits320 static bool is_second_surrogate(uint16_t x) 321 { 322 return 0xDC00 <=x && x<= 0xDFFF; 323 } combine_surrogateboost::locale::utf::utf_traits324 static code_point combine_surrogate(uint16_t w1,uint16_t w2) 325 { 326 return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000; 327 } trail_lengthboost::locale::utf::utf_traits328 static int trail_length(char_type c) 329 { 330 if(is_first_surrogate(c)) 331 return 1; 332 if(is_second_surrogate(c)) 333 return -1; 334 return 0; 335 } 336 /// 337 /// Returns true if c is trail code unit, always false for UTF-32 338 /// is_trailboost::locale::utf::utf_traits339 static bool is_trail(char_type c) 340 { 341 return is_second_surrogate(c); 342 } 343 /// 344 /// Returns true if c is lead code unit, always true of UTF-32 345 /// is_leadboost::locale::utf::utf_traits346 static bool is_lead(char_type c) 347 { 348 return !is_second_surrogate(c); 349 } 350 351 template<typename It> decodeboost::locale::utf::utf_traits352 static code_point decode(It ¤t,It last) 353 { 354 if(BOOST_LOCALE_UNLIKELY(current == last)) 355 return incomplete; 356 uint16_t w1=*current++; 357 if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) { 358 return w1; 359 } 360 if(w1 > 0xDBFF) 361 return illegal; 362 if(current==last) 363 return incomplete; 364 uint16_t w2=*current++; 365 if(w2 < 0xDC00 || 0xDFFF < w2) 366 return illegal; 367 return combine_surrogate(w1,w2); 368 } 369 template<typename It> decode_validboost::locale::utf::utf_traits370 static code_point decode_valid(It ¤t) 371 { 372 uint16_t w1=*current++; 373 if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) { 374 return w1; 375 } 376 uint16_t w2=*current++; 377 return combine_surrogate(w1,w2); 378 } 379 380 static const int max_width = 2; widthboost::locale::utf::utf_traits381 static int width(code_point u) 382 { 383 return u>=0x10000 ? 2 : 1; 384 } 385 template<typename It> encodeboost::locale::utf::utf_traits386 static It encode(code_point u,It out) 387 { 388 if(BOOST_LOCALE_LIKELY(u<=0xFFFF)) { 389 *out++ = static_cast<char_type>(u); 390 } 391 else { 392 u -= 0x10000; 393 *out++ = static_cast<char_type>(0xD800 | (u>>10)); 394 *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF)); 395 } 396 return out; 397 } 398 }; // utf16; 399 400 401 template<typename CharType> 402 struct utf_traits<CharType,4> { 403 typedef CharType char_type; trail_lengthboost::locale::utf::utf_traits404 static int trail_length(char_type c) 405 { 406 if(is_valid_codepoint(c)) 407 return 0; 408 return -1; 409 } is_trailboost::locale::utf::utf_traits410 static bool is_trail(char_type /*c*/) 411 { 412 return false; 413 } is_leadboost::locale::utf::utf_traits414 static bool is_lead(char_type /*c*/) 415 { 416 return true; 417 } 418 419 template<typename It> decode_validboost::locale::utf::utf_traits420 static code_point decode_valid(It ¤t) 421 { 422 return *current++; 423 } 424 425 template<typename It> decodeboost::locale::utf::utf_traits426 static code_point decode(It ¤t,It last) 427 { 428 if(BOOST_LOCALE_UNLIKELY(current == last)) 429 return boost::locale::utf::incomplete; 430 code_point c=*current++; 431 if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c))) 432 return boost::locale::utf::illegal; 433 return c; 434 } 435 static const int max_width = 1; widthboost::locale::utf::utf_traits436 static int width(code_point /*u*/) 437 { 438 return 1; 439 } 440 template<typename It> encodeboost::locale::utf::utf_traits441 static It encode(code_point u,It out) 442 { 443 *out++ = static_cast<char_type>(u); 444 return out; 445 } 446 447 }; // utf32 448 449 #endif 450 451 452 } // utf 453 } // locale 454 } // boost 455 456 457 #endif 458 459 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 460 461