// Copyright 2006 Nemanja Trifunovic

/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/


#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731

#include <iterator>

namespace utf8
{
    // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
    // You may need to change them to match your system.
    // These typedefs have the same names as ones from cstdint, or boost/cstdint
    typedef unsigned char   uint8_t;
    typedef unsigned short  uint16_t;
    typedef unsigned int    uint32_t;

// Helper code - not intended to be directly called by the library users.
// May be changed at any time
namespace internal
{
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

    // Maximum valid value for a Unicode code point
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;

    // Returns the low 8 bits of oc as an unsigned octet. Used so that
    // iterators over (possibly signed) char yield values in 0x00 - 0xff.
    template<typename octet_type>
    inline uint8_t mask8(octet_type oc)
    {
        return static_cast<uint8_t>(0xff & oc);
    }

    // Returns the low 16 bits of oc as an unsigned 16-bit value.
    template<typename u16_type>
    inline uint16_t mask16(u16_type oc)
    {
        return static_cast<uint16_t>(0xffff & oc);
    }

    // True when oc has the bit pattern 10xxxxxx, i.e. is a UTF-8 trail octet.
    template<typename octet_type>
    inline bool is_trail(octet_type oc)
    {
        return ((mask8(oc) >> 6) == 0x2);
    }

    // True when cp lies in the leading (high) surrogate range.
    template <typename u16>
    inline bool is_lead_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
    }

    // True when cp lies in the trailing (low) surrogate range.
    template <typename u16>
    inline bool is_trail_surrogate(u16 cp)
    {
        return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    // True when cp is any surrogate (leading or trailing).
    template <typename u16>
    inline bool is_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    // True when cp does not exceed CODE_POINT_MAX and is not a surrogate.
    template <typename u32>
    inline bool is_code_point_valid(u32 cp)
    {
        return (cp <= CODE_POINT_MAX && !is_surrogate(cp));
    }

    // Returns the length (1 - 4) of the sequence announced by the lead octet
    // *lead_it, or 0 when that octet cannot start a sequence.
    // NOTE: dereferences lead_it unconditionally; the caller must make sure
    // it is dereferenceable.
    template <typename octet_iterator>
    inline typename std::iterator_traits<octet_iterator>::difference_type
    sequence_length(octet_iterator lead_it)
    {
        uint8_t lead = mask8(*lead_it);
        if (lead < 0x80)                // 0xxxxxxx
            return 1;
        else if ((lead >> 5) == 0x6)    // 110xxxxx
            return 2;
        else if ((lead >> 4) == 0xe)    // 1110xxxx
            return 3;
        else if ((lead >> 3) == 0x1e)   // 11110xxx
            return 4;
        else
            return 0;
    }

    // True when cp was decoded from more octets (length) than the shortest
    // encoding of cp requires.
    template <typename octet_difference_type>
    inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
    {
        if (cp < 0x80) {
            if (length != 1)
                return true;
        }
        else if (cp < 0x800) {
            if (length != 2)
                return true;
        }
        else if (cp < 0x10000) {
            if (length != 3)
                return true;
        }

        return false;
    }

    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    /// get_sequence_x functions decode utf-8 sequences of the length x
    //
    // Shared behaviour of get_sequence_1 .. get_sequence_4:
    //  - NOT_ENOUGH_ROOM is returned when `end` is reached before the
    //    sequence is complete, INCOMPLETE_SEQUENCE when an expected trail
    //    octet is not a trail octet.
    //  - On success `it` is left ON the last octet of the sequence (not past
    //    it) and the decoded value is stored through code_point if non-null.
    //  - On failure `it` may already have been advanced; restoring it is the
    //    caller's job.

    template <typename octet_iterator>
    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        if (it != end) {
            if (code_point)
                *code_point = mask8(*it);
            return UTF8_OK;
        }
        return NOT_ENOUGH_ROOM;
    }

    template <typename octet_iterator>
    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        utf_error ret_code = NOT_ENOUGH_ROOM;

        if (it != end) {
            uint32_t cp = mask8(*it);
            if (++it != end) {
                if (is_trail(*it)) {
                    cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);

                    if (code_point)
                        *code_point = cp;
                    ret_code = UTF8_OK;
                }
                else
                    ret_code = INCOMPLETE_SEQUENCE;
            }
            else
                ret_code = NOT_ENOUGH_ROOM;
        }

        return ret_code;
    }

    template <typename octet_iterator>
    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        utf_error ret_code = NOT_ENOUGH_ROOM;

        if (it != end) {
            uint32_t cp = mask8(*it);
            if (++it != end) {
                if (is_trail(*it)) {
                    cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);
                    if (++it != end) {
                        if (is_trail(*it)) {
                            cp += (*it) & 0x3f;

                            if (code_point)
                                *code_point = cp;
                            ret_code = UTF8_OK;
                        }
                        else
                            ret_code = INCOMPLETE_SEQUENCE;
                    }
                    else
                        ret_code = NOT_ENOUGH_ROOM;
                }
                else
                    ret_code = INCOMPLETE_SEQUENCE;
            }
            else
                ret_code = NOT_ENOUGH_ROOM;
        }

        return ret_code;
    }

    template <typename octet_iterator>
    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        utf_error ret_code = NOT_ENOUGH_ROOM;

        if (it != end) {
            uint32_t cp = mask8(*it);
            if (++it != end) {
                if (is_trail(*it)) {
                    cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);
                    if (++it != end) {
                        if (is_trail(*it)) {
                            cp += (mask8(*it) << 6) & 0xfff;
                            if (++it != end) {
                                if (is_trail(*it)) {
                                    cp += (*it) & 0x3f;

                                    if (code_point)
                                        *code_point = cp;
                                    ret_code = UTF8_OK;
                                }
                                else
                                    ret_code = INCOMPLETE_SEQUENCE;
                            }
                            else
                                ret_code = NOT_ENOUGH_ROOM;
                        }
                        else
                            ret_code = INCOMPLETE_SEQUENCE;
                    }
                    else
                        ret_code = NOT_ENOUGH_ROOM;
                }
                else
                    ret_code = INCOMPLETE_SEQUENCE;
            }
            else
                ret_code = NOT_ENOUGH_ROOM;
        }

        return ret_code;
    }

    // Decodes and validates the single UTF-8 sequence starting at `it`.
    //
    //  it         - in/out: advanced past the sequence on success, left at
    //               its original position on any failure.
    //  end        - end of the octet range.
    //  code_point - if non-null, receives the decoded code point on success.
    //
    // Returns UTF8_OK on success; otherwise NOT_ENOUGH_ROOM (range is empty
    // or ends inside the sequence), INVALID_LEAD, INCOMPLETE_SEQUENCE,
    // OVERLONG_SEQUENCE or INVALID_CODE_POINT.
    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        // An empty range has no lead octet to inspect. This check must come
        // first: sequence_length() below dereferences `it` unconditionally,
        // which would read past the end of the range.
        if (it == end)
            return NOT_ENOUGH_ROOM;

        // Save the original value of it so we can go back in case of failure
        // Of course, it does not make much sense with i.e. stream iterators
        octet_iterator original_it = it;

        uint32_t cp = 0;
        // Determine the sequence length based on the lead octet
        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
        octet_difference_type length = sequence_length(it);
        if (length == 0)
            return INVALID_LEAD;

        // Now that we have a valid sequence length, get trail octets and calculate the code point
        utf_error err = UTF8_OK;
        switch (length) {
            case 1:
                err = get_sequence_1(it, end, &cp);
                break;
            case 2:
                err = get_sequence_2(it, end, &cp);
                break;
            case 3:
                err = get_sequence_3(it, end, &cp);
                break;
            case 4:
                err = get_sequence_4(it, end, &cp);
                break;
        }

        if (err == UTF8_OK) {
            // Decoding succeeded. Now, security checks...
            if (is_code_point_valid(cp)) {
                if (!is_overlong_sequence(cp, length)){
                    // Passed! Return here.
                    if (code_point)
                        *code_point = cp;
                    // get_sequence_x leaves it on the last octet; step past it.
                    ++it;
                    return UTF8_OK;
                }
                else
                    err = OVERLONG_SEQUENCE;
            }
            else
                err = INVALID_CODE_POINT;
        }

        // Failure branch - restore the original value of the iterator
        it = original_it;
        return err;
    }

    // Same as the three-argument overload, discarding the decoded code point.
    template <typename octet_iterator>
    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
        return validate_next(it, end, 0);
    }

} // namespace internal

    /// The library API - functions intended to be called by the users

    // Byte order mark
    const uint8_t bom[] = {0xef, 0xbb, 0xbf};

    // Returns an iterator to the first octet of the first invalid UTF-8
    // sequence in [start, end), or end when the whole range validates.
    template <typename octet_iterator>
    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
    {
        octet_iterator result = start;
        while (result != end) {
            // On failure validate_next leaves result at the start of the
            // offending sequence, which is exactly what we return.
            internal::utf_error err_code = internal::validate_next(result, end);
            if (err_code != internal::UTF8_OK)
                return result;
        }
        return result;
    }

    // True when [start, end) consists solely of valid UTF-8 sequences
    // (an empty range is valid).
    template <typename octet_iterator>
    inline bool is_valid(octet_iterator start, octet_iterator end)
    {
        return (find_invalid(start, end) == end);
    }

    // True when [it, end) begins with the three octets of the UTF-8 byte
    // order mark. Never reads at or beyond end.
    template <typename octet_iterator>
    inline bool starts_with_bom (octet_iterator it, octet_iterator end)
    {
        return (
            ((it != end) && (internal::mask8(*it++)) == bom[0]) &&
            ((it != end) && (internal::mask8(*it++)) == bom[1]) &&
            ((it != end) && (internal::mask8(*it))   == bom[2])
           );
    }

    //Deprecated in release 2.3
    // Compares up to three octets starting at it with the byte order mark.
    // NOTE: takes no end iterator, so it performs no bounds checking; the
    // caller must guarantee the octets are readable. Prefer starts_with_bom.
    template <typename octet_iterator>
    inline bool is_bom (octet_iterator it)
    {
        return (
            (internal::mask8(*it++)) == bom[0] &&
            (internal::mask8(*it++)) == bom[1] &&
            (internal::mask8(*it))   == bom[2]
           );
    }
} // namespace utf8

#endif // header guard