1 // Copyright 2006 Nemanja Trifunovic 2 3 /* 4 Permission is hereby granted, free of charge, to any person or organization 5 obtaining a copy of the software and accompanying documentation covered by 6 this license (the "Software") to use, reproduce, display, distribute, 7 execute, and transmit the Software, and to prepare derivative works of the 8 Software, and to permit third-parties to whom the Software is furnished to 9 do so, all subject to the following: 10 11 The copyright notices in the Software and this entire statement, including 12 the above license grant, this restriction and the following disclaimer, 13 must be included in all copies of the Software, in whole or in part, and 14 all derivative works of the Software, unless such copies or derivative 15 works are solely in the form of machine-executable object code generated by 16 a source language processor. 17 18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 DEALINGS IN THE SOFTWARE. 25 */ 26 27 /** \file checked.h \brief Contains part of the utfcpp library. See 28 http://utfcpp.sourceforge.net for documentation. */ 29 30 31 #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 32 #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 33 34 #include "core.h" 35 #include <stdexcept> 36 37 namespace utf8 38 { 39 // Exceptions that may be thrown from the library functions. 40 class invalid_code_point : public std::exception { 41 uint32_t cp; 42 public: invalid_code_point(uint32_t cp)43 invalid_code_point(uint32_t cp) : cp(cp) {} what()44 const char* what() const noexcept override { return "Invalid code point"; } code_point()45 uint32_t code_point() const {return cp;} 46 }; 47 48 class invalid_utf8 : public std::exception { 49 uint8_t u8; 50 public: invalid_utf8(uint8_t u)51 invalid_utf8 (uint8_t u) : u8(u) {} what()52 const char* what() const noexcept override { return "Invalid UTF-8"; } utf8_octet()53 uint8_t utf8_octet() const {return u8;} 54 }; 55 56 class invalid_utf16 : public std::exception { 57 uint16_t u16; 58 public: invalid_utf16(uint16_t u)59 invalid_utf16 (uint16_t u) : u16(u) {} what()60 const char* what() const noexcept override { return "Invalid UTF-16"; } utf16_word()61 uint16_t utf16_word() const {return u16;} 62 }; 63 64 class not_enough_room : public std::exception { 65 public: what()66 const char* what() const noexcept override { return "Not enough space"; } 67 }; 68 69 /// The library API - functions intended to be called by the users 70 71 template <typename octet_iterator, typename output_iterator> replace_invalid(octet_iterator start,octet_iterator end,output_iterator out,uint32_t replacement)72 output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) 73 { 74 while (start != end) { 75 octet_iterator sequence_start = start; 76 internal::utf_error err_code = internal::validate_next(start, end); 77 switch (err_code) { 78 case internal::OK : 79 for (octet_iterator it = sequence_start; it != start; ++it) 80 *out++ = *it; 81 break; 82 case internal::NOT_ENOUGH_ROOM: 83 throw not_enough_room(); 84 case internal::INVALID_LEAD: 85 append (replacement, out); 86 ++start; 87 break; 88 case internal::INCOMPLETE_SEQUENCE: 89 case internal::OVERLONG_SEQUENCE: 90 case internal::INVALID_CODE_POINT: 91 append (replacement, out); 92 ++start; 93 // just one replacement mark for the sequence 94 while (internal::is_trail(*start) && start != end) 95 ++start; 96 break; 97 } 98 } 99 return out; 100 } 101 102 template <typename octet_iterator, typename output_iterator> replace_invalid(octet_iterator start,octet_iterator end,output_iterator out)103 inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) 104 { 105 static const uint32_t replacement_marker = internal::mask16(0xfffd); 106 return replace_invalid(start, end, out, replacement_marker); 107 } 108 109 template <typename octet_iterator> append(uint32_t cp,octet_iterator result)110 octet_iterator append(uint32_t cp, octet_iterator result) 111 { 112 if (!internal::is_code_point_valid(cp)) 113 throw invalid_code_point(cp); 114 115 if (cp < 0x80) // one octet 116 *(result++) = static_cast<uint8_t>(cp); 117 else if (cp < 0x800) { // two octets 118 *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0); 119 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); 120 } 121 else if (cp < 0x10000) { // three octets 122 *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0); 123 *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); 124 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); 125 } 126 else if (cp <= internal::CODE_POINT_MAX) { // four octets 127 *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0); 128 *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80); 129 *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); 130 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); 131 } 132 else 133 throw invalid_code_point(cp); 134 135 return result; 136 } 137 138 template <typename octet_iterator> next(octet_iterator & it,octet_iterator end)139 uint32_t next(octet_iterator& it, octet_iterator end) 140 { 141 uint32_t cp = 0; 142 internal::utf_error err_code = internal::validate_next(it, end, &cp); 143 switch (err_code) { 144 case internal::OK : 145 break; 146 case internal::NOT_ENOUGH_ROOM : 147 throw not_enough_room(); 148 case internal::INVALID_LEAD : 149 case internal::INCOMPLETE_SEQUENCE : 150 case internal::OVERLONG_SEQUENCE : 151 throw invalid_utf8(*it); 152 case internal::INVALID_CODE_POINT : 153 throw invalid_code_point(cp); 154 } 155 return cp; 156 } 157 158 template <typename octet_iterator> peek_next(octet_iterator it,octet_iterator end)159 uint32_t peek_next(octet_iterator it, octet_iterator end) 160 { 161 return next(it, end); 162 } 163 164 template <typename octet_iterator> prior(octet_iterator & it,octet_iterator start)165 uint32_t prior(octet_iterator& it, octet_iterator start) 166 { 167 octet_iterator end = it; 168 while (internal::is_trail(*(--it))) 169 if (it < start) 170 throw invalid_utf8(*it); // error - no lead byte in the sequence 171 octet_iterator temp = it; 172 return next(temp, end); 173 } 174 175 /// Deprecated in versions that include "prior" 176 template <typename octet_iterator> previous(octet_iterator & it,octet_iterator pass_start)177 uint32_t previous(octet_iterator& it, octet_iterator pass_start) 178 { 179 octet_iterator end = it; 180 while (internal::is_trail(*(--it))) 181 if (it == pass_start) 182 throw invalid_utf8(*it); // error - no lead byte in the sequence 183 octet_iterator temp = it; 184 return next(temp, end); 185 } 186 187 template <typename octet_iterator, typename distance_type> advance(octet_iterator & it,distance_type n,octet_iterator end)188 void advance (octet_iterator& it, distance_type n, octet_iterator end) 189 { 190 for (distance_type i = 0; i < n; ++i) 191 next(it, end); 192 } 193 194 template <typename octet_iterator> 195 typename std::iterator_traits<octet_iterator>::difference_type distance(octet_iterator first,octet_iterator last)196 distance (octet_iterator first, octet_iterator last) 197 { 198 typename std::iterator_traits<octet_iterator>::difference_type dist; 199 for (dist = 0; first < last; ++dist) 200 next(first, last); 201 return dist; 202 } 203 204 template <typename u16bit_iterator, typename octet_iterator> utf16to8(u16bit_iterator start,u16bit_iterator end,octet_iterator result)205 octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) 206 { 207 while (start != end) { 208 uint32_t cp = internal::mask16(*start++); 209 // Take care of surrogate pairs first 210 if (internal::is_surrogate(cp)) { 211 if (start != end) { 212 uint32_t trail_surrogate = internal::mask16(*start++); 213 if (trail_surrogate >= internal::TRAIL_SURROGATE_MIN && trail_surrogate <= internal::TRAIL_SURROGATE_MAX) 214 cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; 215 else 216 throw invalid_utf16(static_cast<uint16_t>(trail_surrogate)); 217 } 218 else 219 throw invalid_utf16(static_cast<uint16_t>(*start)); 220 221 } 222 result = append(cp, result); 223 } 224 return result; 225 } 226 227 template <typename u16bit_iterator, typename octet_iterator> utf8to16(octet_iterator start,octet_iterator end,u16bit_iterator result)228 u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) 229 { 230 while (start != end) { 231 uint32_t cp = next(start, end); 232 if (cp > 0xffff) { //make a surrogate pair 233 *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET); 234 *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); 235 } 236 else 237 *result++ = static_cast<uint16_t>(cp); 238 } 239 return result; 240 } 241 242 template <typename octet_iterator, typename u32bit_iterator> utf32to8(u32bit_iterator start,u32bit_iterator end,octet_iterator result)243 octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) 244 { 245 while (start != end) 246 result = append(*(start++), result); 247 248 return result; 249 } 250 251 template <typename octet_iterator, typename u32bit_iterator> utf8to32(octet_iterator start,octet_iterator end,u32bit_iterator result)252 u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) 253 { 254 while (start < end) 255 (*result++) = next(start, end); 256 257 return result; 258 } 259 260 // The iterator class 261 template <typename octet_iterator> 262 class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { 263 octet_iterator it; 264 octet_iterator range_start; 265 octet_iterator range_end; 266 public: iterator()267 iterator () {}; iterator(const octet_iterator & octet_it,const octet_iterator & range_start,const octet_iterator & range_end)268 explicit iterator (const octet_iterator& octet_it, 269 const octet_iterator& range_start, 270 const octet_iterator& range_end) : 271 it(octet_it), range_start(range_start), range_end(range_end) 272 { 273 if (it < range_start || it > range_end) 274 throw std::out_of_range("Invalid utf-8 iterator position"); 275 } 276 // the default "big three" are OK base()277 octet_iterator base () const { return it; } 278 uint32_t operator * () const 279 { 280 octet_iterator temp = it; 281 return next(temp, range_end); 282 } 283 bool operator == (const iterator& rhs) const 284 { 285 if (range_start != rhs.range_start || range_end != rhs.range_end) 286 throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); 287 return (it == rhs.it); 288 } 289 bool operator != (const iterator& rhs) const 290 { 291 return !(operator == (rhs)); 292 } 293 iterator& operator ++ () 294 { 295 next(it, range_end); 296 return *this; 297 } 298 iterator operator ++ (int) 299 { 300 iterator temp = *this; 301 next(it, range_end); 302 return temp; 303 } 304 iterator& operator -- () 305 { 306 prior(it, range_start); 307 return *this; 308 } 309 iterator operator -- (int) 310 { 311 iterator temp = *this; 312 prior(it, range_start); 313 return temp; 314 } 315 }; // class iterator 316 317 // The wchar_t iterator class 318 template <typename octet_iterator> 319 class wchar_iterator : 320 public std::iterator<std::bidirectional_iterator_tag, wchar_t> 321 { 322 octet_iterator it; 323 octet_iterator range_start; 324 octet_iterator range_end; 325 public: wchar_iterator()326 wchar_iterator () {}; wchar_iterator(const octet_iterator & octet_it,const octet_iterator & range_start,const octet_iterator & range_end)327 wchar_iterator (const octet_iterator& octet_it, 328 const octet_iterator& range_start, 329 const octet_iterator& range_end) : 330 it(octet_it), range_start(range_start), range_end(range_end) 331 { 332 if (it < range_start || it > range_end) 333 throw std::out_of_range("Invalid utf-8 iterator position"); 334 } 335 // the default "big three" are OK base()336 octet_iterator base () const { return it; } 337 wchar_t operator * () const 338 { 339 octet_iterator temp = it; 340 uint32_t retval = next(temp, range_end); 341 assert(retval <= WCHAR_MAX); 342 return retval; 343 } 344 bool operator == (const wchar_iterator& rhs) const 345 { 346 if (range_start != rhs.range_start || range_end != rhs.range_end) 347 throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); 348 return (it == rhs.it); 349 } 350 bool operator != (const wchar_iterator& rhs) const 351 { 352 return !(operator == (rhs)); 353 } 354 wchar_iterator& operator ++ () 355 { 356 next(it, range_end); 357 return *this; 358 } 359 wchar_iterator operator ++ (int) 360 { 361 wchar_iterator temp = *this; 362 next(it, range_end); 363 return temp; 364 } 365 wchar_iterator& operator -- () 366 { 367 prior(it, range_start); 368 return *this; 369 } 370 wchar_iterator operator -- (int) 371 { 372 wchar_iterator temp = *this; 373 prior(it, range_start); 374 return temp; 375 } 376 }; 377 378 } // namespace utf8 379 380 #endif //header guard 381 382 383