1 // Locale support (codecvt) -*- C++ -*- 2 3 // Copyright (C) 2015-2018 Free Software Foundation, Inc. 4 // 5 // This file is part of the GNU ISO C++ Library. This library is free 6 // software; you can redistribute it and/or modify it under the 7 // terms of the GNU General Public License as published by the 8 // Free Software Foundation; either version 3, or (at your option) 9 // any later version. 10 11 // This library is distributed in the hope that it will be useful, 12 // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 // GNU General Public License for more details. 15 16 // Under Section 7 of GPL version 3, you are granted additional 17 // permissions described in the GCC Runtime Library Exception, version 18 // 3.1, as published by the Free Software Foundation. 19 20 // You should have received a copy of the GNU General Public License and 21 // a copy of the GCC Runtime Library Exception along with this program; 22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 // <http://www.gnu.org/licenses/>. 24 25 #include <codecvt> 26 #include <cstring> // std::memcpy, std::memcmp 27 #include <bits/stl_algobase.h> // std::min 28 29 #ifdef _GLIBCXX_USE_C99_STDINT_TR1 30 namespace std _GLIBCXX_VISIBILITY(default) 31 { 32 _GLIBCXX_BEGIN_NAMESPACE_VERSION 33 34 // The standard doesn't define these operators, which is annoying. 35 static underlying_type<codecvt_mode>::type 36 to_integer(codecvt_mode m) 37 { return static_cast<underlying_type<codecvt_mode>::type>(m); } 38 39 static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n) 40 { return m = codecvt_mode(to_integer(m) & to_integer(n)); } 41 42 static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n) 43 { return m = codecvt_mode(to_integer(m) | to_integer(n)); } 44 45 static codecvt_mode operator~(codecvt_mode m) 46 { return codecvt_mode(~to_integer(m)); } 47 48 namespace 49 { 50 // Largest code point that fits in a single UTF-16 code unit. 51 const char32_t max_single_utf16_unit = 0xFFFF; 52 53 const char32_t max_code_point = 0x10FFFF; 54 55 // The functions below rely on maxcode < incomplete_mb_character 56 // (which is enforced by the codecvt_utf* classes on construction). 57 const char32_t incomplete_mb_character = char32_t(-2); 58 const char32_t invalid_mb_sequence = char32_t(-1); 59 60 // Utility type for reading and writing code units of type Elem from 61 // a range defined by a pair of pointers. 62 template<typename Elem, bool Aligned = true> 63 struct range 64 { 65 Elem* next; 66 Elem* end; 67 68 // Write a code unit. 69 range& operator=(Elem e) 70 { 71 *next++ = e; 72 return *this; 73 } 74 75 // Read the next code unit. 76 Elem operator*() const { return *next; } 77 78 // Read the Nth code unit. 79 Elem operator[](size_t n) const { return next[n]; } 80 81 // Move to the next code unit. 82 range& operator++() 83 { 84 ++next; 85 return *this; 86 } 87 88 // Move to the Nth code unit. 89 range& operator+=(size_t n) 90 { 91 next += n; 92 return *this; 93 } 94 95 // The number of code units remaining. 96 size_t size() const { return end - next; } 97 98 // The number of bytes remaining. 99 size_t nbytes() const { return (const char*)end - (const char*)next; } 100 }; 101 102 // This specialization is used when accessing char16_t values through 103 // pointers to char, which might not be correctly aligned for char16_t. 104 template<typename Elem> 105 struct range<Elem, false> 106 { 107 using value_type = typename remove_const<Elem>::type; 108 109 using char_pointer = typename 110 conditional<is_const<Elem>::value, const char*, char*>::type; 111 112 char_pointer next; 113 char_pointer end; 114 115 // Write a code unit. 116 range& operator=(Elem e) 117 { 118 memcpy(next, &e, sizeof(Elem)); 119 ++*this; 120 return *this; 121 } 122 123 // Read the next code unit. 124 Elem operator*() const 125 { 126 value_type e; 127 memcpy(&e, next, sizeof(Elem)); 128 return e; 129 } 130 131 // Read the Nth code unit. 132 Elem operator[](size_t n) const 133 { 134 value_type e; 135 memcpy(&e, next + n * sizeof(Elem), sizeof(Elem)); 136 return e; 137 } 138 139 // Move to the next code unit. 140 range& operator++() 141 { 142 next += sizeof(Elem); 143 return *this; 144 } 145 146 // Move to the Nth code unit. 147 range& operator+=(size_t n) 148 { 149 next += n * sizeof(Elem); 150 return *this; 151 } 152 153 // The number of code units remaining. 154 size_t size() const { return nbytes() / sizeof(Elem); } 155 156 // The number of bytes remaining. 157 size_t nbytes() const { return end - next; } 158 }; 159 160 // Multibyte sequences can have "header" consisting of Byte Order Mark 161 const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF }; 162 const unsigned char utf16_bom[2] = { 0xFE, 0xFF }; 163 const unsigned char utf16le_bom[2] = { 0xFF, 0xFE }; 164 165 // Write a BOM (space permitting). 166 template<typename C, bool A, size_t N> 167 bool 168 write_bom(range<C, A>& to, const unsigned char (&bom)[N]) 169 { 170 static_assert( (N / sizeof(C)) != 0, "" ); 171 static_assert( (N % sizeof(C)) == 0, "" ); 172 173 if (to.nbytes() < N) 174 return false; 175 memcpy(to.next, bom, N); 176 to += (N / sizeof(C)); 177 return true; 178 } 179 180 // Try to read a BOM. 181 template<typename C, bool A, size_t N> 182 bool 183 read_bom(range<C, A>& from, const unsigned char (&bom)[N]) 184 { 185 static_assert( (N / sizeof(C)) != 0, "" ); 186 static_assert( (N % sizeof(C)) == 0, "" ); 187 188 if (from.nbytes() >= N && !memcmp(from.next, bom, N)) 189 { 190 from += (N / sizeof(C)); 191 return true; 192 } 193 return false; 194 } 195 196 // If generate_header is set in mode write out UTF-8 BOM. 197 bool 198 write_utf8_bom(range<char>& to, codecvt_mode mode) 199 { 200 if (mode & generate_header) 201 return write_bom(to, utf8_bom); 202 return true; 203 } 204 205 // If generate_header is set in mode write out the UTF-16 BOM indicated 206 // by whether little_endian is set in mode. 207 template<bool Aligned> 208 bool 209 write_utf16_bom(range<char16_t, Aligned>& to, codecvt_mode mode) 210 { 211 if (mode & generate_header) 212 { 213 if (mode & little_endian) 214 return write_bom(to, utf16le_bom); 215 else 216 return write_bom(to, utf16_bom); 217 } 218 return true; 219 } 220 221 // If consume_header is set in mode update from.next to after any BOM. 222 void 223 read_utf8_bom(range<const char>& from, codecvt_mode mode) 224 { 225 if (mode & consume_header) 226 read_bom(from, utf8_bom); 227 } 228 229 // If consume_header is not set in mode, no effects. 230 // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then: 231 // - if the UTF-16BE BOM was found unset little_endian in mode, or 232 // - if the UTF-16LE BOM was found set little_endian in mode. 233 template<bool Aligned> 234 void 235 read_utf16_bom(range<const char16_t, Aligned>& from, codecvt_mode& mode) 236 { 237 if (mode & consume_header) 238 { 239 if (read_bom(from, utf16_bom)) 240 mode &= ~little_endian; 241 else if (read_bom(from, utf16le_bom)) 242 mode |= little_endian; 243 } 244 } 245 246 // Read a codepoint from a UTF-8 multibyte sequence. 247 // Updates from.next if the codepoint is not greater than maxcode. 248 // Returns invalid_mb_sequence, incomplete_mb_character or the code point. 249 char32_t 250 read_utf8_code_point(range<const char>& from, unsigned long maxcode) 251 { 252 const size_t avail = from.size(); 253 if (avail == 0) 254 return incomplete_mb_character; 255 unsigned char c1 = from[0]; 256 // https://en.wikipedia.org/wiki/UTF-8#Sample_code 257 if (c1 < 0x80) 258 { 259 ++from; 260 return c1; 261 } 262 else if (c1 < 0xC2) // continuation or overlong 2-byte sequence 263 return invalid_mb_sequence; 264 else if (c1 < 0xE0) // 2-byte sequence 265 { 266 if (avail < 2) 267 return incomplete_mb_character; 268 unsigned char c2 = from[1]; 269 if ((c2 & 0xC0) != 0x80) 270 return invalid_mb_sequence; 271 char32_t c = (c1 << 6) + c2 - 0x3080; 272 if (c <= maxcode) 273 from += 2; 274 return c; 275 } 276 else if (c1 < 0xF0) // 3-byte sequence 277 { 278 if (avail < 3) 279 return incomplete_mb_character; 280 unsigned char c2 = from[1]; 281 if ((c2 & 0xC0) != 0x80) 282 return invalid_mb_sequence; 283 if (c1 == 0xE0 && c2 < 0xA0) // overlong 284 return invalid_mb_sequence; 285 unsigned char c3 = from[2]; 286 if ((c3 & 0xC0) != 0x80) 287 return invalid_mb_sequence; 288 char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080; 289 if (c <= maxcode) 290 from += 3; 291 return c; 292 } 293 else if (c1 < 0xF5) // 4-byte sequence 294 { 295 if (avail < 4) 296 return incomplete_mb_character; 297 unsigned char c2 = from[1]; 298 if ((c2 & 0xC0) != 0x80) 299 return invalid_mb_sequence; 300 if (c1 == 0xF0 && c2 < 0x90) // overlong 301 return invalid_mb_sequence; 302 if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF 303 return invalid_mb_sequence; 304 unsigned char c3 = from[2]; 305 if ((c3 & 0xC0) != 0x80) 306 return invalid_mb_sequence; 307 unsigned char c4 = from[3]; 308 if ((c4 & 0xC0) != 0x80) 309 return invalid_mb_sequence; 310 char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080; 311 if (c <= maxcode) 312 from += 4; 313 return c; 314 } 315 else // > U+10FFFF 316 return invalid_mb_sequence; 317 } 318 319 bool 320 write_utf8_code_point(range<char>& to, char32_t code_point) 321 { 322 if (code_point < 0x80) 323 { 324 if (to.size() < 1) 325 return false; 326 to = code_point; 327 } 328 else if (code_point <= 0x7FF) 329 { 330 if (to.size() < 2) 331 return false; 332 to = (code_point >> 6) + 0xC0; 333 to = (code_point & 0x3F) + 0x80; 334 } 335 else if (code_point <= 0xFFFF) 336 { 337 if (to.size() < 3) 338 return false; 339 to = (code_point >> 12) + 0xE0; 340 to = ((code_point >> 6) & 0x3F) + 0x80; 341 to = (code_point & 0x3F) + 0x80; 342 } 343 else if (code_point <= 0x10FFFF) 344 { 345 if (to.size() < 4) 346 return false; 347 to = (code_point >> 18) + 0xF0; 348 to = ((code_point >> 12) & 0x3F) + 0x80; 349 to = ((code_point >> 6) & 0x3F) + 0x80; 350 to = (code_point & 0x3F) + 0x80; 351 } 352 else 353 return false; 354 return true; 355 } 356 357 inline char16_t 358 adjust_byte_order(char16_t c, codecvt_mode mode) 359 { 360 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 361 return (mode & little_endian) ? __builtin_bswap16(c) : c; 362 #else 363 return (mode & little_endian) ? c : __builtin_bswap16(c); 364 #endif 365 } 366 367 // Return true if c is a high-surrogate (aka leading) code point. 368 inline bool 369 is_high_surrogate(char32_t c) 370 { 371 return c >= 0xD800 && c <= 0xDBFF; 372 } 373 374 // Return true if c is a low-surrogate (aka trailing) code point. 375 inline bool 376 is_low_surrogate(char32_t c) 377 { 378 return c >= 0xDC00 && c <= 0xDFFF; 379 } 380 381 inline char32_t 382 surrogate_pair_to_code_point(char32_t high, char32_t low) 383 { 384 return (high << 10) + low - 0x35FDC00; 385 } 386 387 // Read a codepoint from a UTF-16 multibyte sequence. 388 // The sequence's endianness is indicated by (mode & little_endian). 389 // Updates from.next if the codepoint is not greater than maxcode. 390 // Returns invalid_mb_sequence, incomplete_mb_character or the code point. 391 template<bool Aligned> 392 char32_t 393 read_utf16_code_point(range<const char16_t, Aligned>& from, 394 unsigned long maxcode, codecvt_mode mode) 395 { 396 const size_t avail = from.size(); 397 if (avail == 0) 398 return incomplete_mb_character; 399 int inc = 1; 400 char32_t c = adjust_byte_order(from[0], mode); 401 if (is_high_surrogate(c)) 402 { 403 if (avail < 2) 404 return incomplete_mb_character; 405 const char16_t c2 = adjust_byte_order(from[1], mode); 406 if (is_low_surrogate(c2)) 407 { 408 c = surrogate_pair_to_code_point(c, c2); 409 inc = 2; 410 } 411 else 412 return invalid_mb_sequence; 413 } 414 else if (is_low_surrogate(c)) 415 return invalid_mb_sequence; 416 if (c <= maxcode) 417 from += inc; 418 return c; 419 } 420 421 template<typename C, bool A> 422 bool 423 write_utf16_code_point(range<C, A>& to, char32_t codepoint, codecvt_mode mode) 424 { 425 static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit"); 426 427 if (codepoint <= max_single_utf16_unit) 428 { 429 if (to.size() > 0) 430 { 431 to = adjust_byte_order(codepoint, mode); 432 return true; 433 } 434 } 435 else if (to.size() > 1) 436 { 437 // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4 438 const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10); 439 char16_t lead = LEAD_OFFSET + (codepoint >> 10); 440 char16_t trail = 0xDC00 + (codepoint & 0x3FF); 441 to = adjust_byte_order(lead, mode); 442 to = adjust_byte_order(trail, mode); 443 return true; 444 } 445 return false; 446 } 447 448 // utf8 -> ucs4 449 codecvt_base::result 450 ucs4_in(range<const char>& from, range<char32_t>& to, 451 unsigned long maxcode = max_code_point, codecvt_mode mode = {}) 452 { 453 read_utf8_bom(from, mode); 454 while (from.size() && to.size()) 455 { 456 const char32_t codepoint = read_utf8_code_point(from, maxcode); 457 if (codepoint == incomplete_mb_character) 458 return codecvt_base::partial; 459 if (codepoint > maxcode) 460 return codecvt_base::error; 461 to = codepoint; 462 } 463 return from.size() ? codecvt_base::partial : codecvt_base::ok; 464 } 465 466 // ucs4 -> utf8 467 codecvt_base::result 468 ucs4_out(range<const char32_t>& from, range<char>& to, 469 unsigned long maxcode = max_code_point, codecvt_mode mode = {}) 470 { 471 if (!write_utf8_bom(to, mode)) 472 return codecvt_base::partial; 473 while (from.size()) 474 { 475 const char32_t c = from[0]; 476 if (c > maxcode) 477 return codecvt_base::error; 478 if (!write_utf8_code_point(to, c)) 479 return codecvt_base::partial; 480 ++from; 481 } 482 return codecvt_base::ok; 483 } 484 485 // utf16 -> ucs4 486 codecvt_base::result 487 ucs4_in(range<const char16_t, false>& from, range<char32_t>& to, 488 unsigned long maxcode = max_code_point, codecvt_mode mode = {}) 489 { 490 read_utf16_bom(from, mode); 491 while (from.size() && to.size()) 492 { 493 const char32_t codepoint = read_utf16_code_point(from, maxcode, mode); 494 if (codepoint == incomplete_mb_character) 495 return codecvt_base::partial; 496 if (codepoint > maxcode) 497 return codecvt_base::error; 498 to = codepoint; 499 } 500 return from.size() ? codecvt_base::partial : codecvt_base::ok; 501 } 502 503 // ucs4 -> utf16 504 codecvt_base::result 505 ucs4_out(range<const char32_t>& from, range<char16_t, false>& to, 506 unsigned long maxcode = max_code_point, codecvt_mode mode = {}) 507 { 508 if (!write_utf16_bom(to, mode)) 509 return codecvt_base::partial; 510 while (from.size()) 511 { 512 const char32_t c = from[0]; 513 if (c > maxcode) 514 return codecvt_base::error; 515 if (!write_utf16_code_point(to, c, mode)) 516 return codecvt_base::partial; 517 ++from; 518 } 519 return codecvt_base::ok; 520 } 521 522 // Flag indicating whether to process UTF-16 or UCS2 523 enum class surrogates { allowed, disallowed }; 524 525 // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed) 526 template<typename C> 527 codecvt_base::result 528 utf16_in(range<const char>& from, range<C>& to, 529 unsigned long maxcode = max_code_point, codecvt_mode mode = {}, 530 surrogates s = surrogates::allowed) 531 { 532 read_utf8_bom(from, mode); 533 while (from.size() && to.size()) 534 { 535 auto orig = from; 536 const char32_t codepoint = read_utf8_code_point(from, maxcode); 537 if (codepoint == incomplete_mb_character) 538 { 539 if (s == surrogates::allowed) 540 return codecvt_base::partial; 541 else 542 return codecvt_base::error; // No surrogates in UCS2 543 } 544 if (codepoint > maxcode) 545 return codecvt_base::error; 546 if (!write_utf16_code_point(to, codepoint, mode)) 547 { 548 from = orig; // rewind to previous position 549 return codecvt_base::partial; 550 } 551 } 552 return codecvt_base::ok; 553 } 554 555 // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed) 556 template<typename C> 557 codecvt_base::result 558 utf16_out(range<const C>& from, range<char>& to, 559 unsigned long maxcode = max_code_point, codecvt_mode mode = {}, 560 surrogates s = surrogates::allowed) 561 { 562 if (!write_utf8_bom(to, mode)) 563 return codecvt_base::partial; 564 while (from.size()) 565 { 566 char32_t c = from[0]; 567 int inc = 1; 568 if (is_high_surrogate(c)) 569 { 570 if (s == surrogates::disallowed) 571 return codecvt_base::error; // No surrogates in UCS-2 572 573 if (from.size() < 2) 574 return codecvt_base::ok; // stop converting at this point 575 576 const char32_t c2 = from[1]; 577 if (is_low_surrogate(c2)) 578 { 579 c = surrogate_pair_to_code_point(c, c2); 580 inc = 2; 581 } 582 else 583 return codecvt_base::error; 584 } 585 else if (is_low_surrogate(c)) 586 return codecvt_base::error; 587 if (c > maxcode) 588 return codecvt_base::error; 589 if (!write_utf8_code_point(to, c)) 590 return codecvt_base::partial; 591 from += inc; 592 } 593 return codecvt_base::ok; 594 } 595 596 // return pos such that [begin,pos) is valid UTF-16 string no longer than max 597 const char* 598 utf16_span(const char* begin, const char* end, size_t max, 599 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 600 { 601 range<const char> from{ begin, end }; 602 read_utf8_bom(from, mode); 603 size_t count = 0; 604 while (count+1 < max) 605 { 606 char32_t c = read_utf8_code_point(from, maxcode); 607 if (c > maxcode) 608 return from.next; 609 else if (c > max_single_utf16_unit) 610 ++count; 611 ++count; 612 } 613 if (count+1 == max) // take one more character if it fits in a single unit 614 read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode)); 615 return from.next; 616 } 617 618 // utf8 -> ucs2 619 codecvt_base::result 620 ucs2_in(range<const char>& from, range<char16_t>& to, 621 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 622 { 623 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: 624 maxcode = std::min(max_single_utf16_unit, maxcode); 625 return utf16_in(from, to, maxcode, mode, surrogates::disallowed); 626 } 627 628 // ucs2 -> utf8 629 codecvt_base::result 630 ucs2_out(range<const char16_t>& from, range<char>& to, 631 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 632 { 633 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: 634 maxcode = std::min(max_single_utf16_unit, maxcode); 635 return utf16_out(from, to, maxcode, mode, surrogates::disallowed); 636 } 637 638 // ucs2 -> utf16 639 codecvt_base::result 640 ucs2_out(range<const char16_t>& from, range<char16_t, false>& to, 641 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 642 { 643 if (!write_utf16_bom(to, mode)) 644 return codecvt_base::partial; 645 while (from.size() && to.size()) 646 { 647 char16_t c = from[0]; 648 if (is_high_surrogate(c)) 649 return codecvt_base::error; 650 if (c > maxcode) 651 return codecvt_base::error; 652 to = adjust_byte_order(c, mode); 653 ++from; 654 } 655 return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial; 656 } 657 658 // utf16 -> ucs2 659 codecvt_base::result 660 ucs2_in(range<const char16_t, false>& from, range<char16_t>& to, 661 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 662 { 663 read_utf16_bom(from, mode); 664 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: 665 maxcode = std::min(max_single_utf16_unit, maxcode); 666 while (from.size() && to.size()) 667 { 668 const char32_t c = read_utf16_code_point(from, maxcode, mode); 669 if (c == incomplete_mb_character) 670 return codecvt_base::error; // UCS-2 only supports single units. 671 if (c > maxcode) 672 return codecvt_base::error; 673 to = c; 674 } 675 return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial; 676 } 677 678 const char16_t* 679 ucs2_span(range<const char16_t, false>& from, size_t max, 680 char32_t maxcode, codecvt_mode mode) 681 { 682 read_utf16_bom(from, mode); 683 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: 684 maxcode = std::min(max_single_utf16_unit, maxcode); 685 char32_t c = 0; 686 while (max-- && c <= maxcode) 687 c = read_utf16_code_point(from, maxcode, mode); 688 return reinterpret_cast<const char16_t*>(from.next); 689 } 690 691 const char* 692 ucs2_span(const char* begin, const char* end, size_t max, 693 char32_t maxcode, codecvt_mode mode) 694 { 695 range<const char> from{ begin, end }; 696 read_utf8_bom(from, mode); 697 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: 698 maxcode = std::min(max_single_utf16_unit, maxcode); 699 char32_t c = 0; 700 while (max-- && c <= maxcode) 701 c = read_utf8_code_point(from, maxcode); 702 return from.next; 703 } 704 705 // return pos such that [begin,pos) is valid UCS-4 string no longer than max 706 const char* 707 ucs4_span(const char* begin, const char* end, size_t max, 708 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 709 { 710 range<const char> from{ begin, end }; 711 read_utf8_bom(from, mode); 712 char32_t c = 0; 713 while (max-- && c <= maxcode) 714 c = read_utf8_code_point(from, maxcode); 715 return from.next; 716 } 717 718 // return pos such that [begin,pos) is valid UCS-4 string no longer than max 719 const char16_t* 720 ucs4_span(range<const char16_t, false>& from, size_t max, 721 char32_t maxcode = max_code_point, codecvt_mode mode = {}) 722 { 723 read_utf16_bom(from, mode); 724 char32_t c = 0; 725 while (max-- && c <= maxcode) 726 c = read_utf16_code_point(from, maxcode, mode); 727 return reinterpret_cast<const char16_t*>(from.next); 728 } 729 } 730 731 // Define members of codecvt<char16_t, char, mbstate_t> specialization. 732 // Converts from UTF-8 to UTF-16. 733 734 locale::id codecvt<char16_t, char, mbstate_t>::id; 735 736 codecvt<char16_t, char, mbstate_t>::~codecvt() { } 737 738 codecvt_base::result 739 codecvt<char16_t, char, mbstate_t>:: 740 do_out(state_type&, 741 const intern_type* __from, 742 const intern_type* __from_end, const intern_type*& __from_next, 743 extern_type* __to, extern_type* __to_end, 744 extern_type*& __to_next) const 745 { 746 range<const char16_t> from{ __from, __from_end }; 747 range<char> to{ __to, __to_end }; 748 auto res = utf16_out(from, to); 749 __from_next = from.next; 750 __to_next = to.next; 751 return res; 752 } 753 754 codecvt_base::result 755 codecvt<char16_t, char, mbstate_t>:: 756 do_unshift(state_type&, extern_type* __to, extern_type*, 757 extern_type*& __to_next) const 758 { 759 __to_next = __to; 760 return noconv; // we don't use mbstate_t for the unicode facets 761 } 762 763 codecvt_base::result 764 codecvt<char16_t, char, mbstate_t>:: 765 do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 766 const extern_type*& __from_next, 767 intern_type* __to, intern_type* __to_end, 768 intern_type*& __to_next) const 769 { 770 range<const char> from{ __from, __from_end }; 771 range<char16_t> to{ __to, __to_end }; 772 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 773 codecvt_mode mode = {}; 774 #else 775 codecvt_mode mode = little_endian; 776 #endif 777 auto res = utf16_in(from, to, max_code_point, mode); 778 __from_next = from.next; 779 __to_next = to.next; 780 return res; 781 } 782 783 int 784 codecvt<char16_t, char, mbstate_t>::do_encoding() const throw() 785 { return 0; } // UTF-8 is not a fixed-width encoding 786 787 bool 788 codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw() 789 { return false; } 790 791 int 792 codecvt<char16_t, char, mbstate_t>:: 793 do_length(state_type&, const extern_type* __from, 794 const extern_type* __end, size_t __max) const 795 { 796 __end = utf16_span(__from, __end, __max); 797 return __end - __from; 798 } 799 800 int 801 codecvt<char16_t, char, mbstate_t>::do_max_length() const throw() 802 { 803 // A single character (one or two UTF-16 code units) requires 804 // up to four UTF-8 code units. 805 return 4; 806 } 807 808 // Define members of codecvt<char32_t, char, mbstate_t> specialization. 809 // Converts from UTF-8 to UTF-32 (aka UCS-4). 810 811 locale::id codecvt<char32_t, char, mbstate_t>::id; 812 813 codecvt<char32_t, char, mbstate_t>::~codecvt() { } 814 815 codecvt_base::result 816 codecvt<char32_t, char, mbstate_t>:: 817 do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 818 const intern_type*& __from_next, 819 extern_type* __to, extern_type* __to_end, 820 extern_type*& __to_next) const 821 { 822 range<const char32_t> from{ __from, __from_end }; 823 range<char> to{ __to, __to_end }; 824 auto res = ucs4_out(from, to); 825 __from_next = from.next; 826 __to_next = to.next; 827 return res; 828 } 829 830 codecvt_base::result 831 codecvt<char32_t, char, mbstate_t>:: 832 do_unshift(state_type&, extern_type* __to, extern_type*, 833 extern_type*& __to_next) const 834 { 835 __to_next = __to; 836 return noconv; 837 } 838 839 codecvt_base::result 840 codecvt<char32_t, char, mbstate_t>:: 841 do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 842 const extern_type*& __from_next, 843 intern_type* __to, intern_type* __to_end, 844 intern_type*& __to_next) const 845 { 846 range<const char> from{ __from, __from_end }; 847 range<char32_t> to{ __to, __to_end }; 848 auto res = ucs4_in(from, to); 849 __from_next = from.next; 850 __to_next = to.next; 851 return res; 852 } 853 854 int 855 codecvt<char32_t, char, mbstate_t>::do_encoding() const throw() 856 { return 0; } // UTF-8 is not a fixed-width encoding 857 858 bool 859 codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw() 860 { return false; } 861 862 int 863 codecvt<char32_t, char, mbstate_t>:: 864 do_length(state_type&, const extern_type* __from, 865 const extern_type* __end, size_t __max) const 866 { 867 __end = ucs4_span(__from, __end, __max); 868 return __end - __from; 869 } 870 871 int 872 codecvt<char32_t, char, mbstate_t>::do_max_length() const throw() 873 { 874 // A single character (one UTF-32 code unit) requires 875 // up to 4 UTF-8 code units. 876 return 4; 877 } 878 879 // Define members of codecvt_utf8<char16_t> base class implementation. 880 // Converts from UTF-8 to UCS-2. 881 882 __codecvt_utf8_base<char16_t>::~__codecvt_utf8_base() { } 883 884 codecvt_base::result 885 __codecvt_utf8_base<char16_t>:: 886 do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 887 const intern_type*& __from_next, 888 extern_type* __to, extern_type* __to_end, 889 extern_type*& __to_next) const 890 { 891 range<const char16_t> from{ __from, __from_end }; 892 range<char> to{ __to, __to_end }; 893 auto res = ucs2_out(from, to, _M_maxcode, _M_mode); 894 __from_next = from.next; 895 __to_next = to.next; 896 return res; 897 } 898 899 codecvt_base::result 900 __codecvt_utf8_base<char16_t>:: 901 do_unshift(state_type&, extern_type* __to, extern_type*, 902 extern_type*& __to_next) const 903 { 904 __to_next = __to; 905 return noconv; 906 } 907 908 codecvt_base::result 909 __codecvt_utf8_base<char16_t>:: 910 do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 911 const extern_type*& __from_next, 912 intern_type* __to, intern_type* __to_end, 913 intern_type*& __to_next) const 914 { 915 range<const char> from{ __from, __from_end }; 916 range<char16_t> to{ __to, __to_end }; 917 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); 918 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ 919 mode = codecvt_mode(mode | little_endian); 920 #endif 921 auto res = ucs2_in(from, to, _M_maxcode, mode); 922 __from_next = from.next; 923 __to_next = to.next; 924 return res; 925 } 926 927 int 928 __codecvt_utf8_base<char16_t>::do_encoding() const throw() 929 { return 0; } // UTF-8 is not a fixed-width encoding 930 931 bool 932 __codecvt_utf8_base<char16_t>::do_always_noconv() const throw() 933 { return false; } 934 935 int 936 __codecvt_utf8_base<char16_t>:: 937 do_length(state_type&, const extern_type* __from, 938 const extern_type* __end, size_t __max) const 939 { 940 __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode); 941 return __end - __from; 942 } 943 944 int 945 __codecvt_utf8_base<char16_t>::do_max_length() const throw() 946 { 947 // A single UCS-2 character requires up to three UTF-8 code units. 948 // (UCS-2 cannot represent characters that use four UTF-8 code units). 949 int max = 3; 950 if (_M_mode & consume_header) 951 max += sizeof(utf8_bom); 952 return max; 953 } 954 955 // Define members of codecvt_utf8<char32_t> base class implementation. 956 // Converts from UTF-8 to UTF-32 (aka UCS-4). 957 958 __codecvt_utf8_base<char32_t>::~__codecvt_utf8_base() { } 959 960 codecvt_base::result 961 __codecvt_utf8_base<char32_t>:: 962 do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 963 const intern_type*& __from_next, 964 extern_type* __to, extern_type* __to_end, 965 extern_type*& __to_next) const 966 { 967 range<const char32_t> from{ __from, __from_end }; 968 range<char> to{ __to, __to_end }; 969 auto res = ucs4_out(from, to, _M_maxcode, _M_mode); 970 __from_next = from.next; 971 __to_next = to.next; 972 return res; 973 } 974 975 codecvt_base::result 976 __codecvt_utf8_base<char32_t>:: 977 do_unshift(state_type&, extern_type* __to, extern_type*, 978 extern_type*& __to_next) const 979 { 980 __to_next = __to; 981 return noconv; 982 } 983 984 codecvt_base::result 985 __codecvt_utf8_base<char32_t>:: 986 do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 987 const extern_type*& __from_next, 988 intern_type* __to, intern_type* __to_end, 989 intern_type*& __to_next) const 990 { 991 range<const char> from{ __from, __from_end }; 992 range<char32_t> to{ __to, __to_end }; 993 auto res = ucs4_in(from, to, _M_maxcode, _M_mode); 994 __from_next = from.next; 995 __to_next = to.next; 996 return res; 997 } 998 999 int 1000 __codecvt_utf8_base<char32_t>::do_encoding() const throw() 1001 { return 0; } // UTF-8 is not a fixed-width encoding 1002 1003 bool 1004 __codecvt_utf8_base<char32_t>::do_always_noconv() const throw() 1005 { return false; } 1006 1007 int 1008 __codecvt_utf8_base<char32_t>:: 1009 do_length(state_type&, const extern_type* __from, 1010 const extern_type* __end, size_t __max) const 1011 { 1012 __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode); 1013 return __end - __from; 1014 } 1015 1016 int 1017 __codecvt_utf8_base<char32_t>::do_max_length() const throw() 1018 { 1019 // A single UCS-4 character requires up to four UTF-8 code units. 1020 int max = 4; 1021 if (_M_mode & consume_header) 1022 max += sizeof(utf8_bom); 1023 return max; 1024 } 1025 1026 #ifdef _GLIBCXX_USE_WCHAR_T 1027 1028 #if __SIZEOF_WCHAR_T__ == 2 1029 static_assert(sizeof(wchar_t) == sizeof(char16_t), ""); 1030 #elif __SIZEOF_WCHAR_T__ == 4 1031 static_assert(sizeof(wchar_t) == sizeof(char32_t), ""); 1032 #endif 1033 1034 // Define members of codecvt_utf8<wchar_t> base class implementation. 1035 // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t). 1036 1037 __codecvt_utf8_base<wchar_t>::~__codecvt_utf8_base() { } 1038 1039 codecvt_base::result 1040 __codecvt_utf8_base<wchar_t>:: 1041 do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1042 const intern_type*& __from_next, 1043 extern_type* __to, extern_type* __to_end, 1044 extern_type*& __to_next) const 1045 { 1046 range<char> to{ __to, __to_end }; 1047 #if __SIZEOF_WCHAR_T__ == 2 1048 range<const char16_t> from{ 1049 reinterpret_cast<const char16_t*>(__from), 1050 reinterpret_cast<const char16_t*>(__from_end) 1051 }; 1052 auto res = ucs2_out(from, to, _M_maxcode, _M_mode); 1053 #elif __SIZEOF_WCHAR_T__ == 4 1054 range<const char32_t> from{ 1055 reinterpret_cast<const char32_t*>(__from), 1056 reinterpret_cast<const char32_t*>(__from_end) 1057 }; 1058 auto res = ucs4_out(from, to, _M_maxcode, _M_mode); 1059 #else 1060 return codecvt_base::error; 1061 #endif 1062 __from_next = reinterpret_cast<const wchar_t*>(from.next); 1063 __to_next = to.next; 1064 return res; 1065 } 1066 1067 codecvt_base::result 1068 __codecvt_utf8_base<wchar_t>:: 1069 do_unshift(state_type&, extern_type* __to, extern_type*, 1070 extern_type*& __to_next) const 1071 { 1072 __to_next = __to; 1073 return noconv; 1074 } 1075 1076 codecvt_base::result 1077 __codecvt_utf8_base<wchar_t>:: 1078 do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1079 const extern_type*& __from_next, 1080 intern_type* __to, intern_type* __to_end, 1081 intern_type*& __to_next) const 1082 { 1083 range<const char> from{ __from, __from_end }; 1084 #if __SIZEOF_WCHAR_T__ == 2 1085 range<char16_t> to{ 1086 reinterpret_cast<char16_t*>(__to), 1087 reinterpret_cast<char16_t*>(__to_end) 1088 }; 1089 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 1090 codecvt_mode mode = {}; 1091 #else 1092 codecvt_mode mode = little_endian; 1093 #endif 1094 auto res = ucs2_in(from, to, _M_maxcode, mode); 1095 #elif __SIZEOF_WCHAR_T__ == 4 1096 range<char32_t> to{ 1097 reinterpret_cast<char32_t*>(__to), 1098 reinterpret_cast<char32_t*>(__to_end) 1099 }; 1100 auto res = ucs4_in(from, to, _M_maxcode, _M_mode); 1101 #else 1102 return codecvt_base::error; 1103 #endif 1104 __from_next = from.next; 1105 __to_next = reinterpret_cast<wchar_t*>(to.next); 1106 return res; 1107 } 1108 1109 int 1110 __codecvt_utf8_base<wchar_t>::do_encoding() const throw() 1111 { return 0; } // UTF-8 is not a fixed-width encoding 1112 1113 bool 1114 __codecvt_utf8_base<wchar_t>::do_always_noconv() const throw() 1115 { return false; } 1116 1117 int 1118 __codecvt_utf8_base<wchar_t>:: 1119 do_length(state_type&, const extern_type* __from, 1120 const extern_type* __end, size_t __max) const 1121 { 1122 #if __SIZEOF_WCHAR_T__ == 2 1123 __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode); 1124 #elif __SIZEOF_WCHAR_T__ == 4 1125 __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode); 1126 #else 1127 __end = __from; 1128 #endif 1129 return __end - __from; 1130 } 1131 1132 int 1133 __codecvt_utf8_base<wchar_t>::do_max_length() const throw() 1134 { 1135 #if __SIZEOF_WCHAR_T__ == 2 1136 int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length() 1137 #else 1138 int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length() 1139 #endif 1140 if (_M_mode & consume_header) 1141 max += sizeof(utf8_bom); 1142 return max; 1143 } 1144 #endif 1145 1146 // Define members of codecvt_utf16<char16_t> base class implementation. 1147 // Converts from UTF-16 to UCS-2. 1148 1149 __codecvt_utf16_base<char16_t>::~__codecvt_utf16_base() { } 1150 1151 codecvt_base::result 1152 __codecvt_utf16_base<char16_t>:: 1153 do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1154 const intern_type*& __from_next, 1155 extern_type* __to, extern_type* __to_end, 1156 extern_type*& __to_next) const 1157 { 1158 range<const char16_t> from{ __from, __from_end }; 1159 range<char16_t, false> to{ __to, __to_end }; 1160 auto res = ucs2_out(from, to, _M_maxcode, _M_mode); 1161 __from_next = from.next; 1162 __to_next = reinterpret_cast<char*>(to.next); 1163 return res; 1164 } 1165 1166 codecvt_base::result 1167 __codecvt_utf16_base<char16_t>:: 1168 do_unshift(state_type&, extern_type* __to, extern_type*, 1169 extern_type*& __to_next) const 1170 { 1171 __to_next = __to; 1172 return noconv; 1173 } 1174 1175 codecvt_base::result 1176 __codecvt_utf16_base<char16_t>:: 1177 do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1178 const extern_type*& __from_next, 1179 intern_type* __to, intern_type* __to_end, 1180 intern_type*& __to_next) const 1181 { 1182 range<const char16_t, false> from{ __from, __from_end }; 1183 range<char16_t> to{ __to, __to_end }; 1184 auto res = ucs2_in(from, to, _M_maxcode, _M_mode); 1185 __from_next = reinterpret_cast<const char*>(from.next); 1186 __to_next = to.next; 1187 if (res == codecvt_base::ok && __from_next != __from_end) 1188 res = codecvt_base::error; 1189 return res; 1190 } 1191 1192 int 1193 __codecvt_utf16_base<char16_t>::do_encoding() const throw() 1194 { return 0; } // UTF-16 is not a fixed-width encoding 1195 1196 bool 1197 __codecvt_utf16_base<char16_t>::do_always_noconv() const throw() 1198 { return false; } 1199 1200 int 1201 __codecvt_utf16_base<char16_t>:: 1202 do_length(state_type&, const extern_type* __from, 1203 const extern_type* __end, size_t __max) const 1204 { 1205 range<const char16_t, false> from{ __from, __end }; 1206 const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode); 1207 return reinterpret_cast<const char*>(next) - __from; 1208 } 1209 1210 int 1211 __codecvt_utf16_base<char16_t>::do_max_length() const throw() 1212 { 1213 // A single UCS-2 character requires one UTF-16 code unit (so two chars). 1214 // (UCS-2 cannot represent characters that use multiple UTF-16 code units). 1215 int max = 2; 1216 if (_M_mode & consume_header) 1217 max += sizeof(utf16_bom); 1218 return max; 1219 } 1220 1221 // Define members of codecvt_utf16<char32_t> base class implementation. 1222 // Converts from UTF-16 to UTF-32 (aka UCS-4). 1223 1224 __codecvt_utf16_base<char32_t>::~__codecvt_utf16_base() { } 1225 1226 codecvt_base::result 1227 __codecvt_utf16_base<char32_t>:: 1228 do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1229 const intern_type*& __from_next, 1230 extern_type* __to, extern_type* __to_end, 1231 extern_type*& __to_next) const 1232 { 1233 range<const char32_t> from{ __from, __from_end }; 1234 range<char16_t, false> to{ __to, __to_end }; 1235 auto res = ucs4_out(from, to, _M_maxcode, _M_mode); 1236 __from_next = from.next; 1237 __to_next = reinterpret_cast<char*>(to.next); 1238 return res; 1239 } 1240 1241 codecvt_base::result 1242 __codecvt_utf16_base<char32_t>:: 1243 do_unshift(state_type&, extern_type* __to, extern_type*, 1244 extern_type*& __to_next) const 1245 { 1246 __to_next = __to; 1247 return noconv; 1248 } 1249 1250 codecvt_base::result 1251 __codecvt_utf16_base<char32_t>:: 1252 do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1253 const extern_type*& __from_next, 1254 intern_type* __to, intern_type* __to_end, 1255 intern_type*& __to_next) const 1256 { 1257 range<const char16_t, false> from{ __from, __from_end }; 1258 range<char32_t> to{ __to, __to_end }; 1259 auto res = ucs4_in(from, to, _M_maxcode, _M_mode); 1260 __from_next = reinterpret_cast<const char*>(from.next); 1261 __to_next = to.next; 1262 if (res == codecvt_base::ok && __from_next != __from_end) 1263 res = codecvt_base::error; 1264 return res; 1265 } 1266 1267 int 1268 __codecvt_utf16_base<char32_t>::do_encoding() const throw() 1269 { return 0; } // UTF-16 is not a fixed-width encoding 1270 1271 bool 1272 __codecvt_utf16_base<char32_t>::do_always_noconv() const throw() 1273 { return false; } 1274 1275 int 1276 __codecvt_utf16_base<char32_t>:: 1277 do_length(state_type&, const extern_type* __from, 1278 const extern_type* __end, size_t __max) const 1279 { 1280 range<const char16_t, false> from{ __from, __end }; 1281 const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode); 1282 return reinterpret_cast<const char*>(next) - __from; 1283 } 1284 1285 int 1286 __codecvt_utf16_base<char32_t>::do_max_length() const throw() 1287 { 1288 // A single UCS-4 character requires one or two UTF-16 code units 1289 // (so up to four chars). 1290 int max = 4; 1291 if (_M_mode & consume_header) 1292 max += sizeof(utf16_bom); 1293 return max; 1294 } 1295 1296 #ifdef _GLIBCXX_USE_WCHAR_T 1297 // Define members of codecvt_utf16<wchar_t> base class implementation. 1298 // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t). 1299 1300 __codecvt_utf16_base<wchar_t>::~__codecvt_utf16_base() { } 1301 1302 codecvt_base::result 1303 __codecvt_utf16_base<wchar_t>:: 1304 do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1305 const intern_type*& __from_next, 1306 extern_type* __to, extern_type* __to_end, 1307 extern_type*& __to_next) const 1308 { 1309 range<char16_t, false> to{ __to, __to_end }; 1310 #if __SIZEOF_WCHAR_T__ == 2 1311 range<const char16_t> from{ 1312 reinterpret_cast<const char16_t*>(__from), 1313 reinterpret_cast<const char16_t*>(__from_end), 1314 }; 1315 auto res = ucs2_out(from, to, _M_maxcode, _M_mode); 1316 #elif __SIZEOF_WCHAR_T__ == 4 1317 range<const char32_t> from{ 1318 reinterpret_cast<const char32_t*>(__from), 1319 reinterpret_cast<const char32_t*>(__from_end), 1320 }; 1321 auto res = ucs4_out(from, to, _M_maxcode, _M_mode); 1322 #else 1323 return codecvt_base::error; 1324 #endif 1325 __from_next = reinterpret_cast<const wchar_t*>(from.next); 1326 __to_next = reinterpret_cast<char*>(to.next); 1327 return res; 1328 } 1329 1330 codecvt_base::result 1331 __codecvt_utf16_base<wchar_t>:: 1332 do_unshift(state_type&, extern_type* __to, extern_type*, 1333 extern_type*& __to_next) const 1334 { 1335 __to_next = __to; 1336 return noconv; 1337 } 1338 1339 codecvt_base::result 1340 __codecvt_utf16_base<wchar_t>:: 1341 do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1342 const extern_type*& __from_next, 1343 intern_type* __to, intern_type* __to_end, 1344 intern_type*& __to_next) const 1345 { 1346 range<const char16_t, false> from{ __from, __from_end }; 1347 #if __SIZEOF_WCHAR_T__ == 2 1348 range<char16_t> to{ 1349 reinterpret_cast<char16_t*>(__to), 1350 reinterpret_cast<char16_t*>(__to_end), 1351 }; 1352 auto res = ucs2_in(from, to, _M_maxcode, _M_mode); 1353 #elif __SIZEOF_WCHAR_T__ == 4 1354 range<char32_t> to{ 1355 reinterpret_cast<char32_t*>(__to), 1356 reinterpret_cast<char32_t*>(__to_end), 1357 }; 1358 auto res = ucs4_in(from, to, _M_maxcode, _M_mode); 1359 #else 1360 return codecvt_base::error; 1361 #endif 1362 __from_next = reinterpret_cast<const char*>(from.next); 1363 __to_next = reinterpret_cast<wchar_t*>(to.next); 1364 if (res == codecvt_base::ok && __from_next != __from_end) 1365 res = codecvt_base::error; 1366 return res; 1367 } 1368 1369 int 1370 __codecvt_utf16_base<wchar_t>::do_encoding() const throw() 1371 { return 0; } // UTF-16 is not a fixed-width encoding 1372 1373 bool 1374 __codecvt_utf16_base<wchar_t>::do_always_noconv() const throw() 1375 { return false; } 1376 1377 int 1378 __codecvt_utf16_base<wchar_t>:: 1379 do_length(state_type&, const extern_type* __from, 1380 const extern_type* __end, size_t __max) const 1381 { 1382 range<const char16_t, false> from{ __from, __end }; 1383 #if __SIZEOF_WCHAR_T__ == 2 1384 const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode); 1385 #elif __SIZEOF_WCHAR_T__ == 4 1386 const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode); 1387 #endif 1388 return reinterpret_cast<const char*>(next) - __from; 1389 } 1390 1391 int 1392 __codecvt_utf16_base<wchar_t>::do_max_length() const throw() 1393 { 1394 #if __SIZEOF_WCHAR_T__ == 2 1395 int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length() 1396 #else 1397 int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length() 1398 #endif 1399 if (_M_mode & consume_header) 1400 max += sizeof(utf16_bom); 1401 return max; 1402 } 1403 #endif 1404 1405 // Define members of codecvt_utf8_utf16<char16_t> base class implementation. 1406 // Converts from UTF-8 to UTF-16. 1407 1408 __codecvt_utf8_utf16_base<char16_t>::~__codecvt_utf8_utf16_base() { } 1409 1410 codecvt_base::result 1411 __codecvt_utf8_utf16_base<char16_t>:: 1412 do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1413 const intern_type*& __from_next, 1414 extern_type* __to, extern_type* __to_end, 1415 extern_type*& __to_next) const 1416 { 1417 range<const char16_t> from{ __from, __from_end }; 1418 range<char> to{ __to, __to_end }; 1419 auto res = utf16_out(from, to, _M_maxcode, _M_mode); 1420 __from_next = from.next; 1421 __to_next = to.next; 1422 return res; 1423 } 1424 1425 codecvt_base::result 1426 __codecvt_utf8_utf16_base<char16_t>:: 1427 do_unshift(state_type&, extern_type* __to, extern_type*, 1428 extern_type*& __to_next) const 1429 { 1430 __to_next = __to; 1431 return noconv; 1432 } 1433 1434 codecvt_base::result 1435 __codecvt_utf8_utf16_base<char16_t>:: 1436 do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1437 const extern_type*& __from_next, 1438 intern_type* __to, intern_type* __to_end, 1439 intern_type*& __to_next) const 1440 { 1441 range<const char> from{ __from, __from_end }; 1442 range<char16_t> to{ __to, __to_end }; 1443 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); 1444 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ 1445 mode = codecvt_mode(mode | little_endian); 1446 #endif 1447 auto res = utf16_in(from, to, _M_maxcode, mode); 1448 __from_next = from.next; 1449 __to_next = to.next; 1450 return res; 1451 } 1452 1453 int 1454 __codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw() 1455 { return 0; } // UTF-8 is not a fixed-width encoding 1456 1457 bool 1458 __codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw() 1459 { return false; } 1460 1461 int 1462 __codecvt_utf8_utf16_base<char16_t>:: 1463 do_length(state_type&, const extern_type* __from, 1464 const extern_type* __end, size_t __max) const 1465 { 1466 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode); 1467 return __end - __from; 1468 } 1469 1470 int 1471 __codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw() 1472 { 1473 // A single character can be 1 or 2 UTF-16 code units, 1474 // requiring up to 4 UTF-8 code units. 1475 int max = 4; 1476 if (_M_mode & consume_header) 1477 max += sizeof(utf8_bom); 1478 return max; 1479 } 1480 1481 // Define members of codecvt_utf8_utf16<char32_t> base class implementation. 1482 // Converts from UTF-8 to UTF-16. 1483 1484 __codecvt_utf8_utf16_base<char32_t>::~__codecvt_utf8_utf16_base() { } 1485 1486 codecvt_base::result 1487 __codecvt_utf8_utf16_base<char32_t>:: 1488 do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1489 const intern_type*& __from_next, 1490 extern_type* __to, extern_type* __to_end, 1491 extern_type*& __to_next) const 1492 { 1493 range<const char32_t> from{ __from, __from_end }; 1494 range<char> to{ __to, __to_end }; 1495 auto res = utf16_out(from, to, _M_maxcode, _M_mode); 1496 __from_next = from.next; 1497 __to_next = to.next; 1498 return res; 1499 } 1500 1501 codecvt_base::result 1502 __codecvt_utf8_utf16_base<char32_t>:: 1503 do_unshift(state_type&, extern_type* __to, extern_type*, 1504 extern_type*& __to_next) const 1505 { 1506 __to_next = __to; 1507 return noconv; 1508 } 1509 1510 codecvt_base::result 1511 __codecvt_utf8_utf16_base<char32_t>:: 1512 do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1513 const extern_type*& __from_next, 1514 intern_type* __to, intern_type* __to_end, 1515 intern_type*& __to_next) const 1516 { 1517 range<const char> from{ __from, __from_end }; 1518 range<char32_t> to{ __to, __to_end }; 1519 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); 1520 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ 1521 mode = codecvt_mode(mode | little_endian); 1522 #endif 1523 auto res = utf16_in(from, to, _M_maxcode, mode); 1524 __from_next = from.next; 1525 __to_next = to.next; 1526 return res; 1527 } 1528 1529 int 1530 __codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw() 1531 { return 0; } // UTF-8 is not a fixed-width encoding 1532 1533 bool 1534 __codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw() 1535 { return false; } 1536 1537 int 1538 __codecvt_utf8_utf16_base<char32_t>:: 1539 do_length(state_type&, const extern_type* __from, 1540 const extern_type* __end, size_t __max) const 1541 { 1542 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode); 1543 return __end - __from; 1544 } 1545 1546 int 1547 __codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw() 1548 { 1549 // A single character can be 1 or 2 UTF-16 code units, 1550 // requiring up to 4 UTF-8 code units. 1551 int max = 4; 1552 if (_M_mode & consume_header) 1553 max += sizeof(utf8_bom); 1554 return max; 1555 } 1556 1557 #ifdef _GLIBCXX_USE_WCHAR_T 1558 // Define members of codecvt_utf8_utf16<wchar_t> base class implementation. 1559 // Converts from UTF-8 to UTF-16. 1560 1561 __codecvt_utf8_utf16_base<wchar_t>::~__codecvt_utf8_utf16_base() { } 1562 1563 codecvt_base::result 1564 __codecvt_utf8_utf16_base<wchar_t>:: 1565 do_out(state_type&, const intern_type* __from, const intern_type* __from_end, 1566 const intern_type*& __from_next, 1567 extern_type* __to, extern_type* __to_end, 1568 extern_type*& __to_next) const 1569 { 1570 range<const wchar_t> from{ __from, __from_end }; 1571 range<char> to{ __to, __to_end }; 1572 auto res = utf16_out(from, to, _M_maxcode, _M_mode); 1573 __from_next = from.next; 1574 __to_next = to.next; 1575 return res; 1576 } 1577 1578 codecvt_base::result 1579 __codecvt_utf8_utf16_base<wchar_t>:: 1580 do_unshift(state_type&, extern_type* __to, extern_type*, 1581 extern_type*& __to_next) const 1582 { 1583 __to_next = __to; 1584 return noconv; 1585 } 1586 1587 codecvt_base::result 1588 __codecvt_utf8_utf16_base<wchar_t>:: 1589 do_in(state_type&, const extern_type* __from, const extern_type* __from_end, 1590 const extern_type*& __from_next, 1591 intern_type* __to, intern_type* __to_end, 1592 intern_type*& __to_next) const 1593 { 1594 range<const char> from{ __from, __from_end }; 1595 range<wchar_t> to{ __to, __to_end }; 1596 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); 1597 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ 1598 mode = codecvt_mode(mode | little_endian); 1599 #endif 1600 auto res = utf16_in(from, to, _M_maxcode, mode); 1601 __from_next = from.next; 1602 __to_next = to.next; 1603 return res; 1604 } 1605 1606 int 1607 __codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw() 1608 { return 0; } // UTF-8 is not a fixed-width encoding 1609 1610 bool 1611 __codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw() 1612 { return false; } 1613 1614 int 1615 __codecvt_utf8_utf16_base<wchar_t>:: 1616 do_length(state_type&, const extern_type* __from, 1617 const extern_type* __end, size_t __max) const 1618 { 1619 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode); 1620 return __end - __from; 1621 } 1622 1623 int 1624 __codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw() 1625 { 1626 // A single character can be 1 or 2 UTF-16 code units, 1627 // requiring up to 4 UTF-8 code units. 1628 int max = 4; 1629 if (_M_mode & consume_header) 1630 max += sizeof(utf8_bom); 1631 return max; 1632 } 1633 #endif 1634 1635 inline template class __codecvt_abstract_base<char16_t, char, mbstate_t>; 1636 inline template class __codecvt_abstract_base<char32_t, char, mbstate_t>; 1637 template class codecvt_byname<char16_t, char, mbstate_t>; 1638 template class codecvt_byname<char32_t, char, mbstate_t>; 1639 1640 _GLIBCXX_END_NAMESPACE_VERSION 1641 } 1642 #endif // _GLIBCXX_USE_C99_STDINT_TR1 1643