1//////////////////////////////////////////////////////////// 2// 3// SFML - Simple and Fast Multimedia Library 4// Copyright (C) 2007-2018 Laurent Gomila (laurent@sfml-dev.org) 5// 6// This software is provided 'as-is', without any express or implied warranty. 7// In no event will the authors be held liable for any damages arising from the use of this software. 8// 9// Permission is granted to anyone to use this software for any purpose, 10// including commercial applications, and to alter it and redistribute it freely, 11// subject to the following restrictions: 12// 13// 1. The origin of this software must not be misrepresented; 14// you must not claim that you wrote the original software. 15// If you use this software in a product, an acknowledgment 16// in the product documentation would be appreciated but is not required. 17// 18// 2. Altered source versions must be plainly marked as such, 19// and must not be misrepresented as being the original software. 20// 21// 3. This notice may not be removed or altered from any source distribution. 22// 23//////////////////////////////////////////////////////////// 24 25 26//////////////////////////////////////////////////////////// 27// References: 28// 29// https://www.unicode.org/ 30// https://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c 31// https://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.h 32// https://people.w3.org/rishida/scripts/uniview/conversion 33// 34//////////////////////////////////////////////////////////// 35 36 37//////////////////////////////////////////////////////////// 38template <typename In> 39In Utf<8>::decode(In begin, In end, Uint32& output, Uint32 replacement) 40{ 41 // Some useful precomputed data 42 static const int trailing[256] = 43 { 44 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 45 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 46 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 47 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 49 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 51 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 52 }; 53 static const Uint32 offsets[6] = 54 { 55 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 56 }; 57 58 // decode the character 59 int trailingBytes = trailing[static_cast<Uint8>(*begin)]; 60 if (begin + trailingBytes < end) 61 { 62 output = 0; 63 switch (trailingBytes) 64 { 65 case 5: output += static_cast<Uint8>(*begin++); output <<= 6; 66 case 4: output += static_cast<Uint8>(*begin++); output <<= 6; 67 case 3: output += static_cast<Uint8>(*begin++); output <<= 6; 68 case 2: output += static_cast<Uint8>(*begin++); output <<= 6; 69 case 1: output += static_cast<Uint8>(*begin++); output <<= 6; 70 case 0: output += static_cast<Uint8>(*begin++); 71 } 72 output -= offsets[trailingBytes]; 73 } 74 else 75 { 76 // Incomplete character 77 begin = end; 78 output = replacement; 79 } 80 81 return begin; 82} 83 84 85//////////////////////////////////////////////////////////// 86template <typename Out> 87Out Utf<8>::encode(Uint32 input, Out output, Uint8 replacement) 88{ 89 // Some useful precomputed data 90 static const Uint8 firstBytes[7] = 91 { 92 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC 93 }; 94 95 // encode the character 96 if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDBFF))) 97 { 98 // Invalid character 99 if (replacement) 100 *output++ = replacement; 101 } 102 else 103 { 104 // Valid character 105 106 // Get the number of bytes to write 107 std::size_t bytestoWrite = 1; 108 if (input < 0x80) bytestoWrite = 1; 109 else if (input < 0x800) bytestoWrite = 2; 110 else if (input < 0x10000) bytestoWrite = 3; 111 else if (input <= 0x0010FFFF) bytestoWrite = 4; 112 113 // Extract the bytes to write 114 Uint8 bytes[4]; 115 switch (bytestoWrite) 116 { 117 case 4: bytes[3] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6; 118 case 3: bytes[2] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6; 119 case 2: bytes[1] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6; 120 case 1: bytes[0] = static_cast<Uint8> (input | firstBytes[bytestoWrite]); 121 } 122 123 // Add them to the output 124 output = std::copy(bytes, bytes + bytestoWrite, output); 125 } 126 127 return output; 128} 129 130 131//////////////////////////////////////////////////////////// 132template <typename In> 133In Utf<8>::next(In begin, In end) 134{ 135 Uint32 codepoint; 136 return decode(begin, end, codepoint); 137} 138 139 140//////////////////////////////////////////////////////////// 141template <typename In> 142std::size_t Utf<8>::count(In begin, In end) 143{ 144 std::size_t length = 0; 145 while (begin < end) 146 { 147 begin = next(begin, end); 148 ++length; 149 } 150 151 return length; 152} 153 154 155//////////////////////////////////////////////////////////// 156template <typename In, typename Out> 157Out Utf<8>::fromAnsi(In begin, In end, Out output, const std::locale& locale) 158{ 159 while (begin < end) 160 { 161 Uint32 codepoint = Utf<32>::decodeAnsi(*begin++, locale); 162 output = encode(codepoint, output); 163 } 164 165 return output; 166} 167 168 169//////////////////////////////////////////////////////////// 170template <typename In, typename Out> 171Out Utf<8>::fromWide(In begin, In end, Out output) 172{ 173 while (begin < end) 174 { 175 Uint32 codepoint = Utf<32>::decodeWide(*begin++); 176 output = encode(codepoint, output); 177 } 178 179 return output; 180} 181 182 183//////////////////////////////////////////////////////////// 184template <typename In, typename Out> 185Out Utf<8>::fromLatin1(In begin, In end, Out output) 186{ 187 // Latin-1 is directly compatible with Unicode encodings, 188 // and can thus be treated as (a sub-range of) UTF-32 189 while (begin < end) 190 output = encode(*begin++, output); 191 192 return output; 193} 194 195 196//////////////////////////////////////////////////////////// 197template <typename In, typename Out> 198Out Utf<8>::toAnsi(In begin, In end, Out output, char replacement, const std::locale& locale) 199{ 200 while (begin < end) 201 { 202 Uint32 codepoint; 203 begin = decode(begin, end, codepoint); 204 output = Utf<32>::encodeAnsi(codepoint, output, replacement, locale); 205 } 206 207 return output; 208} 209 210 211//////////////////////////////////////////////////////////// 212template <typename In, typename Out> 213Out Utf<8>::toWide(In begin, In end, Out output, wchar_t replacement) 214{ 215 while (begin < end) 216 { 217 Uint32 codepoint; 218 begin = decode(begin, end, codepoint); 219 output = Utf<32>::encodeWide(codepoint, output, replacement); 220 } 221 222 return output; 223} 224 225 226//////////////////////////////////////////////////////////// 227template <typename In, typename Out> 228Out Utf<8>::toLatin1(In begin, In end, Out output, char replacement) 229{ 230 // Latin-1 is directly compatible with Unicode encodings, 231 // and can thus be treated as (a sub-range of) UTF-32 232 while (begin < end) 233 { 234 Uint32 codepoint; 235 begin = decode(begin, end, codepoint); 236 *output++ = codepoint < 256 ? static_cast<char>(codepoint) : replacement; 237 } 238 239 return output; 240} 241 242 243//////////////////////////////////////////////////////////// 244template <typename In, typename Out> 245Out Utf<8>::toUtf8(In begin, In end, Out output) 246{ 247 return std::copy(begin, end, output); 248} 249 250 251//////////////////////////////////////////////////////////// 252template <typename In, typename Out> 253Out Utf<8>::toUtf16(In begin, In end, Out output) 254{ 255 while (begin < end) 256 { 257 Uint32 codepoint; 258 begin = decode(begin, end, codepoint); 259 output = Utf<16>::encode(codepoint, output); 260 } 261 262 return output; 263} 264 265 266//////////////////////////////////////////////////////////// 267template <typename In, typename Out> 268Out Utf<8>::toUtf32(In begin, In end, Out output) 269{ 270 while (begin < end) 271 { 272 Uint32 codepoint; 273 begin = decode(begin, end, codepoint); 274 *output++ = codepoint; 275 } 276 277 return output; 278} 279 280 281//////////////////////////////////////////////////////////// 282template <typename In> 283In Utf<16>::decode(In begin, In end, Uint32& output, Uint32 replacement) 284{ 285 Uint16 first = *begin++; 286 287 // If it's a surrogate pair, first convert to a single UTF-32 character 288 if ((first >= 0xD800) && (first <= 0xDBFF)) 289 { 290 if (begin < end) 291 { 292 Uint32 second = *begin++; 293 if ((second >= 0xDC00) && (second <= 0xDFFF)) 294 { 295 // The second element is valid: convert the two elements to a UTF-32 character 296 output = static_cast<Uint32>(((first - 0xD800) << 10) + (second - 0xDC00) + 0x0010000); 297 } 298 else 299 { 300 // Invalid character 301 output = replacement; 302 } 303 } 304 else 305 { 306 // Invalid character 307 begin = end; 308 output = replacement; 309 } 310 } 311 else 312 { 313 // We can make a direct copy 314 output = first; 315 } 316 317 return begin; 318} 319 320 321//////////////////////////////////////////////////////////// 322template <typename Out> 323Out Utf<16>::encode(Uint32 input, Out output, Uint16 replacement) 324{ 325 if (input <= 0xFFFF) 326 { 327 // The character can be copied directly, we just need to check if it's in the valid range 328 if ((input >= 0xD800) && (input <= 0xDFFF)) 329 { 330 // Invalid character (this range is reserved) 331 if (replacement) 332 *output++ = replacement; 333 } 334 else 335 { 336 // Valid character directly convertible to a single UTF-16 character 337 *output++ = static_cast<Uint16>(input); 338 } 339 } 340 else if (input > 0x0010FFFF) 341 { 342 // Invalid character (greater than the maximum Unicode value) 343 if (replacement) 344 *output++ = replacement; 345 } 346 else 347 { 348 // The input character will be converted to two UTF-16 elements 349 input -= 0x0010000; 350 *output++ = static_cast<Uint16>((input >> 10) + 0xD800); 351 *output++ = static_cast<Uint16>((input & 0x3FFUL) + 0xDC00); 352 } 353 354 return output; 355} 356 357 358//////////////////////////////////////////////////////////// 359template <typename In> 360In Utf<16>::next(In begin, In end) 361{ 362 Uint32 codepoint; 363 return decode(begin, end, codepoint); 364} 365 366 367//////////////////////////////////////////////////////////// 368template <typename In> 369std::size_t Utf<16>::count(In begin, In end) 370{ 371 std::size_t length = 0; 372 while (begin < end) 373 { 374 begin = next(begin, end); 375 ++length; 376 } 377 378 return length; 379} 380 381 382//////////////////////////////////////////////////////////// 383template <typename In, typename Out> 384Out Utf<16>::fromAnsi(In begin, In end, Out output, const std::locale& locale) 385{ 386 while (begin < end) 387 { 388 Uint32 codepoint = Utf<32>::decodeAnsi(*begin++, locale); 389 output = encode(codepoint, output); 390 } 391 392 return output; 393} 394 395 396//////////////////////////////////////////////////////////// 397template <typename In, typename Out> 398Out Utf<16>::fromWide(In begin, In end, Out output) 399{ 400 while (begin < end) 401 { 402 Uint32 codepoint = Utf<32>::decodeWide(*begin++); 403 output = encode(codepoint, output); 404 } 405 406 return output; 407} 408 409 410//////////////////////////////////////////////////////////// 411template <typename In, typename Out> 412Out Utf<16>::fromLatin1(In begin, In end, Out output) 413{ 414 // Latin-1 is directly compatible with Unicode encodings, 415 // and can thus be treated as (a sub-range of) UTF-32 416 return std::copy(begin, end, output); 417} 418 419 420//////////////////////////////////////////////////////////// 421template <typename In, typename Out> 422Out Utf<16>::toAnsi(In begin, In end, Out output, char replacement, const std::locale& locale) 423{ 424 while (begin < end) 425 { 426 Uint32 codepoint; 427 begin = decode(begin, end, codepoint); 428 output = Utf<32>::encodeAnsi(codepoint, output, replacement, locale); 429 } 430 431 return output; 432} 433 434 435//////////////////////////////////////////////////////////// 436template <typename In, typename Out> 437Out Utf<16>::toWide(In begin, In end, Out output, wchar_t replacement) 438{ 439 while (begin < end) 440 { 441 Uint32 codepoint; 442 begin = decode(begin, end, codepoint); 443 output = Utf<32>::encodeWide(codepoint, output, replacement); 444 } 445 446 return output; 447} 448 449 450//////////////////////////////////////////////////////////// 451template <typename In, typename Out> 452Out Utf<16>::toLatin1(In begin, In end, Out output, char replacement) 453{ 454 // Latin-1 is directly compatible with Unicode encodings, 455 // and can thus be treated as (a sub-range of) UTF-32 456 while (begin < end) 457 { 458 *output++ = *begin < 256 ? static_cast<char>(*begin) : replacement; 459 begin++; 460 } 461 462 return output; 463} 464 465 466//////////////////////////////////////////////////////////// 467template <typename In, typename Out> 468Out Utf<16>::toUtf8(In begin, In end, Out output) 469{ 470 while (begin < end) 471 { 472 Uint32 codepoint; 473 begin = decode(begin, end, codepoint); 474 output = Utf<8>::encode(codepoint, output); 475 } 476 477 return output; 478} 479 480 481//////////////////////////////////////////////////////////// 482template <typename In, typename Out> 483Out Utf<16>::toUtf16(In begin, In end, Out output) 484{ 485 return std::copy(begin, end, output); 486} 487 488 489//////////////////////////////////////////////////////////// 490template <typename In, typename Out> 491Out Utf<16>::toUtf32(In begin, In end, Out output) 492{ 493 while (begin < end) 494 { 495 Uint32 codepoint; 496 begin = decode(begin, end, codepoint); 497 *output++ = codepoint; 498 } 499 500 return output; 501} 502 503 504//////////////////////////////////////////////////////////// 505template <typename In> 506In Utf<32>::decode(In begin, In /*end*/, Uint32& output, Uint32 /*replacement*/) 507{ 508 output = *begin++; 509 return begin; 510} 511 512 513//////////////////////////////////////////////////////////// 514template <typename Out> 515Out Utf<32>::encode(Uint32 input, Out output, Uint32 /*replacement*/) 516{ 517 *output++ = input; 518 return output; 519} 520 521 522//////////////////////////////////////////////////////////// 523template <typename In> 524In Utf<32>::next(In begin, In /*end*/) 525{ 526 return ++begin; 527} 528 529 530//////////////////////////////////////////////////////////// 531template <typename In> 532std::size_t Utf<32>::count(In begin, In end) 533{ 534 return begin - end; 535} 536 537 538//////////////////////////////////////////////////////////// 539template <typename In, typename Out> 540Out Utf<32>::fromAnsi(In begin, In end, Out output, const std::locale& locale) 541{ 542 while (begin < end) 543 *output++ = decodeAnsi(*begin++, locale); 544 545 return output; 546} 547 548 549//////////////////////////////////////////////////////////// 550template <typename In, typename Out> 551Out Utf<32>::fromWide(In begin, In end, Out output) 552{ 553 while (begin < end) 554 *output++ = decodeWide(*begin++); 555 556 return output; 557} 558 559 560//////////////////////////////////////////////////////////// 561template <typename In, typename Out> 562Out Utf<32>::fromLatin1(In begin, In end, Out output) 563{ 564 // Latin-1 is directly compatible with Unicode encodings, 565 // and can thus be treated as (a sub-range of) UTF-32 566 return std::copy(begin, end, output); 567} 568 569 570//////////////////////////////////////////////////////////// 571template <typename In, typename Out> 572Out Utf<32>::toAnsi(In begin, In end, Out output, char replacement, const std::locale& locale) 573{ 574 while (begin < end) 575 output = encodeAnsi(*begin++, output, replacement, locale); 576 577 return output; 578} 579 580 581//////////////////////////////////////////////////////////// 582template <typename In, typename Out> 583Out Utf<32>::toWide(In begin, In end, Out output, wchar_t replacement) 584{ 585 while (begin < end) 586 output = encodeWide(*begin++, output, replacement); 587 588 return output; 589} 590 591 592//////////////////////////////////////////////////////////// 593template <typename In, typename Out> 594Out Utf<32>::toLatin1(In begin, In end, Out output, char replacement) 595{ 596 // Latin-1 is directly compatible with Unicode encodings, 597 // and can thus be treated as (a sub-range of) UTF-32 598 while (begin < end) 599 { 600 *output++ = *begin < 256 ? static_cast<char>(*begin) : replacement; 601 begin++; 602 } 603 604 return output; 605} 606 607 608//////////////////////////////////////////////////////////// 609template <typename In, typename Out> 610Out Utf<32>::toUtf8(In begin, In end, Out output) 611{ 612 while (begin < end) 613 output = Utf<8>::encode(*begin++, output); 614 615 return output; 616} 617 618//////////////////////////////////////////////////////////// 619template <typename In, typename Out> 620Out Utf<32>::toUtf16(In begin, In end, Out output) 621{ 622 while (begin < end) 623 output = Utf<16>::encode(*begin++, output); 624 625 return output; 626} 627 628 629//////////////////////////////////////////////////////////// 630template <typename In, typename Out> 631Out Utf<32>::toUtf32(In begin, In end, Out output) 632{ 633 return std::copy(begin, end, output); 634} 635 636 637//////////////////////////////////////////////////////////// 638template <typename In> 639Uint32 Utf<32>::decodeAnsi(In input, const std::locale& locale) 640{ 641 // On Windows, GCC's standard library (glibc++) has almost 642 // no support for Unicode stuff. As a consequence, in this 643 // context we can only use the default locale and ignore 644 // the one passed as parameter. 645 646 #if defined(SFML_SYSTEM_WINDOWS) && /* if Windows ... */ \ 647 (defined(__GLIBCPP__) || defined (__GLIBCXX__)) && /* ... and standard library is glibc++ ... */ \ 648 !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) /* ... and STLPort is not used on top of it */ 649 650 (void)locale; // to avoid warnings 651 652 wchar_t character = 0; 653 mbtowc(&character, &input, 1); 654 return static_cast<Uint32>(character); 655 656 #else 657 658 // Get the facet of the locale which deals with character conversion 659 const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale); 660 661 // Use the facet to convert each character of the input string 662 return static_cast<Uint32>(facet.widen(input)); 663 664 #endif 665} 666 667 668//////////////////////////////////////////////////////////// 669template <typename In> 670Uint32 Utf<32>::decodeWide(In input) 671{ 672 // The encoding of wide characters is not well defined and is left to the system; 673 // however we can safely assume that it is UCS-2 on Windows and 674 // UCS-4 on Unix systems. 675 // In both cases, a simple copy is enough (UCS-2 is a subset of UCS-4, 676 // and UCS-4 *is* UTF-32). 677 678 return input; 679} 680 681 682//////////////////////////////////////////////////////////// 683template <typename Out> 684Out Utf<32>::encodeAnsi(Uint32 codepoint, Out output, char replacement, const std::locale& locale) 685{ 686 // On Windows, gcc's standard library (glibc++) has almost 687 // no support for Unicode stuff. As a consequence, in this 688 // context we can only use the default locale and ignore 689 // the one passed as parameter. 690 691 #if defined(SFML_SYSTEM_WINDOWS) && /* if Windows ... */ \ 692 (defined(__GLIBCPP__) || defined (__GLIBCXX__)) && /* ... and standard library is glibc++ ... */ \ 693 !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) /* ... and STLPort is not used on top of it */ 694 695 (void)locale; // to avoid warnings 696 697 char character = 0; 698 if (wctomb(&character, static_cast<wchar_t>(codepoint)) >= 0) 699 *output++ = character; 700 else if (replacement) 701 *output++ = replacement; 702 703 return output; 704 705 #else 706 707 // Get the facet of the locale which deals with character conversion 708 const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale); 709 710 // Use the facet to convert each character of the input string 711 *output++ = facet.narrow(static_cast<wchar_t>(codepoint), replacement); 712 713 return output; 714 715 #endif 716} 717 718 719//////////////////////////////////////////////////////////// 720template <typename Out> 721Out Utf<32>::encodeWide(Uint32 codepoint, Out output, wchar_t replacement) 722{ 723 // The encoding of wide characters is not well defined and is left to the system; 724 // however we can safely assume that it is UCS-2 on Windows and 725 // UCS-4 on Unix systems. 726 // For UCS-2 we need to check if the source characters fits in (UCS-2 is a subset of UCS-4). 727 // For UCS-4 we can do a direct copy (UCS-4 *is* UTF-32). 728 729 switch (sizeof(wchar_t)) 730 { 731 case 4: 732 { 733 *output++ = static_cast<wchar_t>(codepoint); 734 break; 735 } 736 737 default: 738 { 739 if ((codepoint <= 0xFFFF) && ((codepoint < 0xD800) || (codepoint > 0xDFFF))) 740 { 741 *output++ = static_cast<wchar_t>(codepoint); 742 } 743 else if (replacement) 744 { 745 *output++ = replacement; 746 } 747 break; 748 } 749 } 750 751 return output; 752} 753