1////////////////////////////////////////////////////////////
2//
3// SFML - Simple and Fast Multimedia Library
4// Copyright (C) 2007-2018 Laurent Gomila (laurent@sfml-dev.org)
5//
6// This software is provided 'as-is', without any express or implied warranty.
7// In no event will the authors be held liable for any damages arising from the use of this software.
8//
9// Permission is granted to anyone to use this software for any purpose,
10// including commercial applications, and to alter it and redistribute it freely,
11// subject to the following restrictions:
12//
13// 1. The origin of this software must not be misrepresented;
14//    you must not claim that you wrote the original software.
15//    If you use this software in a product, an acknowledgment
16//    in the product documentation would be appreciated but is not required.
17//
18// 2. Altered source versions must be plainly marked as such,
19//    and must not be misrepresented as being the original software.
20//
21// 3. This notice may not be removed or altered from any source distribution.
22//
23////////////////////////////////////////////////////////////
24
25
26////////////////////////////////////////////////////////////
27// References:
28//
29// https://www.unicode.org/
30// https://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
31// https://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.h
32// https://people.w3.org/rishida/scripts/uniview/conversion
33//
34////////////////////////////////////////////////////////////
35
36
37////////////////////////////////////////////////////////////
38template <typename In>
39In Utf<8>::decode(In begin, In end, Uint32& output, Uint32 replacement)
40{
41    // Some useful precomputed data
42    static const int trailing[256] =
43    {
44        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
45        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
46        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
47        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
48        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
49        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
50        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
51        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
52    };
53    static const Uint32 offsets[6] =
54    {
55        0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080
56    };
57
58    // decode the character
59    int trailingBytes = trailing[static_cast<Uint8>(*begin)];
60    if (begin + trailingBytes < end)
61    {
62        output = 0;
63        switch (trailingBytes)
64        {
65            case 5: output += static_cast<Uint8>(*begin++); output <<= 6;
66            case 4: output += static_cast<Uint8>(*begin++); output <<= 6;
67            case 3: output += static_cast<Uint8>(*begin++); output <<= 6;
68            case 2: output += static_cast<Uint8>(*begin++); output <<= 6;
69            case 1: output += static_cast<Uint8>(*begin++); output <<= 6;
70            case 0: output += static_cast<Uint8>(*begin++);
71        }
72        output -= offsets[trailingBytes];
73    }
74    else
75    {
76        // Incomplete character
77        begin = end;
78        output = replacement;
79    }
80
81    return begin;
82}
83
84
85////////////////////////////////////////////////////////////
86template <typename Out>
87Out Utf<8>::encode(Uint32 input, Out output, Uint8 replacement)
88{
89    // Some useful precomputed data
90    static const Uint8 firstBytes[7] =
91    {
92        0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
93    };
94
95    // encode the character
96    if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDBFF)))
97    {
98        // Invalid character
99        if (replacement)
100            *output++ = replacement;
101    }
102    else
103    {
104        // Valid character
105
106        // Get the number of bytes to write
107        std::size_t bytestoWrite = 1;
108        if      (input <  0x80)       bytestoWrite = 1;
109        else if (input <  0x800)      bytestoWrite = 2;
110        else if (input <  0x10000)    bytestoWrite = 3;
111        else if (input <= 0x0010FFFF) bytestoWrite = 4;
112
113        // Extract the bytes to write
114        Uint8 bytes[4];
115        switch (bytestoWrite)
116        {
117            case 4: bytes[3] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6;
118            case 3: bytes[2] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6;
119            case 2: bytes[1] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6;
120            case 1: bytes[0] = static_cast<Uint8> (input | firstBytes[bytestoWrite]);
121        }
122
123        // Add them to the output
124        output = std::copy(bytes, bytes + bytestoWrite, output);
125    }
126
127    return output;
128}
129
130
131////////////////////////////////////////////////////////////
132template <typename In>
133In Utf<8>::next(In begin, In end)
134{
135    Uint32 codepoint;
136    return decode(begin, end, codepoint);
137}
138
139
140////////////////////////////////////////////////////////////
141template <typename In>
142std::size_t Utf<8>::count(In begin, In end)
143{
144    std::size_t length = 0;
145    while (begin < end)
146    {
147        begin = next(begin, end);
148        ++length;
149    }
150
151    return length;
152}
153
154
155////////////////////////////////////////////////////////////
156template <typename In, typename Out>
157Out Utf<8>::fromAnsi(In begin, In end, Out output, const std::locale& locale)
158{
159    while (begin < end)
160    {
161        Uint32 codepoint = Utf<32>::decodeAnsi(*begin++, locale);
162        output = encode(codepoint, output);
163    }
164
165    return output;
166}
167
168
169////////////////////////////////////////////////////////////
170template <typename In, typename Out>
171Out Utf<8>::fromWide(In begin, In end, Out output)
172{
173    while (begin < end)
174    {
175        Uint32 codepoint = Utf<32>::decodeWide(*begin++);
176        output = encode(codepoint, output);
177    }
178
179    return output;
180}
181
182
183////////////////////////////////////////////////////////////
184template <typename In, typename Out>
185Out Utf<8>::fromLatin1(In begin, In end, Out output)
186{
187    // Latin-1 is directly compatible with Unicode encodings,
188    // and can thus be treated as (a sub-range of) UTF-32
189    while (begin < end)
190        output = encode(*begin++, output);
191
192    return output;
193}
194
195
196////////////////////////////////////////////////////////////
197template <typename In, typename Out>
198Out Utf<8>::toAnsi(In begin, In end, Out output, char replacement, const std::locale& locale)
199{
200    while (begin < end)
201    {
202        Uint32 codepoint;
203        begin = decode(begin, end, codepoint);
204        output = Utf<32>::encodeAnsi(codepoint, output, replacement, locale);
205    }
206
207    return output;
208}
209
210
211////////////////////////////////////////////////////////////
212template <typename In, typename Out>
213Out Utf<8>::toWide(In begin, In end, Out output, wchar_t replacement)
214{
215    while (begin < end)
216    {
217        Uint32 codepoint;
218        begin = decode(begin, end, codepoint);
219        output = Utf<32>::encodeWide(codepoint, output, replacement);
220    }
221
222    return output;
223}
224
225
226////////////////////////////////////////////////////////////
227template <typename In, typename Out>
228Out Utf<8>::toLatin1(In begin, In end, Out output, char replacement)
229{
230    // Latin-1 is directly compatible with Unicode encodings,
231    // and can thus be treated as (a sub-range of) UTF-32
232    while (begin < end)
233    {
234        Uint32 codepoint;
235        begin = decode(begin, end, codepoint);
236        *output++ = codepoint < 256 ? static_cast<char>(codepoint) : replacement;
237    }
238
239    return output;
240}
241
242
243////////////////////////////////////////////////////////////
244template <typename In, typename Out>
245Out Utf<8>::toUtf8(In begin, In end, Out output)
246{
247    return std::copy(begin, end, output);
248}
249
250
251////////////////////////////////////////////////////////////
252template <typename In, typename Out>
253Out Utf<8>::toUtf16(In begin, In end, Out output)
254{
255    while (begin < end)
256    {
257        Uint32 codepoint;
258        begin = decode(begin, end, codepoint);
259        output = Utf<16>::encode(codepoint, output);
260    }
261
262    return output;
263}
264
265
266////////////////////////////////////////////////////////////
267template <typename In, typename Out>
268Out Utf<8>::toUtf32(In begin, In end, Out output)
269{
270    while (begin < end)
271    {
272        Uint32 codepoint;
273        begin = decode(begin, end, codepoint);
274        *output++ = codepoint;
275    }
276
277    return output;
278}
279
280
281////////////////////////////////////////////////////////////
282template <typename In>
283In Utf<16>::decode(In begin, In end, Uint32& output, Uint32 replacement)
284{
285    Uint16 first = *begin++;
286
287    // If it's a surrogate pair, first convert to a single UTF-32 character
288    if ((first >= 0xD800) && (first <= 0xDBFF))
289    {
290        if (begin < end)
291        {
292            Uint32 second = *begin++;
293            if ((second >= 0xDC00) && (second <= 0xDFFF))
294            {
295                // The second element is valid: convert the two elements to a UTF-32 character
296                output = static_cast<Uint32>(((first - 0xD800) << 10) + (second - 0xDC00) + 0x0010000);
297            }
298            else
299            {
300                // Invalid character
301                output = replacement;
302            }
303        }
304        else
305        {
306            // Invalid character
307            begin = end;
308            output = replacement;
309        }
310    }
311    else
312    {
313        // We can make a direct copy
314        output = first;
315    }
316
317    return begin;
318}
319
320
321////////////////////////////////////////////////////////////
322template <typename Out>
323Out Utf<16>::encode(Uint32 input, Out output, Uint16 replacement)
324{
325    if (input <= 0xFFFF)
326    {
327        // The character can be copied directly, we just need to check if it's in the valid range
328        if ((input >= 0xD800) && (input <= 0xDFFF))
329        {
330            // Invalid character (this range is reserved)
331            if (replacement)
332                *output++ = replacement;
333        }
334        else
335        {
336            // Valid character directly convertible to a single UTF-16 character
337            *output++ = static_cast<Uint16>(input);
338        }
339    }
340    else if (input > 0x0010FFFF)
341    {
342        // Invalid character (greater than the maximum Unicode value)
343        if (replacement)
344            *output++ = replacement;
345    }
346    else
347    {
348        // The input character will be converted to two UTF-16 elements
349        input -= 0x0010000;
350        *output++ = static_cast<Uint16>((input >> 10)     + 0xD800);
351        *output++ = static_cast<Uint16>((input & 0x3FFUL) + 0xDC00);
352    }
353
354    return output;
355}
356
357
358////////////////////////////////////////////////////////////
359template <typename In>
360In Utf<16>::next(In begin, In end)
361{
362    Uint32 codepoint;
363    return decode(begin, end, codepoint);
364}
365
366
367////////////////////////////////////////////////////////////
368template <typename In>
369std::size_t Utf<16>::count(In begin, In end)
370{
371    std::size_t length = 0;
372    while (begin < end)
373    {
374        begin = next(begin, end);
375        ++length;
376    }
377
378    return length;
379}
380
381
382////////////////////////////////////////////////////////////
383template <typename In, typename Out>
384Out Utf<16>::fromAnsi(In begin, In end, Out output, const std::locale& locale)
385{
386    while (begin < end)
387    {
388        Uint32 codepoint = Utf<32>::decodeAnsi(*begin++, locale);
389        output = encode(codepoint, output);
390    }
391
392    return output;
393}
394
395
396////////////////////////////////////////////////////////////
397template <typename In, typename Out>
398Out Utf<16>::fromWide(In begin, In end, Out output)
399{
400    while (begin < end)
401    {
402        Uint32 codepoint = Utf<32>::decodeWide(*begin++);
403        output = encode(codepoint, output);
404    }
405
406    return output;
407}
408
409
410////////////////////////////////////////////////////////////
411template <typename In, typename Out>
412Out Utf<16>::fromLatin1(In begin, In end, Out output)
413{
414    // Latin-1 is directly compatible with Unicode encodings,
415    // and can thus be treated as (a sub-range of) UTF-32
416    return std::copy(begin, end, output);
417}
418
419
420////////////////////////////////////////////////////////////
421template <typename In, typename Out>
422Out Utf<16>::toAnsi(In begin, In end, Out output, char replacement, const std::locale& locale)
423{
424    while (begin < end)
425    {
426        Uint32 codepoint;
427        begin = decode(begin, end, codepoint);
428        output = Utf<32>::encodeAnsi(codepoint, output, replacement, locale);
429    }
430
431    return output;
432}
433
434
435////////////////////////////////////////////////////////////
436template <typename In, typename Out>
437Out Utf<16>::toWide(In begin, In end, Out output, wchar_t replacement)
438{
439    while (begin < end)
440    {
441        Uint32 codepoint;
442        begin = decode(begin, end, codepoint);
443        output = Utf<32>::encodeWide(codepoint, output, replacement);
444    }
445
446    return output;
447}
448
449
450////////////////////////////////////////////////////////////
451template <typename In, typename Out>
452Out Utf<16>::toLatin1(In begin, In end, Out output, char replacement)
453{
454    // Latin-1 is directly compatible with Unicode encodings,
455    // and can thus be treated as (a sub-range of) UTF-32
456    while (begin < end)
457    {
458        *output++ = *begin < 256 ? static_cast<char>(*begin) : replacement;
459        begin++;
460    }
461
462    return output;
463}
464
465
466////////////////////////////////////////////////////////////
467template <typename In, typename Out>
468Out Utf<16>::toUtf8(In begin, In end, Out output)
469{
470    while (begin < end)
471    {
472        Uint32 codepoint;
473        begin = decode(begin, end, codepoint);
474        output = Utf<8>::encode(codepoint, output);
475    }
476
477    return output;
478}
479
480
481////////////////////////////////////////////////////////////
482template <typename In, typename Out>
483Out Utf<16>::toUtf16(In begin, In end, Out output)
484{
485    return std::copy(begin, end, output);
486}
487
488
489////////////////////////////////////////////////////////////
490template <typename In, typename Out>
491Out Utf<16>::toUtf32(In begin, In end, Out output)
492{
493    while (begin < end)
494    {
495        Uint32 codepoint;
496        begin = decode(begin, end, codepoint);
497        *output++ = codepoint;
498    }
499
500    return output;
501}
502
503
504////////////////////////////////////////////////////////////
505template <typename In>
506In Utf<32>::decode(In begin, In /*end*/, Uint32& output, Uint32 /*replacement*/)
507{
508    output = *begin++;
509    return begin;
510}
511
512
513////////////////////////////////////////////////////////////
514template <typename Out>
515Out Utf<32>::encode(Uint32 input, Out output, Uint32 /*replacement*/)
516{
517    *output++ = input;
518    return output;
519}
520
521
522////////////////////////////////////////////////////////////
523template <typename In>
524In Utf<32>::next(In begin, In /*end*/)
525{
526    return ++begin;
527}
528
529
530////////////////////////////////////////////////////////////
531template <typename In>
532std::size_t Utf<32>::count(In begin, In end)
533{
534    return begin - end;
535}
536
537
538////////////////////////////////////////////////////////////
539template <typename In, typename Out>
540Out Utf<32>::fromAnsi(In begin, In end, Out output, const std::locale& locale)
541{
542    while (begin < end)
543        *output++ = decodeAnsi(*begin++, locale);
544
545    return output;
546}
547
548
549////////////////////////////////////////////////////////////
550template <typename In, typename Out>
551Out Utf<32>::fromWide(In begin, In end, Out output)
552{
553    while (begin < end)
554        *output++ = decodeWide(*begin++);
555
556    return output;
557}
558
559
560////////////////////////////////////////////////////////////
561template <typename In, typename Out>
562Out Utf<32>::fromLatin1(In begin, In end, Out output)
563{
564    // Latin-1 is directly compatible with Unicode encodings,
565    // and can thus be treated as (a sub-range of) UTF-32
566    return std::copy(begin, end, output);
567}
568
569
570////////////////////////////////////////////////////////////
571template <typename In, typename Out>
572Out Utf<32>::toAnsi(In begin, In end, Out output, char replacement, const std::locale& locale)
573{
574    while (begin < end)
575        output = encodeAnsi(*begin++, output, replacement, locale);
576
577    return output;
578}
579
580
581////////////////////////////////////////////////////////////
582template <typename In, typename Out>
583Out Utf<32>::toWide(In begin, In end, Out output, wchar_t replacement)
584{
585    while (begin < end)
586        output = encodeWide(*begin++, output, replacement);
587
588    return output;
589}
590
591
592////////////////////////////////////////////////////////////
593template <typename In, typename Out>
594Out Utf<32>::toLatin1(In begin, In end, Out output, char replacement)
595{
596    // Latin-1 is directly compatible with Unicode encodings,
597    // and can thus be treated as (a sub-range of) UTF-32
598    while (begin < end)
599    {
600        *output++ = *begin < 256 ? static_cast<char>(*begin) : replacement;
601        begin++;
602    }
603
604    return output;
605}
606
607
608////////////////////////////////////////////////////////////
609template <typename In, typename Out>
610Out Utf<32>::toUtf8(In begin, In end, Out output)
611{
612    while (begin < end)
613        output = Utf<8>::encode(*begin++, output);
614
615    return output;
616}
617
618////////////////////////////////////////////////////////////
619template <typename In, typename Out>
620Out Utf<32>::toUtf16(In begin, In end, Out output)
621{
622    while (begin < end)
623        output = Utf<16>::encode(*begin++, output);
624
625    return output;
626}
627
628
629////////////////////////////////////////////////////////////
630template <typename In, typename Out>
631Out Utf<32>::toUtf32(In begin, In end, Out output)
632{
633    return std::copy(begin, end, output);
634}
635
636
637////////////////////////////////////////////////////////////
// Decode a single ANSI character to its UTF-32 value.
//
// input  : the character to convert
// locale : locale whose std::ctype<wchar_t> facet performs the conversion
//          (ignored in the Windows + GCC standard library branch below)
//
// Returns the widened character converted to Uint32.
template <typename In>
Uint32 Utf<32>::decodeAnsi(In input, const std::locale& locale)
{
    // On Windows, GCC's standard library (libstdc++, detected below through
    // the __GLIBCPP__ / __GLIBCXX__ macros) has almost no support for Unicode
    // stuff. As a consequence, in this context we can only use the default
    // locale and ignore the one passed as parameter.

    #if defined(SFML_SYSTEM_WINDOWS) &&                       /* if Windows ... */                          \
       (defined(__GLIBCPP__) || defined (__GLIBCXX__)) &&     /* ... and standard library is glibc++ ... */ \
      !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) /* ... and STLPort is not used on top of it */

        (void)locale; // to avoid warnings

        // Convert exactly one byte with the C library; the return value of
        // mbtowc is not checked, so `character` presumably keeps its initial
        // value 0 when the conversion fails
        // NOTE(review): `&input` is passed as the source, which looks like it
        // requires In to be char in this branch; confirm
        wchar_t character = 0;
        mbtowc(&character, &input, 1);
        return static_cast<Uint32>(character);

    #else

        // Get the facet of the locale which deals with character conversion
        const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale);

        // Use the facet to widen the input character
        return static_cast<Uint32>(facet.widen(input));

    #endif
}
666
667
668////////////////////////////////////////////////////////////
669template <typename In>
670Uint32 Utf<32>::decodeWide(In input)
671{
672    // The encoding of wide characters is not well defined and is left to the system;
673    // however we can safely assume that it is UCS-2 on Windows and
674    // UCS-4 on Unix systems.
675    // In both cases, a simple copy is enough (UCS-2 is a subset of UCS-4,
676    // and UCS-4 *is* UTF-32).
677
678    return input;
679}
680
681
682////////////////////////////////////////////////////////////
// Encode a single UTF-32 codepoint as an ANSI character.
//
// codepoint   : the character to convert
// output      : iterator the resulting character is written to
// replacement : character used when the codepoint cannot be converted
// locale      : locale whose std::ctype<wchar_t> facet performs the conversion
//               (ignored in the Windows + GCC standard library branch below)
//
// Returns the iterator following the last written character.
template <typename Out>
Out Utf<32>::encodeAnsi(Uint32 codepoint, Out output, char replacement, const std::locale& locale)
{
    // On Windows, GCC's standard library (libstdc++, detected below through
    // the __GLIBCPP__ / __GLIBCXX__ macros) has almost no support for Unicode
    // stuff. As a consequence, in this context we can only use the default
    // locale and ignore the one passed as parameter.

    #if defined(SFML_SYSTEM_WINDOWS) &&                       /* if Windows ... */                          \
       (defined(__GLIBCPP__) || defined (__GLIBCXX__)) &&     /* ... and standard library is glibc++ ... */ \
      !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) /* ... and STLPort is not used on top of it */

        (void)locale; // to avoid warnings

        // NOTE(review): wctomb may store up to MB_CUR_MAX bytes, but the
        // destination here is a single char, and any non-negative result is
        // treated as success with only that one char emitted. This looks
        // unsafe for multibyte locales; confirm and consider a MB_LEN_MAX
        // sized buffer.
        char character = 0;
        if (wctomb(&character, static_cast<wchar_t>(codepoint)) >= 0)
            *output++ = character;
        else if (replacement)
            *output++ = replacement; // nothing is written when replacement is 0

        return output;

    #else

        // Get the facet of the locale which deals with character conversion
        const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale);

        // Use the facet to narrow the character; `replacement` is passed as the
        // default value of narrow(), and its result is always written
        *output++ = facet.narrow(static_cast<wchar_t>(codepoint), replacement);

        return output;

    #endif
}
717
718
719////////////////////////////////////////////////////////////
720template <typename Out>
721Out Utf<32>::encodeWide(Uint32 codepoint, Out output, wchar_t replacement)
722{
723    // The encoding of wide characters is not well defined and is left to the system;
724    // however we can safely assume that it is UCS-2 on Windows and
725    // UCS-4 on Unix systems.
726    // For UCS-2 we need to check if the source characters fits in (UCS-2 is a subset of UCS-4).
727    // For UCS-4 we can do a direct copy (UCS-4 *is* UTF-32).
728
729    switch (sizeof(wchar_t))
730    {
731        case 4:
732        {
733            *output++ = static_cast<wchar_t>(codepoint);
734            break;
735        }
736
737        default:
738        {
739            if ((codepoint <= 0xFFFF) && ((codepoint < 0xD800) || (codepoint > 0xDFFF)))
740            {
741                *output++ = static_cast<wchar_t>(codepoint);
742            }
743            else if (replacement)
744            {
745                *output++ = replacement;
746            }
747            break;
748        }
749    }
750
751    return output;
752}
753