10b57cec5SDimitry Andric /*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
20b57cec5SDimitry Andric  *
30b57cec5SDimitry Andric  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric  * See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric  *
70b57cec5SDimitry Andric  *===------------------------------------------------------------------------=*/
80b57cec5SDimitry Andric /*
9bdd1243dSDimitry Andric  * Copyright © 1991-2015 Unicode, Inc. All rights reserved.
10bdd1243dSDimitry Andric  * Distributed under the Terms of Use in
11bdd1243dSDimitry Andric  * http://www.unicode.org/copyright.html.
120b57cec5SDimitry Andric  *
13bdd1243dSDimitry Andric  * Permission is hereby granted, free of charge, to any person obtaining
14bdd1243dSDimitry Andric  * a copy of the Unicode data files and any associated documentation
15bdd1243dSDimitry Andric  * (the "Data Files") or Unicode software and any associated documentation
16bdd1243dSDimitry Andric  * (the "Software") to deal in the Data Files or Software
17bdd1243dSDimitry Andric  * without restriction, including without limitation the rights to use,
18bdd1243dSDimitry Andric  * copy, modify, merge, publish, distribute, and/or sell copies of
19bdd1243dSDimitry Andric  * the Data Files or Software, and to permit persons to whom the Data Files
20bdd1243dSDimitry Andric  * or Software are furnished to do so, provided that
21bdd1243dSDimitry Andric  * (a) this copyright and permission notice appear with all copies
22bdd1243dSDimitry Andric  * of the Data Files or Software,
23bdd1243dSDimitry Andric  * (b) this copyright and permission notice appear in associated
24bdd1243dSDimitry Andric  * documentation, and
25bdd1243dSDimitry Andric  * (c) there is clear notice in each modified Data File or in the Software
26bdd1243dSDimitry Andric  * as well as in the documentation associated with the Data File(s) or
27bdd1243dSDimitry Andric  * Software that the data or software has been modified.
280b57cec5SDimitry Andric  *
29bdd1243dSDimitry Andric  * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
30bdd1243dSDimitry Andric  * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
31bdd1243dSDimitry Andric  * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
32bdd1243dSDimitry Andric  * NONINFRINGEMENT OF THIRD PARTY RIGHTS.
33bdd1243dSDimitry Andric  * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
34bdd1243dSDimitry Andric  * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
35bdd1243dSDimitry Andric  * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
36bdd1243dSDimitry Andric  * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
37bdd1243dSDimitry Andric  * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
38bdd1243dSDimitry Andric  * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
390b57cec5SDimitry Andric  *
40bdd1243dSDimitry Andric  * Except as contained in this notice, the name of a copyright holder
41bdd1243dSDimitry Andric  * shall not be used in advertising or otherwise to promote the sale,
42bdd1243dSDimitry Andric  * use or other dealings in these Data Files or Software without prior
43bdd1243dSDimitry Andric  * written authorization of the copyright holder.
440b57cec5SDimitry Andric  */
450b57cec5SDimitry Andric 
460b57cec5SDimitry Andric /* ---------------------------------------------------------------------
470b57cec5SDimitry Andric 
480b57cec5SDimitry Andric     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
490b57cec5SDimitry Andric     Author: Mark E. Davis, 1994.
500b57cec5SDimitry Andric     Rev History: Rick McGowan, fixes & updates May 2001.
510b57cec5SDimitry Andric     Sept 2001: fixed const & error conditions per
520b57cec5SDimitry Andric         mods suggested by S. Parent & A. Lillich.
530b57cec5SDimitry Andric     June 2002: Tim Dodd added detection and handling of incomplete
540b57cec5SDimitry Andric         source sequences, enhanced error detection, added casts
550b57cec5SDimitry Andric         to eliminate compiler warnings.
560b57cec5SDimitry Andric     July 2003: slight mods to back out aggressive FFFE detection.
570b57cec5SDimitry Andric     Jan 2004: updated switches in from-UTF8 conversions.
580b57cec5SDimitry Andric     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
590b57cec5SDimitry Andric 
600b57cec5SDimitry Andric     See the header file "ConvertUTF.h" for complete documentation.
610b57cec5SDimitry Andric 
620b57cec5SDimitry Andric ------------------------------------------------------------------------ */
630b57cec5SDimitry Andric 
640b57cec5SDimitry Andric #include "llvm/Support/ConvertUTF.h"
650b57cec5SDimitry Andric #ifdef CVTUTF_DEBUG
660b57cec5SDimitry Andric #include <stdio.h>
670b57cec5SDimitry Andric #endif
680b57cec5SDimitry Andric #include <assert.h>
690b57cec5SDimitry Andric 
700b57cec5SDimitry Andric /*
710b57cec5SDimitry Andric  * This code extensively uses fall-through switches.
720b57cec5SDimitry Andric  * Keep the compiler from warning about that.
730b57cec5SDimitry Andric  */
740b57cec5SDimitry Andric #if defined(__clang__) && defined(__has_warning)
750b57cec5SDimitry Andric # if __has_warning("-Wimplicit-fallthrough")
760b57cec5SDimitry Andric #  define ConvertUTF_DISABLE_WARNINGS \
770b57cec5SDimitry Andric     _Pragma("clang diagnostic push")  \
780b57cec5SDimitry Andric     _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
790b57cec5SDimitry Andric #  define ConvertUTF_RESTORE_WARNINGS \
800b57cec5SDimitry Andric     _Pragma("clang diagnostic pop")
810b57cec5SDimitry Andric # endif
820b57cec5SDimitry Andric #elif defined(__GNUC__) && __GNUC__ > 6
830b57cec5SDimitry Andric # define ConvertUTF_DISABLE_WARNINGS \
840b57cec5SDimitry Andric    _Pragma("GCC diagnostic push")    \
850b57cec5SDimitry Andric    _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
860b57cec5SDimitry Andric # define ConvertUTF_RESTORE_WARNINGS \
870b57cec5SDimitry Andric    _Pragma("GCC diagnostic pop")
880b57cec5SDimitry Andric #endif
890b57cec5SDimitry Andric #ifndef ConvertUTF_DISABLE_WARNINGS
900b57cec5SDimitry Andric # define ConvertUTF_DISABLE_WARNINGS
910b57cec5SDimitry Andric #endif
920b57cec5SDimitry Andric #ifndef ConvertUTF_RESTORE_WARNINGS
930b57cec5SDimitry Andric # define ConvertUTF_RESTORE_WARNINGS
940b57cec5SDimitry Andric #endif
950b57cec5SDimitry Andric 
960b57cec5SDimitry Andric ConvertUTF_DISABLE_WARNINGS
970b57cec5SDimitry Andric 
980b57cec5SDimitry Andric namespace llvm {
990b57cec5SDimitry Andric 
1000b57cec5SDimitry Andric static const int halfShift  = 10; /* used for shifting by 10 bits */
1010b57cec5SDimitry Andric 
1020b57cec5SDimitry Andric static const UTF32 halfBase = 0x0010000UL;
1030b57cec5SDimitry Andric static const UTF32 halfMask = 0x3FFUL;
1040b57cec5SDimitry Andric 
1050b57cec5SDimitry Andric #define UNI_SUR_HIGH_START  (UTF32)0xD800
1060b57cec5SDimitry Andric #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
1070b57cec5SDimitry Andric #define UNI_SUR_LOW_START   (UTF32)0xDC00
1080b57cec5SDimitry Andric #define UNI_SUR_LOW_END     (UTF32)0xDFFF
1090b57cec5SDimitry Andric 
1100b57cec5SDimitry Andric /* --------------------------------------------------------------------- */
1110b57cec5SDimitry Andric 
/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The
 * table is left as-is for anyone who may want to do such conversion, which
 * was allowed in earlier algorithms.
 */
1190b57cec5SDimitry Andric static const char trailingBytesForUTF8[256] = {
1200b57cec5SDimitry Andric     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1210b57cec5SDimitry Andric     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1220b57cec5SDimitry Andric     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1230b57cec5SDimitry Andric     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1240b57cec5SDimitry Andric     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1250b57cec5SDimitry Andric     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1260b57cec5SDimitry Andric     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1270b57cec5SDimitry Andric     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
1280b57cec5SDimitry Andric };
1290b57cec5SDimitry Andric 
1300b57cec5SDimitry Andric /*
1310b57cec5SDimitry Andric  * Magic values subtracted from a buffer value during UTF8 conversion.
1320b57cec5SDimitry Andric  * This table contains as many values as there might be trailing bytes
1330b57cec5SDimitry Andric  * in a UTF-8 sequence.
1340b57cec5SDimitry Andric  */
1350b57cec5SDimitry Andric static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
1360b57cec5SDimitry Andric                      0x03C82080UL, 0xFA082080UL, 0x82082080UL };
1370b57cec5SDimitry Andric 
/*
 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
 * into the first byte, depending on how many bytes follow.  There are
 * as many entries in this table as there are UTF-8 sequence types.
 * (I.e., one byte sequence, two byte... etc.). Remember that sequences
 * for *legal* UTF-8 will be 4 or fewer bytes total.
 */
1450b57cec5SDimitry Andric static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
1460b57cec5SDimitry Andric 
1470b57cec5SDimitry Andric /* --------------------------------------------------------------------- */
1480b57cec5SDimitry Andric 
1490b57cec5SDimitry Andric /* The interface converts a whole buffer to avoid function-call overhead.
1500b57cec5SDimitry Andric  * Constants have been gathered. Loops & conditionals have been removed as
1510b57cec5SDimitry Andric  * much as possible for efficiency, in favor of drop-through switches.
1520b57cec5SDimitry Andric  * (See "Note A" at the bottom of the file for equivalent code.)
1530b57cec5SDimitry Andric  * If your compiler supports it, the "isLegalUTF8" call can be turned
1540b57cec5SDimitry Andric  * into an inline function.
1550b57cec5SDimitry Andric  */
1560b57cec5SDimitry Andric 
1570b57cec5SDimitry Andric 
1580b57cec5SDimitry Andric /* --------------------------------------------------------------------- */
1590b57cec5SDimitry Andric 
ConvertUTF32toUTF16(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)1600b57cec5SDimitry Andric ConversionResult ConvertUTF32toUTF16 (
1610b57cec5SDimitry Andric         const UTF32** sourceStart, const UTF32* sourceEnd,
1620b57cec5SDimitry Andric         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
1630b57cec5SDimitry Andric     ConversionResult result = conversionOK;
1640b57cec5SDimitry Andric     const UTF32* source = *sourceStart;
1650b57cec5SDimitry Andric     UTF16* target = *targetStart;
1660b57cec5SDimitry Andric     while (source < sourceEnd) {
1670b57cec5SDimitry Andric         UTF32 ch;
1680b57cec5SDimitry Andric         if (target >= targetEnd) {
1690b57cec5SDimitry Andric             result = targetExhausted; break;
1700b57cec5SDimitry Andric         }
1710b57cec5SDimitry Andric         ch = *source++;
1720b57cec5SDimitry Andric         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
1730b57cec5SDimitry Andric             /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
1740b57cec5SDimitry Andric             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
1750b57cec5SDimitry Andric                 if (flags == strictConversion) {
1760b57cec5SDimitry Andric                     --source; /* return to the illegal value itself */
1770b57cec5SDimitry Andric                     result = sourceIllegal;
1780b57cec5SDimitry Andric                     break;
1790b57cec5SDimitry Andric                 } else {
1800b57cec5SDimitry Andric                     *target++ = UNI_REPLACEMENT_CHAR;
1810b57cec5SDimitry Andric                 }
1820b57cec5SDimitry Andric             } else {
1830b57cec5SDimitry Andric                 *target++ = (UTF16)ch; /* normal case */
1840b57cec5SDimitry Andric             }
1850b57cec5SDimitry Andric         } else if (ch > UNI_MAX_LEGAL_UTF32) {
1860b57cec5SDimitry Andric             if (flags == strictConversion) {
1870b57cec5SDimitry Andric                 result = sourceIllegal;
1880b57cec5SDimitry Andric             } else {
1890b57cec5SDimitry Andric                 *target++ = UNI_REPLACEMENT_CHAR;
1900b57cec5SDimitry Andric             }
1910b57cec5SDimitry Andric         } else {
1920b57cec5SDimitry Andric             /* target is a character in range 0xFFFF - 0x10FFFF. */
1930b57cec5SDimitry Andric             if (target + 1 >= targetEnd) {
1940b57cec5SDimitry Andric                 --source; /* Back up source pointer! */
1950b57cec5SDimitry Andric                 result = targetExhausted; break;
1960b57cec5SDimitry Andric             }
1970b57cec5SDimitry Andric             ch -= halfBase;
1980b57cec5SDimitry Andric             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
1990b57cec5SDimitry Andric             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
2000b57cec5SDimitry Andric         }
2010b57cec5SDimitry Andric     }
2020b57cec5SDimitry Andric     *sourceStart = source;
2030b57cec5SDimitry Andric     *targetStart = target;
2040b57cec5SDimitry Andric     return result;
2050b57cec5SDimitry Andric }
2060b57cec5SDimitry Andric 
2070b57cec5SDimitry Andric /* --------------------------------------------------------------------- */
2080b57cec5SDimitry Andric 
ConvertUTF16toUTF32(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)2090b57cec5SDimitry Andric ConversionResult ConvertUTF16toUTF32 (
2100b57cec5SDimitry Andric         const UTF16** sourceStart, const UTF16* sourceEnd,
2110b57cec5SDimitry Andric         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
2120b57cec5SDimitry Andric     ConversionResult result = conversionOK;
2130b57cec5SDimitry Andric     const UTF16* source = *sourceStart;
2140b57cec5SDimitry Andric     UTF32* target = *targetStart;
2150b57cec5SDimitry Andric     UTF32 ch, ch2;
2160b57cec5SDimitry Andric     while (source < sourceEnd) {
2170b57cec5SDimitry Andric         const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
2180b57cec5SDimitry Andric         ch = *source++;
2190b57cec5SDimitry Andric         /* If we have a surrogate pair, convert to UTF32 first. */
2200b57cec5SDimitry Andric         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
2210b57cec5SDimitry Andric             /* If the 16 bits following the high surrogate are in the source buffer... */
2220b57cec5SDimitry Andric             if (source < sourceEnd) {
2230b57cec5SDimitry Andric                 ch2 = *source;
2240b57cec5SDimitry Andric                 /* If it's a low surrogate, convert to UTF32. */
2250b57cec5SDimitry Andric                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
2260b57cec5SDimitry Andric                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
2270b57cec5SDimitry Andric                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
2280b57cec5SDimitry Andric                     ++source;
2290b57cec5SDimitry Andric                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
2300b57cec5SDimitry Andric                     --source; /* return to the illegal value itself */
2310b57cec5SDimitry Andric                     result = sourceIllegal;
2320b57cec5SDimitry Andric                     break;
2330b57cec5SDimitry Andric                 }
2340b57cec5SDimitry Andric             } else { /* We don't have the 16 bits following the high surrogate. */
2350b57cec5SDimitry Andric                 --source; /* return to the high surrogate */
2360b57cec5SDimitry Andric                 result = sourceExhausted;
2370b57cec5SDimitry Andric                 break;
2380b57cec5SDimitry Andric             }
2390b57cec5SDimitry Andric         } else if (flags == strictConversion) {
2400b57cec5SDimitry Andric             /* UTF-16 surrogate values are illegal in UTF-32 */
2410b57cec5SDimitry Andric             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
2420b57cec5SDimitry Andric                 --source; /* return to the illegal value itself */
2430b57cec5SDimitry Andric                 result = sourceIllegal;
2440b57cec5SDimitry Andric                 break;
2450b57cec5SDimitry Andric             }
2460b57cec5SDimitry Andric         }
2470b57cec5SDimitry Andric         if (target >= targetEnd) {
2480b57cec5SDimitry Andric             source = oldSource; /* Back up source pointer! */
2490b57cec5SDimitry Andric             result = targetExhausted; break;
2500b57cec5SDimitry Andric         }
2510b57cec5SDimitry Andric         *target++ = ch;
2520b57cec5SDimitry Andric     }
2530b57cec5SDimitry Andric     *sourceStart = source;
2540b57cec5SDimitry Andric     *targetStart = target;
2550b57cec5SDimitry Andric #ifdef CVTUTF_DEBUG
2560b57cec5SDimitry Andric if (result == sourceIllegal) {
2570b57cec5SDimitry Andric     fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
2580b57cec5SDimitry Andric     fflush(stderr);
2590b57cec5SDimitry Andric }
2600b57cec5SDimitry Andric #endif
2610b57cec5SDimitry Andric     return result;
2620b57cec5SDimitry Andric }
ConvertUTF16toUTF8(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)2630b57cec5SDimitry Andric ConversionResult ConvertUTF16toUTF8 (
2640b57cec5SDimitry Andric         const UTF16** sourceStart, const UTF16* sourceEnd,
2650b57cec5SDimitry Andric         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
2660b57cec5SDimitry Andric     ConversionResult result = conversionOK;
2670b57cec5SDimitry Andric     const UTF16* source = *sourceStart;
2680b57cec5SDimitry Andric     UTF8* target = *targetStart;
2690b57cec5SDimitry Andric     while (source < sourceEnd) {
2700b57cec5SDimitry Andric         UTF32 ch;
2710b57cec5SDimitry Andric         unsigned short bytesToWrite = 0;
2720b57cec5SDimitry Andric         const UTF32 byteMask = 0xBF;
2730b57cec5SDimitry Andric         const UTF32 byteMark = 0x80;
2740b57cec5SDimitry Andric         const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
2750b57cec5SDimitry Andric         ch = *source++;
2760b57cec5SDimitry Andric         /* If we have a surrogate pair, convert to UTF32 first. */
2770b57cec5SDimitry Andric         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
2780b57cec5SDimitry Andric             /* If the 16 bits following the high surrogate are in the source buffer... */
2790b57cec5SDimitry Andric             if (source < sourceEnd) {
2800b57cec5SDimitry Andric                 UTF32 ch2 = *source;
2810b57cec5SDimitry Andric                 /* If it's a low surrogate, convert to UTF32. */
2820b57cec5SDimitry Andric                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
2830b57cec5SDimitry Andric                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
2840b57cec5SDimitry Andric                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
2850b57cec5SDimitry Andric                     ++source;
2860b57cec5SDimitry Andric                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
2870b57cec5SDimitry Andric                     --source; /* return to the illegal value itself */
2880b57cec5SDimitry Andric                     result = sourceIllegal;
2890b57cec5SDimitry Andric                     break;
2900b57cec5SDimitry Andric                 }
2910b57cec5SDimitry Andric             } else { /* We don't have the 16 bits following the high surrogate. */
2920b57cec5SDimitry Andric                 --source; /* return to the high surrogate */
2930b57cec5SDimitry Andric                 result = sourceExhausted;
2940b57cec5SDimitry Andric                 break;
2950b57cec5SDimitry Andric             }
2960b57cec5SDimitry Andric         } else if (flags == strictConversion) {
2970b57cec5SDimitry Andric             /* UTF-16 surrogate values are illegal in UTF-32 */
2980b57cec5SDimitry Andric             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
2990b57cec5SDimitry Andric                 --source; /* return to the illegal value itself */
3000b57cec5SDimitry Andric                 result = sourceIllegal;
3010b57cec5SDimitry Andric                 break;
3020b57cec5SDimitry Andric             }
3030b57cec5SDimitry Andric         }
3040b57cec5SDimitry Andric         /* Figure out how many bytes the result will require */
3050b57cec5SDimitry Andric         if (ch < (UTF32)0x80) {      bytesToWrite = 1;
3060b57cec5SDimitry Andric         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
3070b57cec5SDimitry Andric         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
3080b57cec5SDimitry Andric         } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
3090b57cec5SDimitry Andric         } else {                            bytesToWrite = 3;
3100b57cec5SDimitry Andric                                             ch = UNI_REPLACEMENT_CHAR;
3110b57cec5SDimitry Andric         }
3120b57cec5SDimitry Andric 
3130b57cec5SDimitry Andric         target += bytesToWrite;
3140b57cec5SDimitry Andric         if (target > targetEnd) {
3150b57cec5SDimitry Andric             source = oldSource; /* Back up source pointer! */
3160b57cec5SDimitry Andric             target -= bytesToWrite; result = targetExhausted; break;
3170b57cec5SDimitry Andric         }
3180b57cec5SDimitry Andric         switch (bytesToWrite) { /* note: everything falls through. */
3190b57cec5SDimitry Andric             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3200b57cec5SDimitry Andric             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3210b57cec5SDimitry Andric             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3220b57cec5SDimitry Andric             case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
3230b57cec5SDimitry Andric         }
3240b57cec5SDimitry Andric         target += bytesToWrite;
3250b57cec5SDimitry Andric     }
3260b57cec5SDimitry Andric     *sourceStart = source;
3270b57cec5SDimitry Andric     *targetStart = target;
3280b57cec5SDimitry Andric     return result;
3290b57cec5SDimitry Andric }
3300b57cec5SDimitry Andric 
3310b57cec5SDimitry Andric /* --------------------------------------------------------------------- */
3320b57cec5SDimitry Andric 
ConvertUTF32toUTF8(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)3330b57cec5SDimitry Andric ConversionResult ConvertUTF32toUTF8 (
3340b57cec5SDimitry Andric         const UTF32** sourceStart, const UTF32* sourceEnd,
3350b57cec5SDimitry Andric         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
3360b57cec5SDimitry Andric     ConversionResult result = conversionOK;
3370b57cec5SDimitry Andric     const UTF32* source = *sourceStart;
3380b57cec5SDimitry Andric     UTF8* target = *targetStart;
3390b57cec5SDimitry Andric     while (source < sourceEnd) {
3400b57cec5SDimitry Andric         UTF32 ch;
3410b57cec5SDimitry Andric         unsigned short bytesToWrite = 0;
3420b57cec5SDimitry Andric         const UTF32 byteMask = 0xBF;
3430b57cec5SDimitry Andric         const UTF32 byteMark = 0x80;
3440b57cec5SDimitry Andric         ch = *source++;
3450b57cec5SDimitry Andric         if (flags == strictConversion ) {
3460b57cec5SDimitry Andric             /* UTF-16 surrogate values are illegal in UTF-32 */
3470b57cec5SDimitry Andric             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
3480b57cec5SDimitry Andric                 --source; /* return to the illegal value itself */
3490b57cec5SDimitry Andric                 result = sourceIllegal;
3500b57cec5SDimitry Andric                 break;
3510b57cec5SDimitry Andric             }
3520b57cec5SDimitry Andric         }
3530b57cec5SDimitry Andric         /*
3540b57cec5SDimitry Andric          * Figure out how many bytes the result will require. Turn any
3550b57cec5SDimitry Andric          * illegally large UTF32 things (> Plane 17) into replacement chars.
3560b57cec5SDimitry Andric          */
3570b57cec5SDimitry Andric         if (ch < (UTF32)0x80) {      bytesToWrite = 1;
3580b57cec5SDimitry Andric         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
3590b57cec5SDimitry Andric         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
3600b57cec5SDimitry Andric         } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
3610b57cec5SDimitry Andric         } else {                            bytesToWrite = 3;
3620b57cec5SDimitry Andric                                             ch = UNI_REPLACEMENT_CHAR;
3630b57cec5SDimitry Andric                                             result = sourceIllegal;
3640b57cec5SDimitry Andric         }
3650b57cec5SDimitry Andric 
3660b57cec5SDimitry Andric         target += bytesToWrite;
3670b57cec5SDimitry Andric         if (target > targetEnd) {
3680b57cec5SDimitry Andric             --source; /* Back up source pointer! */
3690b57cec5SDimitry Andric             target -= bytesToWrite; result = targetExhausted; break;
3700b57cec5SDimitry Andric         }
3710b57cec5SDimitry Andric         switch (bytesToWrite) { /* note: everything falls through. */
3720b57cec5SDimitry Andric             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3730b57cec5SDimitry Andric             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3740b57cec5SDimitry Andric             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3750b57cec5SDimitry Andric             case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
3760b57cec5SDimitry Andric         }
3770b57cec5SDimitry Andric         target += bytesToWrite;
3780b57cec5SDimitry Andric     }
3790b57cec5SDimitry Andric     *sourceStart = source;
3800b57cec5SDimitry Andric     *targetStart = target;
3810b57cec5SDimitry Andric     return result;
3820b57cec5SDimitry Andric }
3830b57cec5SDimitry Andric 
3840b57cec5SDimitry Andric /* --------------------------------------------------------------------- */
3850b57cec5SDimitry Andric 
3860b57cec5SDimitry Andric /*
3870b57cec5SDimitry Andric  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
3880b57cec5SDimitry Andric  * This must be called with the length pre-determined by the first byte.
3890b57cec5SDimitry Andric  * If not calling this from ConvertUTF8to*, then the length can be set by:
3900b57cec5SDimitry Andric  *  length = trailingBytesForUTF8[*source]+1;
3910b57cec5SDimitry Andric  * and the sequence is illegal right away if there aren't that many bytes
3920b57cec5SDimitry Andric  * available.
3930b57cec5SDimitry Andric  * If presented with a length > 4, this returns false.  The Unicode
3940b57cec5SDimitry Andric  * definition of UTF-8 goes up to 4-byte sequences.
3950b57cec5SDimitry Andric  */
3960b57cec5SDimitry Andric 
isLegalUTF8(const UTF8 * source,int length)3970b57cec5SDimitry Andric static Boolean isLegalUTF8(const UTF8 *source, int length) {
3980b57cec5SDimitry Andric     UTF8 a;
3990b57cec5SDimitry Andric     const UTF8 *srcptr = source+length;
4000b57cec5SDimitry Andric     switch (length) {
4010b57cec5SDimitry Andric     default: return false;
4020b57cec5SDimitry Andric         /* Everything else falls through when "true"... */
4030b57cec5SDimitry Andric     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
4040b57cec5SDimitry Andric     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
4050b57cec5SDimitry Andric     case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
4060b57cec5SDimitry Andric 
4070b57cec5SDimitry Andric         switch (*source) {
4080b57cec5SDimitry Andric             /* no fall-through in this inner switch */
4090b57cec5SDimitry Andric             case 0xE0: if (a < 0xA0) return false; break;
4100b57cec5SDimitry Andric             case 0xED: if (a > 0x9F) return false; break;
4110b57cec5SDimitry Andric             case 0xF0: if (a < 0x90) return false; break;
4120b57cec5SDimitry Andric             case 0xF4: if (a > 0x8F) return false; break;
4130b57cec5SDimitry Andric             default:   if (a < 0x80) return false;
4140b57cec5SDimitry Andric         }
4150b57cec5SDimitry Andric 
4160b57cec5SDimitry Andric     case 1: if (*source >= 0x80 && *source < 0xC2) return false;
4170b57cec5SDimitry Andric     }
4180b57cec5SDimitry Andric     if (*source > 0xF4) return false;
4190b57cec5SDimitry Andric     return true;
4200b57cec5SDimitry Andric }
4210b57cec5SDimitry Andric 
4220b57cec5SDimitry Andric /* --------------------------------------------------------------------- */
4230b57cec5SDimitry Andric 
4240b57cec5SDimitry Andric /*
4250b57cec5SDimitry Andric  * Exported function to return whether a UTF-8 sequence is legal or not.
4260b57cec5SDimitry Andric  * This is not used here; it's just exported.
4270b57cec5SDimitry Andric  */
isLegalUTF8Sequence(const UTF8 * source,const UTF8 * sourceEnd)4280b57cec5SDimitry Andric Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
4290b57cec5SDimitry Andric     int length = trailingBytesForUTF8[*source]+1;
4300b57cec5SDimitry Andric     if (length > sourceEnd - source) {
4310b57cec5SDimitry Andric         return false;
4320b57cec5SDimitry Andric     }
4330b57cec5SDimitry Andric     return isLegalUTF8(source, length);
4340b57cec5SDimitry Andric }
4350b57cec5SDimitry Andric 
436753f127fSDimitry Andric /*
437753f127fSDimitry Andric  * Exported function to return the size of the first utf-8 code unit sequence,
438753f127fSDimitry Andric  * Or 0 if the sequence is not valid;
439753f127fSDimitry Andric  */
getUTF8SequenceSize(const UTF8 * source,const UTF8 * sourceEnd)440753f127fSDimitry Andric unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
441753f127fSDimitry Andric   int length = trailingBytesForUTF8[*source] + 1;
442753f127fSDimitry Andric   return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length
443753f127fSDimitry Andric                                                                        : 0;
444753f127fSDimitry Andric }
445753f127fSDimitry Andric 
4460b57cec5SDimitry Andric /* --------------------------------------------------------------------- */
4470b57cec5SDimitry Andric 
4480b57cec5SDimitry Andric static unsigned
findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 * source,const UTF8 * sourceEnd)4490b57cec5SDimitry Andric findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
4500b57cec5SDimitry Andric                                           const UTF8 *sourceEnd) {
4510b57cec5SDimitry Andric   UTF8 b1, b2, b3;
4520b57cec5SDimitry Andric 
4530b57cec5SDimitry Andric   assert(!isLegalUTF8Sequence(source, sourceEnd));
4540b57cec5SDimitry Andric 
4550b57cec5SDimitry Andric   /*
4560b57cec5SDimitry Andric    * Unicode 6.3.0, D93b:
4570b57cec5SDimitry Andric    *
4580b57cec5SDimitry Andric    *   Maximal subpart of an ill-formed subsequence: The longest code unit
4590b57cec5SDimitry Andric    *   subsequence starting at an unconvertible offset that is either:
4600b57cec5SDimitry Andric    *   a. the initial subsequence of a well-formed code unit sequence, or
4610b57cec5SDimitry Andric    *   b. a subsequence of length one.
4620b57cec5SDimitry Andric    */
4630b57cec5SDimitry Andric 
4640b57cec5SDimitry Andric   if (source == sourceEnd)
4650b57cec5SDimitry Andric     return 0;
4660b57cec5SDimitry Andric 
4670b57cec5SDimitry Andric   /*
4680b57cec5SDimitry Andric    * Perform case analysis.  See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
4690b57cec5SDimitry Andric    * Byte Sequences.
4700b57cec5SDimitry Andric    */
4710b57cec5SDimitry Andric 
4720b57cec5SDimitry Andric   b1 = *source;
4730b57cec5SDimitry Andric   ++source;
4740b57cec5SDimitry Andric   if (b1 >= 0xC2 && b1 <= 0xDF) {
4750b57cec5SDimitry Andric     /*
4760b57cec5SDimitry Andric      * First byte is valid, but we know that this code unit sequence is
4770b57cec5SDimitry Andric      * invalid, so the maximal subpart has to end after the first byte.
4780b57cec5SDimitry Andric      */
4790b57cec5SDimitry Andric     return 1;
4800b57cec5SDimitry Andric   }
4810b57cec5SDimitry Andric 
4820b57cec5SDimitry Andric   if (source == sourceEnd)
4830b57cec5SDimitry Andric     return 1;
4840b57cec5SDimitry Andric 
4850b57cec5SDimitry Andric   b2 = *source;
4860b57cec5SDimitry Andric   ++source;
4870b57cec5SDimitry Andric 
4880b57cec5SDimitry Andric   if (b1 == 0xE0) {
4890b57cec5SDimitry Andric     return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
4900b57cec5SDimitry Andric   }
4910b57cec5SDimitry Andric   if (b1 >= 0xE1 && b1 <= 0xEC) {
4920b57cec5SDimitry Andric     return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
4930b57cec5SDimitry Andric   }
4940b57cec5SDimitry Andric   if (b1 == 0xED) {
4950b57cec5SDimitry Andric     return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
4960b57cec5SDimitry Andric   }
4970b57cec5SDimitry Andric   if (b1 >= 0xEE && b1 <= 0xEF) {
4980b57cec5SDimitry Andric     return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
4990b57cec5SDimitry Andric   }
5000b57cec5SDimitry Andric   if (b1 == 0xF0) {
5010b57cec5SDimitry Andric     if (b2 >= 0x90 && b2 <= 0xBF) {
5020b57cec5SDimitry Andric       if (source == sourceEnd)
5030b57cec5SDimitry Andric         return 2;
5040b57cec5SDimitry Andric 
5050b57cec5SDimitry Andric       b3 = *source;
5060b57cec5SDimitry Andric       return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
5070b57cec5SDimitry Andric     }
5080b57cec5SDimitry Andric     return 1;
5090b57cec5SDimitry Andric   }
5100b57cec5SDimitry Andric   if (b1 >= 0xF1 && b1 <= 0xF3) {
5110b57cec5SDimitry Andric     if (b2 >= 0x80 && b2 <= 0xBF) {
5120b57cec5SDimitry Andric       if (source == sourceEnd)
5130b57cec5SDimitry Andric         return 2;
5140b57cec5SDimitry Andric 
5150b57cec5SDimitry Andric       b3 = *source;
5160b57cec5SDimitry Andric       return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
5170b57cec5SDimitry Andric     }
5180b57cec5SDimitry Andric     return 1;
5190b57cec5SDimitry Andric   }
5200b57cec5SDimitry Andric   if (b1 == 0xF4) {
5210b57cec5SDimitry Andric     if (b2 >= 0x80 && b2 <= 0x8F) {
5220b57cec5SDimitry Andric       if (source == sourceEnd)
5230b57cec5SDimitry Andric         return 2;
5240b57cec5SDimitry Andric 
5250b57cec5SDimitry Andric       b3 = *source;
5260b57cec5SDimitry Andric       return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
5270b57cec5SDimitry Andric     }
5280b57cec5SDimitry Andric     return 1;
5290b57cec5SDimitry Andric   }
5300b57cec5SDimitry Andric 
5310b57cec5SDimitry Andric   assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
5320b57cec5SDimitry Andric   /*
5330b57cec5SDimitry Andric    * There are no valid sequences that start with these bytes.  Maximal subpart
5340b57cec5SDimitry Andric    * is defined to have length 1 in these cases.
5350b57cec5SDimitry Andric    */
5360b57cec5SDimitry Andric   return 1;
5370b57cec5SDimitry Andric }
5380b57cec5SDimitry Andric 
5390b57cec5SDimitry Andric /* --------------------------------------------------------------------- */
5400b57cec5SDimitry Andric 
5410b57cec5SDimitry Andric /*
5420b57cec5SDimitry Andric  * Exported function to return the total number of bytes in a codepoint
5430b57cec5SDimitry Andric  * represented in UTF-8, given the value of the first byte.
5440b57cec5SDimitry Andric  */
getNumBytesForUTF8(UTF8 first)5450b57cec5SDimitry Andric unsigned getNumBytesForUTF8(UTF8 first) {
5460b57cec5SDimitry Andric   return trailingBytesForUTF8[first] + 1;
5470b57cec5SDimitry Andric }
5480b57cec5SDimitry Andric 
5490b57cec5SDimitry Andric /* --------------------------------------------------------------------- */
5500b57cec5SDimitry Andric 
5510b57cec5SDimitry Andric /*
5520b57cec5SDimitry Andric  * Exported function to return whether a UTF-8 string is legal or not.
5530b57cec5SDimitry Andric  * This is not used here; it's just exported.
5540b57cec5SDimitry Andric  */
isLegalUTF8String(const UTF8 ** source,const UTF8 * sourceEnd)5550b57cec5SDimitry Andric Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
5560b57cec5SDimitry Andric     while (*source != sourceEnd) {
5570b57cec5SDimitry Andric         int length = trailingBytesForUTF8[**source] + 1;
5580b57cec5SDimitry Andric         if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
5590b57cec5SDimitry Andric             return false;
5600b57cec5SDimitry Andric         *source += length;
5610b57cec5SDimitry Andric     }
5620b57cec5SDimitry Andric     return true;
5630b57cec5SDimitry Andric }
5640b57cec5SDimitry Andric 
5650b57cec5SDimitry Andric /* --------------------------------------------------------------------- */
5660b57cec5SDimitry Andric 
ConvertUTF8toUTF16(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)5670b57cec5SDimitry Andric ConversionResult ConvertUTF8toUTF16 (
5680b57cec5SDimitry Andric         const UTF8** sourceStart, const UTF8* sourceEnd,
5690b57cec5SDimitry Andric         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
5700b57cec5SDimitry Andric     ConversionResult result = conversionOK;
5710b57cec5SDimitry Andric     const UTF8* source = *sourceStart;
5720b57cec5SDimitry Andric     UTF16* target = *targetStart;
5730b57cec5SDimitry Andric     while (source < sourceEnd) {
5740b57cec5SDimitry Andric         UTF32 ch = 0;
5750b57cec5SDimitry Andric         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
5760b57cec5SDimitry Andric         if (extraBytesToRead >= sourceEnd - source) {
5770b57cec5SDimitry Andric             result = sourceExhausted; break;
5780b57cec5SDimitry Andric         }
5790b57cec5SDimitry Andric         /* Do this check whether lenient or strict */
5800b57cec5SDimitry Andric         if (!isLegalUTF8(source, extraBytesToRead+1)) {
5810b57cec5SDimitry Andric             result = sourceIllegal;
5820b57cec5SDimitry Andric             break;
5830b57cec5SDimitry Andric         }
5840b57cec5SDimitry Andric         /*
5850b57cec5SDimitry Andric          * The cases all fall through. See "Note A" below.
5860b57cec5SDimitry Andric          */
5870b57cec5SDimitry Andric         switch (extraBytesToRead) {
5880b57cec5SDimitry Andric             case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
5890b57cec5SDimitry Andric             case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
5900b57cec5SDimitry Andric             case 3: ch += *source++; ch <<= 6;
5910b57cec5SDimitry Andric             case 2: ch += *source++; ch <<= 6;
5920b57cec5SDimitry Andric             case 1: ch += *source++; ch <<= 6;
5930b57cec5SDimitry Andric             case 0: ch += *source++;
5940b57cec5SDimitry Andric         }
5950b57cec5SDimitry Andric         ch -= offsetsFromUTF8[extraBytesToRead];
5960b57cec5SDimitry Andric 
5970b57cec5SDimitry Andric         if (target >= targetEnd) {
5980b57cec5SDimitry Andric             source -= (extraBytesToRead+1); /* Back up source pointer! */
5990b57cec5SDimitry Andric             result = targetExhausted; break;
6000b57cec5SDimitry Andric         }
6010b57cec5SDimitry Andric         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
6020b57cec5SDimitry Andric             /* UTF-16 surrogate values are illegal in UTF-32 */
6030b57cec5SDimitry Andric             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
6040b57cec5SDimitry Andric                 if (flags == strictConversion) {
6050b57cec5SDimitry Andric                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
6060b57cec5SDimitry Andric                     result = sourceIllegal;
6070b57cec5SDimitry Andric                     break;
6080b57cec5SDimitry Andric                 } else {
6090b57cec5SDimitry Andric                     *target++ = UNI_REPLACEMENT_CHAR;
6100b57cec5SDimitry Andric                 }
6110b57cec5SDimitry Andric             } else {
6120b57cec5SDimitry Andric                 *target++ = (UTF16)ch; /* normal case */
6130b57cec5SDimitry Andric             }
6140b57cec5SDimitry Andric         } else if (ch > UNI_MAX_UTF16) {
6150b57cec5SDimitry Andric             if (flags == strictConversion) {
6160b57cec5SDimitry Andric                 result = sourceIllegal;
6170b57cec5SDimitry Andric                 source -= (extraBytesToRead+1); /* return to the start */
6180b57cec5SDimitry Andric                 break; /* Bail out; shouldn't continue */
6190b57cec5SDimitry Andric             } else {
6200b57cec5SDimitry Andric                 *target++ = UNI_REPLACEMENT_CHAR;
6210b57cec5SDimitry Andric             }
6220b57cec5SDimitry Andric         } else {
6230b57cec5SDimitry Andric             /* target is a character in range 0xFFFF - 0x10FFFF. */
6240b57cec5SDimitry Andric             if (target + 1 >= targetEnd) {
6250b57cec5SDimitry Andric                 source -= (extraBytesToRead+1); /* Back up source pointer! */
6260b57cec5SDimitry Andric                 result = targetExhausted; break;
6270b57cec5SDimitry Andric             }
6280b57cec5SDimitry Andric             ch -= halfBase;
6290b57cec5SDimitry Andric             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
6300b57cec5SDimitry Andric             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
6310b57cec5SDimitry Andric         }
6320b57cec5SDimitry Andric     }
6330b57cec5SDimitry Andric     *sourceStart = source;
6340b57cec5SDimitry Andric     *targetStart = target;
6350b57cec5SDimitry Andric     return result;
6360b57cec5SDimitry Andric }
6370b57cec5SDimitry Andric 
6380b57cec5SDimitry Andric /* --------------------------------------------------------------------- */
6390b57cec5SDimitry Andric 
ConvertUTF8toUTF32Impl(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags,Boolean InputIsPartial)6400b57cec5SDimitry Andric static ConversionResult ConvertUTF8toUTF32Impl(
6410b57cec5SDimitry Andric         const UTF8** sourceStart, const UTF8* sourceEnd,
6420b57cec5SDimitry Andric         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
6430b57cec5SDimitry Andric         Boolean InputIsPartial) {
6440b57cec5SDimitry Andric     ConversionResult result = conversionOK;
6450b57cec5SDimitry Andric     const UTF8* source = *sourceStart;
6460b57cec5SDimitry Andric     UTF32* target = *targetStart;
6470b57cec5SDimitry Andric     while (source < sourceEnd) {
6480b57cec5SDimitry Andric         UTF32 ch = 0;
6490b57cec5SDimitry Andric         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
6500b57cec5SDimitry Andric         if (extraBytesToRead >= sourceEnd - source) {
6510b57cec5SDimitry Andric             if (flags == strictConversion || InputIsPartial) {
6520b57cec5SDimitry Andric                 result = sourceExhausted;
6530b57cec5SDimitry Andric                 break;
6540b57cec5SDimitry Andric             } else {
6550b57cec5SDimitry Andric                 result = sourceIllegal;
6560b57cec5SDimitry Andric 
6570b57cec5SDimitry Andric                 /*
6580b57cec5SDimitry Andric                  * Replace the maximal subpart of ill-formed sequence with
6590b57cec5SDimitry Andric                  * replacement character.
6600b57cec5SDimitry Andric                  */
6610b57cec5SDimitry Andric                 source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
6620b57cec5SDimitry Andric                                                                     sourceEnd);
6630b57cec5SDimitry Andric                 *target++ = UNI_REPLACEMENT_CHAR;
6640b57cec5SDimitry Andric                 continue;
6650b57cec5SDimitry Andric             }
6660b57cec5SDimitry Andric         }
6670b57cec5SDimitry Andric         if (target >= targetEnd) {
6680b57cec5SDimitry Andric             result = targetExhausted; break;
6690b57cec5SDimitry Andric         }
6700b57cec5SDimitry Andric 
6710b57cec5SDimitry Andric         /* Do this check whether lenient or strict */
6720b57cec5SDimitry Andric         if (!isLegalUTF8(source, extraBytesToRead+1)) {
6730b57cec5SDimitry Andric             result = sourceIllegal;
6740b57cec5SDimitry Andric             if (flags == strictConversion) {
6750b57cec5SDimitry Andric                 /* Abort conversion. */
6760b57cec5SDimitry Andric                 break;
6770b57cec5SDimitry Andric             } else {
6780b57cec5SDimitry Andric                 /*
6790b57cec5SDimitry Andric                  * Replace the maximal subpart of ill-formed sequence with
6800b57cec5SDimitry Andric                  * replacement character.
6810b57cec5SDimitry Andric                  */
6820b57cec5SDimitry Andric                 source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
6830b57cec5SDimitry Andric                                                                     sourceEnd);
6840b57cec5SDimitry Andric                 *target++ = UNI_REPLACEMENT_CHAR;
6850b57cec5SDimitry Andric                 continue;
6860b57cec5SDimitry Andric             }
6870b57cec5SDimitry Andric         }
6880b57cec5SDimitry Andric         /*
6890b57cec5SDimitry Andric          * The cases all fall through. See "Note A" below.
6900b57cec5SDimitry Andric          */
6910b57cec5SDimitry Andric         switch (extraBytesToRead) {
6920b57cec5SDimitry Andric             case 5: ch += *source++; ch <<= 6;
6930b57cec5SDimitry Andric             case 4: ch += *source++; ch <<= 6;
6940b57cec5SDimitry Andric             case 3: ch += *source++; ch <<= 6;
6950b57cec5SDimitry Andric             case 2: ch += *source++; ch <<= 6;
6960b57cec5SDimitry Andric             case 1: ch += *source++; ch <<= 6;
6970b57cec5SDimitry Andric             case 0: ch += *source++;
6980b57cec5SDimitry Andric         }
6990b57cec5SDimitry Andric         ch -= offsetsFromUTF8[extraBytesToRead];
7000b57cec5SDimitry Andric 
7010b57cec5SDimitry Andric         if (ch <= UNI_MAX_LEGAL_UTF32) {
7020b57cec5SDimitry Andric             /*
7030b57cec5SDimitry Andric              * UTF-16 surrogate values are illegal in UTF-32, and anything
7040b57cec5SDimitry Andric              * over Plane 17 (> 0x10FFFF) is illegal.
7050b57cec5SDimitry Andric              */
7060b57cec5SDimitry Andric             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
7070b57cec5SDimitry Andric                 if (flags == strictConversion) {
7080b57cec5SDimitry Andric                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
7090b57cec5SDimitry Andric                     result = sourceIllegal;
7100b57cec5SDimitry Andric                     break;
7110b57cec5SDimitry Andric                 } else {
7120b57cec5SDimitry Andric                     *target++ = UNI_REPLACEMENT_CHAR;
7130b57cec5SDimitry Andric                 }
7140b57cec5SDimitry Andric             } else {
7150b57cec5SDimitry Andric                 *target++ = ch;
7160b57cec5SDimitry Andric             }
7170b57cec5SDimitry Andric         } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
7180b57cec5SDimitry Andric             result = sourceIllegal;
7190b57cec5SDimitry Andric             *target++ = UNI_REPLACEMENT_CHAR;
7200b57cec5SDimitry Andric         }
7210b57cec5SDimitry Andric     }
7220b57cec5SDimitry Andric     *sourceStart = source;
7230b57cec5SDimitry Andric     *targetStart = target;
7240b57cec5SDimitry Andric     return result;
7250b57cec5SDimitry Andric }
7260b57cec5SDimitry Andric 
ConvertUTF8toUTF32Partial(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)7270b57cec5SDimitry Andric ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
7280b57cec5SDimitry Andric                                            const UTF8 *sourceEnd,
7290b57cec5SDimitry Andric                                            UTF32 **targetStart,
7300b57cec5SDimitry Andric                                            UTF32 *targetEnd,
7310b57cec5SDimitry Andric                                            ConversionFlags flags) {
7320b57cec5SDimitry Andric   return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
7330b57cec5SDimitry Andric                                 flags, /*InputIsPartial=*/true);
7340b57cec5SDimitry Andric }
7350b57cec5SDimitry Andric 
ConvertUTF8toUTF32(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)7360b57cec5SDimitry Andric ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
7370b57cec5SDimitry Andric                                     const UTF8 *sourceEnd, UTF32 **targetStart,
7380b57cec5SDimitry Andric                                     UTF32 *targetEnd, ConversionFlags flags) {
7390b57cec5SDimitry Andric   return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
7400b57cec5SDimitry Andric                                 flags, /*InputIsPartial=*/false);
7410b57cec5SDimitry Andric }
7420b57cec5SDimitry Andric 
7430b57cec5SDimitry Andric /* ---------------------------------------------------------------------
7440b57cec5SDimitry Andric 
7450b57cec5SDimitry Andric     Note A.
7460b57cec5SDimitry Andric     The fall-through switches in UTF-8 reading code save a
7470b57cec5SDimitry Andric     temp variable, some decrements & conditionals.  The switches
7480b57cec5SDimitry Andric     are equivalent to the following loop:
7490b57cec5SDimitry Andric         {
7500b57cec5SDimitry Andric             int tmpBytesToRead = extraBytesToRead+1;
7510b57cec5SDimitry Andric             do {
7520b57cec5SDimitry Andric                 ch += *source++;
7530b57cec5SDimitry Andric                 --tmpBytesToRead;
7540b57cec5SDimitry Andric                 if (tmpBytesToRead) ch <<= 6;
7550b57cec5SDimitry Andric             } while (tmpBytesToRead > 0);
7560b57cec5SDimitry Andric         }
7570b57cec5SDimitry Andric     In UTF-8 writing code, the switches on "bytesToWrite" are
7580b57cec5SDimitry Andric     similarly unrolled loops.
7590b57cec5SDimitry Andric 
7600b57cec5SDimitry Andric    --------------------------------------------------------------------- */
7610b57cec5SDimitry Andric 
7620b57cec5SDimitry Andric } // namespace llvm
7630b57cec5SDimitry Andric 
7640b57cec5SDimitry Andric ConvertUTF_RESTORE_WARNINGS
765