lib/Support/ConvertUTF.cpp

0b57cec5SDimitry Andric/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
0b57cec5SDimitry Andric *
0b57cec5SDimitry Andric * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
0b57cec5SDimitry Andric * See https://llvm.org/LICENSE.txt for license information.
0b57cec5SDimitry Andric * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
0b57cec5SDimitry Andric *
0b57cec5SDimitry Andric *===------------------------------------------------------------------------=*/
0b57cec5SDimitry Andric/*
bdd1243dSDimitry Andric * Copyright © 1991-2015 Unicode, Inc. All rights reserved.
bdd1243dSDimitry Andric * Distributed under the Terms of Use in
bdd1243dSDimitry Andric * http://www.unicode.org/copyright.html.
0b57cec5SDimitry Andric *
bdd1243dSDimitry Andric * Permission is hereby granted, free of charge, to any person obtaining
bdd1243dSDimitry Andric * a copy of the Unicode data files and any associated documentation
bdd1243dSDimitry Andric * (the "Data Files") or Unicode software and any associated documentation
bdd1243dSDimitry Andric * (the "Software") to deal in the Data Files or Software
bdd1243dSDimitry Andric * without restriction, including without limitation the rights to use,
bdd1243dSDimitry Andric * copy, modify, merge, publish, distribute, and/or sell copies of
bdd1243dSDimitry Andric * the Data Files or Software, and to permit persons to whom the Data Files
bdd1243dSDimitry Andric * or Software are furnished to do so, provided that
bdd1243dSDimitry Andric * (a) this copyright and permission notice appear with all copies
bdd1243dSDimitry Andric * of the Data Files or Software,
bdd1243dSDimitry Andric * (b) this copyright and permission notice appear in associated
bdd1243dSDimitry Andric * documentation, and
bdd1243dSDimitry Andric * (c) there is clear notice in each modified Data File or in the Software
bdd1243dSDimitry Andric * as well as in the documentation associated with the Data File(s) or
bdd1243dSDimitry Andric * Software that the data or software has been modified.
0b57cec5SDimitry Andric *
bdd1243dSDimitry Andric * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
bdd1243dSDimitry Andric * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
bdd1243dSDimitry Andric * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
bdd1243dSDimitry Andric * NONINFRINGEMENT OF THIRD PARTY RIGHTS.
bdd1243dSDimitry Andric * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
bdd1243dSDimitry Andric * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
bdd1243dSDimitry Andric * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
bdd1243dSDimitry Andric * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
bdd1243dSDimitry Andric * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
bdd1243dSDimitry Andric * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
0b57cec5SDimitry Andric *
bdd1243dSDimitry Andric * Except as contained in this notice, the name of a copyright holder
bdd1243dSDimitry Andric * shall not be used in advertising or otherwise to promote the sale,
bdd1243dSDimitry Andric * use or other dealings in these Data Files or Software without prior
bdd1243dSDimitry Andric * written authorization of the copyright holder.
0b57cec5SDimitry Andric */
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/* ---------------------------------------------------------------------
0b57cec5SDimitry Andric
    Conversions between UTF-32, UTF-16, and UTF-8. Source code file.
0b57cec5SDimitry Andric    Author: Mark E. Davis, 1994.
0b57cec5SDimitry Andric    Rev History: Rick McGowan, fixes & updates May 2001.
0b57cec5SDimitry Andric    Sept 2001: fixed const & error conditions per
0b57cec5SDimitry Andric        mods suggested by S. Parent & A. Lillich.
0b57cec5SDimitry Andric    June 2002: Tim Dodd added detection and handling of incomplete
0b57cec5SDimitry Andric        source sequences, enhanced error detection, added casts
0b57cec5SDimitry Andric        to eliminate compiler warnings.
0b57cec5SDimitry Andric    July 2003: slight mods to back out aggressive FFFE detection.
0b57cec5SDimitry Andric    Jan 2004: updated switches in from-UTF8 conversions.
0b57cec5SDimitry Andric    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric    See the header file "ConvertUTF.h" for complete documentation.
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric------------------------------------------------------------------------ */
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric#include "llvm/Support/ConvertUTF.h"
0b57cec5SDimitry Andric#ifdef CVTUTF_DEBUG
0b57cec5SDimitry Andric#include <stdio.h>
0b57cec5SDimitry Andric#endif
0b57cec5SDimitry Andric#include <assert.h>
0b57cec5SDimitry Andric
/*
 * This code extensively uses fall-through switches.
 * Keep the compiler from warning about that.
 *
 * ConvertUTF_DISABLE_WARNINGS saves the current diagnostic state and then
 * ignores -Wimplicit-fallthrough; ConvertUTF_RESTORE_WARNINGS pops the saved
 * state again (presumably invoked near the end of the file, outside this
 * excerpt).
 */
/* Clang: only touch the diagnostic state if it knows this warning at all. */
#if defined(__clang__) && defined(__has_warning)
# if __has_warning("-Wimplicit-fallthrough")
#  define ConvertUTF_DISABLE_WARNINGS \
    _Pragma("clang diagnostic push")  \
    _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
#  define ConvertUTF_RESTORE_WARNINGS \
    _Pragma("clang diagnostic pop")
# endif
/* GCC: this branch is only taken for major versions above 6. */
#elif defined(__GNUC__) && __GNUC__ > 6
# define ConvertUTF_DISABLE_WARNINGS \
   _Pragma("GCC diagnostic push")    \
   _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
# define ConvertUTF_RESTORE_WARNINGS \
   _Pragma("GCC diagnostic pop")
#endif
/* Any other compiler: both macros expand to nothing. */
#ifndef ConvertUTF_DISABLE_WARNINGS
# define ConvertUTF_DISABLE_WARNINGS
#endif
#ifndef ConvertUTF_RESTORE_WARNINGS
# define ConvertUTF_RESTORE_WARNINGS
#endif

/* Everything from here on is compiled with the warning silenced. */
ConvertUTF_DISABLE_WARNINGS
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricnamespace llvm {
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricstatic const int halfShift  = 10; /* used for shifting by 10 bits */
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricstatic const UTF32 halfBase = 0x0010000UL;
0b57cec5SDimitry Andricstatic const UTF32 halfMask = 0x3FFUL;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric#define UNI_SUR_HIGH_START  (UTF32)0xD800
0b57cec5SDimitry Andric#define UNI_SUR_HIGH_END    (UTF32)0xDBFF
0b57cec5SDimitry Andric#define UNI_SUR_LOW_START   (UTF32)0xDC00
0b57cec5SDimitry Andric#define UNI_SUR_LOW_END     (UTF32)0xDFFF
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/* --------------------------------------------------------------------- */
0b57cec5SDimitry Andric
/*
 * Lookup table keyed by the first byte of a UTF-8 sequence; the value is the
 * number of trailing bytes that are supposed to follow that byte.
 *
 * The table performs no validation: bytes 0x80-0xBF map to 0 and 0xC0/0xC1
 * map to 1 even though none of them can start a legal sequence.  Likewise,
 * *legal* UTF-8 never has 4 or 5 trailing bytes; the 0xF8-0xFF entries are
 * kept for anyone who wants to handle the longer forms that earlier
 * algorithms allowed.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x10 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x20 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x30 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x40 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x50 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x60 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x70 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x80 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x90 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xA0 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xB0 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xC0 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0xD0 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0xE0 */ 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
    /* 0xF0 */ 3,3,3,3, 3,3,3,3, 4,4,4,4, 5,5,5,5
};
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/*
0b57cec5SDimitry Andric * Magic values subtracted from a buffer value during UTF8 conversion.
0b57cec5SDimitry Andric * This table contains as many values as there might be trailing bytes
0b57cec5SDimitry Andric * in a UTF-8 sequence.
0b57cec5SDimitry Andric */
0b57cec5SDimitry Andricstatic const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
0b57cec5SDimitry Andric                     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/*
0b57cec5SDimitry Andric * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
0b57cec5SDimitry Andric * into the first byte, depending on how many bytes follow.  There are
0b57cec5SDimitry Andric * as many entries in this table as there are UTF-8 sequence types.
0b57cec5SDimitry Andric * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
0b57cec5SDimitry Andric * for *legal* UTF-8 will be 4 or fewer bytes total.
0b57cec5SDimitry Andric */
0b57cec5SDimitry Andricstatic const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/* --------------------------------------------------------------------- */
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/* The interface converts a whole buffer to avoid function-call overhead.
0b57cec5SDimitry Andric * Constants have been gathered. Loops & conditionals have been removed as
0b57cec5SDimitry Andric * much as possible for efficiency, in favor of drop-through switches.
0b57cec5SDimitry Andric * (See "Note A" at the bottom of the file for equivalent code.)
0b57cec5SDimitry Andric * If your compiler supports it, the "isLegalUTF8" call can be turned
0b57cec5SDimitry Andric * into an inline function.
0b57cec5SDimitry Andric */
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/* --------------------------------------------------------------------- */
0b57cec5SDimitry Andric
0b57cec5SDimitry AndricConversionResult ConvertUTF32toUTF16 (
0b57cec5SDimitry Andric        const UTF32** sourceStart, const UTF32* sourceEnd,
0b57cec5SDimitry Andric        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
0b57cec5SDimitry Andric    ConversionResult result = conversionOK;
0b57cec5SDimitry Andric    const UTF32* source = *sourceStart;
0b57cec5SDimitry Andric    UTF16* target = *targetStart;
0b57cec5SDimitry Andric    while (source < sourceEnd) {
0b57cec5SDimitry Andric        UTF32 ch;
0b57cec5SDimitry Andric        if (target >= targetEnd) {
0b57cec5SDimitry Andric            result = targetExhausted; break;
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric        ch = *source++;
0b57cec5SDimitry Andric        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
0b57cec5SDimitry Andric            /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
0b57cec5SDimitry Andric            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
0b57cec5SDimitry Andric                if (flags == strictConversion) {
0b57cec5SDimitry Andric                    --source; /* return to the illegal value itself */
0b57cec5SDimitry Andric                    result = sourceIllegal;
0b57cec5SDimitry Andric                    break;
0b57cec5SDimitry Andric                } else {
0b57cec5SDimitry Andric                    *target++ = UNI_REPLACEMENT_CHAR;
0b57cec5SDimitry Andric                }
0b57cec5SDimitry Andric            } else {
0b57cec5SDimitry Andric                *target++ = (UTF16)ch; /* normal case */
0b57cec5SDimitry Andric            }
0b57cec5SDimitry Andric        } else if (ch > UNI_MAX_LEGAL_UTF32) {
0b57cec5SDimitry Andric            if (flags == strictConversion) {
0b57cec5SDimitry Andric                result = sourceIllegal;
0b57cec5SDimitry Andric            } else {
0b57cec5SDimitry Andric                *target++ = UNI_REPLACEMENT_CHAR;
0b57cec5SDimitry Andric            }
0b57cec5SDimitry Andric        } else {
0b57cec5SDimitry Andric            /* target is a character in range 0xFFFF - 0x10FFFF. */
0b57cec5SDimitry Andric            if (target + 1 >= targetEnd) {
0b57cec5SDimitry Andric                --source; /* Back up source pointer! */
0b57cec5SDimitry Andric                result = targetExhausted; break;
0b57cec5SDimitry Andric            }
0b57cec5SDimitry Andric            ch -= halfBase;
0b57cec5SDimitry Andric            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
0b57cec5SDimitry Andric            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric    *sourceStart = source;
0b57cec5SDimitry Andric    *targetStart = target;
0b57cec5SDimitry Andric    return result;
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/* --------------------------------------------------------------------- */
0b57cec5SDimitry Andric
0b57cec5SDimitry AndricConversionResult ConvertUTF16toUTF32 (
0b57cec5SDimitry Andric        const UTF16** sourceStart, const UTF16* sourceEnd,
0b57cec5SDimitry Andric        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
0b57cec5SDimitry Andric    ConversionResult result = conversionOK;
0b57cec5SDimitry Andric    const UTF16* source = *sourceStart;
0b57cec5SDimitry Andric    UTF32* target = *targetStart;
0b57cec5SDimitry Andric    UTF32 ch, ch2;
0b57cec5SDimitry Andric    while (source < sourceEnd) {
0b57cec5SDimitry Andric        const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
0b57cec5SDimitry Andric        ch = *source++;
0b57cec5SDimitry Andric        /* If we have a surrogate pair, convert to UTF32 first. */
0b57cec5SDimitry Andric        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
0b57cec5SDimitry Andric            /* If the 16 bits following the high surrogate are in the source buffer... */
0b57cec5SDimitry Andric            if (source < sourceEnd) {
0b57cec5SDimitry Andric                ch2 = *source;
0b57cec5SDimitry Andric                /* If it's a low surrogate, convert to UTF32. */
0b57cec5SDimitry Andric                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
0b57cec5SDimitry Andric                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
0b57cec5SDimitry Andric                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
0b57cec5SDimitry Andric                    ++source;
0b57cec5SDimitry Andric                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
0b57cec5SDimitry Andric                    --source; /* return to the illegal value itself */
0b57cec5SDimitry Andric                    result = sourceIllegal;
0b57cec5SDimitry Andric                    break;
0b57cec5SDimitry Andric                }
0b57cec5SDimitry Andric            } else { /* We don't have the 16 bits following the high surrogate. */
0b57cec5SDimitry Andric                --source; /* return to the high surrogate */
0b57cec5SDimitry Andric                result = sourceExhausted;
0b57cec5SDimitry Andric                break;
0b57cec5SDimitry Andric            }
0b57cec5SDimitry Andric        } else if (flags == strictConversion) {
0b57cec5SDimitry Andric            /* UTF-16 surrogate values are illegal in UTF-32 */
0b57cec5SDimitry Andric            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
0b57cec5SDimitry Andric                --source; /* return to the illegal value itself */
0b57cec5SDimitry Andric                result = sourceIllegal;
0b57cec5SDimitry Andric                break;
0b57cec5SDimitry Andric            }
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric        if (target >= targetEnd) {
0b57cec5SDimitry Andric            source = oldSource; /* Back up source pointer! */
0b57cec5SDimitry Andric            result = targetExhausted; break;
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric        *target++ = ch;
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric    *sourceStart = source;
0b57cec5SDimitry Andric    *targetStart = target;
0b57cec5SDimitry Andric#ifdef CVTUTF_DEBUG
0b57cec5SDimitry Andricif (result == sourceIllegal) {
0b57cec5SDimitry Andric    fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
0b57cec5SDimitry Andric    fflush(stderr);
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric#endif
0b57cec5SDimitry Andric    return result;
0b57cec5SDimitry Andric}
0b57cec5SDimitry AndricConversionResult ConvertUTF16toUTF8 (
0b57cec5SDimitry Andric        const UTF16** sourceStart, const UTF16* sourceEnd,
0b57cec5SDimitry Andric        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
0b57cec5SDimitry Andric    ConversionResult result = conversionOK;
0b57cec5SDimitry Andric    const UTF16* source = *sourceStart;
0b57cec5SDimitry Andric    UTF8* target = *targetStart;
0b57cec5SDimitry Andric    while (source < sourceEnd) {
0b57cec5SDimitry Andric        UTF32 ch;
0b57cec5SDimitry Andric        unsigned short bytesToWrite = 0;
0b57cec5SDimitry Andric        const UTF32 byteMask = 0xBF;
0b57cec5SDimitry Andric        const UTF32 byteMark = 0x80;
0b57cec5SDimitry Andric        const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
0b57cec5SDimitry Andric        ch = *source++;
0b57cec5SDimitry Andric        /* If we have a surrogate pair, convert to UTF32 first. */
0b57cec5SDimitry Andric        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
0b57cec5SDimitry Andric            /* If the 16 bits following the high surrogate are in the source buffer... */
0b57cec5SDimitry Andric            if (source < sourceEnd) {
0b57cec5SDimitry Andric                UTF32 ch2 = *source;
0b57cec5SDimitry Andric                /* If it's a low surrogate, convert to UTF32. */
0b57cec5SDimitry Andric                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
0b57cec5SDimitry Andric                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
0b57cec5SDimitry Andric                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
0b57cec5SDimitry Andric                    ++source;
0b57cec5SDimitry Andric                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
0b57cec5SDimitry Andric                    --source; /* return to the illegal value itself */
0b57cec5SDimitry Andric                    result = sourceIllegal;
0b57cec5SDimitry Andric                    break;
0b57cec5SDimitry Andric                }
0b57cec5SDimitry Andric            } else { /* We don't have the 16 bits following the high surrogate. */
0b57cec5SDimitry Andric                --source; /* return to the high surrogate */
0b57cec5SDimitry Andric                result = sourceExhausted;
0b57cec5SDimitry Andric                break;
0b57cec5SDimitry Andric            }
0b57cec5SDimitry Andric        } else if (flags == strictConversion) {
0b57cec5SDimitry Andric            /* UTF-16 surrogate values are illegal in UTF-32 */
0b57cec5SDimitry Andric            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
0b57cec5SDimitry Andric                --source; /* return to the illegal value itself */
0b57cec5SDimitry Andric                result = sourceIllegal;
0b57cec5SDimitry Andric                break;
0b57cec5SDimitry Andric            }
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric        /* Figure out how many bytes the result will require */
0b57cec5SDimitry Andric        if (ch < (UTF32)0x80) {      bytesToWrite = 1;
0b57cec5SDimitry Andric        } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
0b57cec5SDimitry Andric        } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
0b57cec5SDimitry Andric        } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
0b57cec5SDimitry Andric        } else {                            bytesToWrite = 3;
0b57cec5SDimitry Andric                                            ch = UNI_REPLACEMENT_CHAR;
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric        target += bytesToWrite;
0b57cec5SDimitry Andric        if (target > targetEnd) {
0b57cec5SDimitry Andric            source = oldSource; /* Back up source pointer! */
0b57cec5SDimitry Andric            target -= bytesToWrite; result = targetExhausted; break;
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric        switch (bytesToWrite) { /* note: everything falls through. */
0b57cec5SDimitry Andric            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
0b57cec5SDimitry Andric            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
0b57cec5SDimitry Andric            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
0b57cec5SDimitry Andric            case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric        target += bytesToWrite;
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric    *sourceStart = source;
0b57cec5SDimitry Andric    *targetStart = target;
0b57cec5SDimitry Andric    return result;
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/* --------------------------------------------------------------------- */
0b57cec5SDimitry Andric
0b57cec5SDimitry AndricConversionResult ConvertUTF32toUTF8 (
0b57cec5SDimitry Andric        const UTF32** sourceStart, const UTF32* sourceEnd,
0b57cec5SDimitry Andric        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
0b57cec5SDimitry Andric    ConversionResult result = conversionOK;
0b57cec5SDimitry Andric    const UTF32* source = *sourceStart;
0b57cec5SDimitry Andric    UTF8* target = *targetStart;
0b57cec5SDimitry Andric    while (source < sourceEnd) {
0b57cec5SDimitry Andric        UTF32 ch;
0b57cec5SDimitry Andric        unsigned short bytesToWrite = 0;
0b57cec5SDimitry Andric        const UTF32 byteMask = 0xBF;
0b57cec5SDimitry Andric        const UTF32 byteMark = 0x80;
0b57cec5SDimitry Andric        ch = *source++;
0b57cec5SDimitry Andric        if (flags == strictConversion ) {
0b57cec5SDimitry Andric            /* UTF-16 surrogate values are illegal in UTF-32 */
0b57cec5SDimitry Andric            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
0b57cec5SDimitry Andric                --source; /* return to the illegal value itself */
0b57cec5SDimitry Andric                result = sourceIllegal;
0b57cec5SDimitry Andric                break;
0b57cec5SDimitry Andric            }
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric        /*
0b57cec5SDimitry Andric         * Figure out how many bytes the result will require. Turn any
0b57cec5SDimitry Andric         * illegally large UTF32 things (> Plane 17) into replacement chars.
0b57cec5SDimitry Andric         */
0b57cec5SDimitry Andric        if (ch < (UTF32)0x80) {      bytesToWrite = 1;
0b57cec5SDimitry Andric        } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
0b57cec5SDimitry Andric        } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
0b57cec5SDimitry Andric        } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
0b57cec5SDimitry Andric        } else {                            bytesToWrite = 3;
0b57cec5SDimitry Andric                                            ch = UNI_REPLACEMENT_CHAR;
0b57cec5SDimitry Andric                                            result = sourceIllegal;
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric        target += bytesToWrite;
0b57cec5SDimitry Andric        if (target > targetEnd) {
0b57cec5SDimitry Andric            --source; /* Back up source pointer! */
0b57cec5SDimitry Andric            target -= bytesToWrite; result = targetExhausted; break;
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric        switch (bytesToWrite) { /* note: everything falls through. */
0b57cec5SDimitry Andric            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
0b57cec5SDimitry Andric            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
0b57cec5SDimitry Andric            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
0b57cec5SDimitry Andric            case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric        target += bytesToWrite;
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric    *sourceStart = source;
0b57cec5SDimitry Andric    *targetStart = target;
0b57cec5SDimitry Andric    return result;
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/* --------------------------------------------------------------------- */
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/*
0b57cec5SDimitry Andric * Utility routine to tell whether a sequence of bytes is legal UTF-8.
0b57cec5SDimitry Andric * This must be called with the length pre-determined by the first byte.
0b57cec5SDimitry Andric * If not calling this from ConvertUTF8to*, then the length can be set by:
0b57cec5SDimitry Andric *  length = trailingBytesForUTF8[*source]+1;
0b57cec5SDimitry Andric * and the sequence is illegal right away if there aren't that many bytes
0b57cec5SDimitry Andric * available.
0b57cec5SDimitry Andric * If presented with a length > 4, this returns false.  The Unicode
0b57cec5SDimitry Andric * definition of UTF-8 goes up to 4-byte sequences.
0b57cec5SDimitry Andric */
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricstatic Boolean isLegalUTF8(const UTF8 *source, int length) {
0b57cec5SDimitry Andric    UTF8 a;
0b57cec5SDimitry Andric    const UTF8 *srcptr = source+length;
0b57cec5SDimitry Andric    switch (length) {
0b57cec5SDimitry Andric    default: return false;
0b57cec5SDimitry Andric        /* Everything else falls through when "true"... */
0b57cec5SDimitry Andric    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
0b57cec5SDimitry Andric    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
0b57cec5SDimitry Andric    case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric        switch (*source) {
0b57cec5SDimitry Andric            /* no fall-through in this inner switch */
0b57cec5SDimitry Andric            case 0xE0: if (a < 0xA0) return false; break;
0b57cec5SDimitry Andric            case 0xED: if (a > 0x9F) return false; break;
0b57cec5SDimitry Andric            case 0xF0: if (a < 0x90) return false; break;
0b57cec5SDimitry Andric            case 0xF4: if (a > 0x8F) return false; break;
0b57cec5SDimitry Andric            default:   if (a < 0x80) return false;
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric    case 1: if (*source >= 0x80 && *source < 0xC2) return false;
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric    if (*source > 0xF4) return false;
0b57cec5SDimitry Andric    return true;
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/* --------------------------------------------------------------------- */
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/*
0b57cec5SDimitry Andric * Exported function to return whether a UTF-8 sequence is legal or not.
0b57cec5SDimitry Andric * This is not used here; it's just exported.
0b57cec5SDimitry Andric */
0b57cec5SDimitry AndricBoolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
0b57cec5SDimitry Andric    int length = trailingBytesForUTF8[*source]+1;
0b57cec5SDimitry Andric    if (length > sourceEnd - source) {
0b57cec5SDimitry Andric        return false;
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric    return isLegalUTF8(source, length);
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
753f127fSDimitry Andric/*
753f127fSDimitry Andric * Exported function to return the size of the first utf-8 code unit sequence,
753f127fSDimitry Andric * Or 0 if the sequence is not valid;
753f127fSDimitry Andric */
753f127fSDimitry Andricunsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
753f127fSDimitry Andric  int length = trailingBytesForUTF8[*source] + 1;
753f127fSDimitry Andric  return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length
753f127fSDimitry Andric                                                                       : 0;
753f127fSDimitry Andric}
753f127fSDimitry Andric
0b57cec5SDimitry Andric/* --------------------------------------------------------------------- */
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricstatic unsigned
0b57cec5SDimitry AndricfindMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
0b57cec5SDimitry Andric                                          const UTF8 *sourceEnd) {
0b57cec5SDimitry Andric  UTF8 b1, b2, b3;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  assert(!isLegalUTF8Sequence(source, sourceEnd));
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  /*
0b57cec5SDimitry Andric   * Unicode 6.3.0, D93b:
0b57cec5SDimitry Andric   *
0b57cec5SDimitry Andric   *   Maximal subpart of an ill-formed subsequence: The longest code unit
0b57cec5SDimitry Andric   *   subsequence starting at an unconvertible offset that is either:
0b57cec5SDimitry Andric   *   a. the initial subsequence of a well-formed code unit sequence, or
0b57cec5SDimitry Andric   *   b. a subsequence of length one.
0b57cec5SDimitry Andric   */
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  if (source == sourceEnd)
0b57cec5SDimitry Andric    return 0;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  /*
0b57cec5SDimitry Andric   * Perform case analysis.  See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
0b57cec5SDimitry Andric   * Byte Sequences.
0b57cec5SDimitry Andric   */
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  b1 = *source;
0b57cec5SDimitry Andric  ++source;
0b57cec5SDimitry Andric  if (b1 >= 0xC2 && b1 <= 0xDF) {
0b57cec5SDimitry Andric    /*
0b57cec5SDimitry Andric     * First byte is valid, but we know that this code unit sequence is
0b57cec5SDimitry Andric     * invalid, so the maximal subpart has to end after the first byte.
0b57cec5SDimitry Andric     */
0b57cec5SDimitry Andric    return 1;
0b57cec5SDimitry Andric  }
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  if (source == sourceEnd)
0b57cec5SDimitry Andric    return 1;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  b2 = *source;
0b57cec5SDimitry Andric  ++source;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  if (b1 == 0xE0) {
0b57cec5SDimitry Andric    return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
0b57cec5SDimitry Andric  }
0b57cec5SDimitry Andric  if (b1 >= 0xE1 && b1 <= 0xEC) {
0b57cec5SDimitry Andric    return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
0b57cec5SDimitry Andric  }
0b57cec5SDimitry Andric  if (b1 == 0xED) {
0b57cec5SDimitry Andric    return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
0b57cec5SDimitry Andric  }
0b57cec5SDimitry Andric  if (b1 >= 0xEE && b1 <= 0xEF) {
0b57cec5SDimitry Andric    return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
0b57cec5SDimitry Andric  }
0b57cec5SDimitry Andric  if (b1 == 0xF0) {
0b57cec5SDimitry Andric    if (b2 >= 0x90 && b2 <= 0xBF) {
0b57cec5SDimitry Andric      if (source == sourceEnd)
0b57cec5SDimitry Andric        return 2;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric      b3 = *source;
0b57cec5SDimitry Andric      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric    return 1;
0b57cec5SDimitry Andric  }
0b57cec5SDimitry Andric  if (b1 >= 0xF1 && b1 <= 0xF3) {
0b57cec5SDimitry Andric    if (b2 >= 0x80 && b2 <= 0xBF) {
0b57cec5SDimitry Andric      if (source == sourceEnd)
0b57cec5SDimitry Andric        return 2;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric      b3 = *source;
0b57cec5SDimitry Andric      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric    return 1;
0b57cec5SDimitry Andric  }
0b57cec5SDimitry Andric  if (b1 == 0xF4) {
0b57cec5SDimitry Andric    if (b2 >= 0x80 && b2 <= 0x8F) {
0b57cec5SDimitry Andric      if (source == sourceEnd)
0b57cec5SDimitry Andric        return 2;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric      b3 = *source;
0b57cec5SDimitry Andric      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric    return 1;
0b57cec5SDimitry Andric  }
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
0b57cec5SDimitry Andric  /*
0b57cec5SDimitry Andric   * There are no valid sequences that start with these bytes.  Maximal subpart
0b57cec5SDimitry Andric   * is defined to have length 1 in these cases.
0b57cec5SDimitry Andric   */
0b57cec5SDimitry Andric  return 1;
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/* --------------------------------------------------------------------- */
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/*
0b57cec5SDimitry Andric * Exported function to return the total number of bytes in a codepoint
0b57cec5SDimitry Andric * represented in UTF-8, given the value of the first byte.
0b57cec5SDimitry Andric */
0b57cec5SDimitry Andricunsigned getNumBytesForUTF8(UTF8 first) {
0b57cec5SDimitry Andric  return trailingBytesForUTF8[first] + 1;
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/* --------------------------------------------------------------------- */
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/*
0b57cec5SDimitry Andric * Exported function to return whether a UTF-8 string is legal or not.
0b57cec5SDimitry Andric * This is not used here; it's just exported.
0b57cec5SDimitry Andric */
0b57cec5SDimitry AndricBoolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
0b57cec5SDimitry Andric    while (*source != sourceEnd) {
0b57cec5SDimitry Andric        int length = trailingBytesForUTF8[**source] + 1;
0b57cec5SDimitry Andric        if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
0b57cec5SDimitry Andric            return false;
0b57cec5SDimitry Andric        *source += length;
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric    return true;
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/* --------------------------------------------------------------------- */
0b57cec5SDimitry Andric
0b57cec5SDimitry AndricConversionResult ConvertUTF8toUTF16 (
0b57cec5SDimitry Andric        const UTF8** sourceStart, const UTF8* sourceEnd,
0b57cec5SDimitry Andric        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
0b57cec5SDimitry Andric    ConversionResult result = conversionOK;
0b57cec5SDimitry Andric    const UTF8* source = *sourceStart;
0b57cec5SDimitry Andric    UTF16* target = *targetStart;
0b57cec5SDimitry Andric    while (source < sourceEnd) {
0b57cec5SDimitry Andric        UTF32 ch = 0;
0b57cec5SDimitry Andric        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
0b57cec5SDimitry Andric        if (extraBytesToRead >= sourceEnd - source) {
0b57cec5SDimitry Andric            result = sourceExhausted; break;
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric        /* Do this check whether lenient or strict */
0b57cec5SDimitry Andric        if (!isLegalUTF8(source, extraBytesToRead+1)) {
0b57cec5SDimitry Andric            result = sourceIllegal;
0b57cec5SDimitry Andric            break;
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric        /*
0b57cec5SDimitry Andric         * The cases all fall through. See "Note A" below.
0b57cec5SDimitry Andric         */
0b57cec5SDimitry Andric        switch (extraBytesToRead) {
0b57cec5SDimitry Andric            case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
0b57cec5SDimitry Andric            case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
0b57cec5SDimitry Andric            case 3: ch += *source++; ch <<= 6;
0b57cec5SDimitry Andric            case 2: ch += *source++; ch <<= 6;
0b57cec5SDimitry Andric            case 1: ch += *source++; ch <<= 6;
0b57cec5SDimitry Andric            case 0: ch += *source++;
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric        ch -= offsetsFromUTF8[extraBytesToRead];
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric        if (target >= targetEnd) {
0b57cec5SDimitry Andric            source -= (extraBytesToRead+1); /* Back up source pointer! */
0b57cec5SDimitry Andric            result = targetExhausted; break;
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
0b57cec5SDimitry Andric            /* UTF-16 surrogate values are illegal in UTF-32 */
0b57cec5SDimitry Andric            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
0b57cec5SDimitry Andric                if (flags == strictConversion) {
0b57cec5SDimitry Andric                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
0b57cec5SDimitry Andric                    result = sourceIllegal;
0b57cec5SDimitry Andric                    break;
0b57cec5SDimitry Andric                } else {
0b57cec5SDimitry Andric                    *target++ = UNI_REPLACEMENT_CHAR;
0b57cec5SDimitry Andric                }
0b57cec5SDimitry Andric            } else {
0b57cec5SDimitry Andric                *target++ = (UTF16)ch; /* normal case */
0b57cec5SDimitry Andric            }
0b57cec5SDimitry Andric        } else if (ch > UNI_MAX_UTF16) {
0b57cec5SDimitry Andric            if (flags == strictConversion) {
0b57cec5SDimitry Andric                result = sourceIllegal;
0b57cec5SDimitry Andric                source -= (extraBytesToRead+1); /* return to the start */
0b57cec5SDimitry Andric                break; /* Bail out; shouldn't continue */
0b57cec5SDimitry Andric            } else {
0b57cec5SDimitry Andric                *target++ = UNI_REPLACEMENT_CHAR;
0b57cec5SDimitry Andric            }
0b57cec5SDimitry Andric        } else {
0b57cec5SDimitry Andric            /* target is a character in range 0xFFFF - 0x10FFFF. */
0b57cec5SDimitry Andric            if (target + 1 >= targetEnd) {
0b57cec5SDimitry Andric                source -= (extraBytesToRead+1); /* Back up source pointer! */
0b57cec5SDimitry Andric                result = targetExhausted; break;
0b57cec5SDimitry Andric            }
0b57cec5SDimitry Andric            ch -= halfBase;
0b57cec5SDimitry Andric            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
0b57cec5SDimitry Andric            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric    *sourceStart = source;
0b57cec5SDimitry Andric    *targetStart = target;
0b57cec5SDimitry Andric    return result;
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/* --------------------------------------------------------------------- */
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricstatic ConversionResult ConvertUTF8toUTF32Impl(
0b57cec5SDimitry Andric        const UTF8** sourceStart, const UTF8* sourceEnd,
0b57cec5SDimitry Andric        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
0b57cec5SDimitry Andric        Boolean InputIsPartial) {
0b57cec5SDimitry Andric    ConversionResult result = conversionOK;
0b57cec5SDimitry Andric    const UTF8* source = *sourceStart;
0b57cec5SDimitry Andric    UTF32* target = *targetStart;
0b57cec5SDimitry Andric    while (source < sourceEnd) {
0b57cec5SDimitry Andric        UTF32 ch = 0;
0b57cec5SDimitry Andric        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
0b57cec5SDimitry Andric        if (extraBytesToRead >= sourceEnd - source) {
0b57cec5SDimitry Andric            if (flags == strictConversion || InputIsPartial) {
0b57cec5SDimitry Andric                result = sourceExhausted;
0b57cec5SDimitry Andric                break;
0b57cec5SDimitry Andric            } else {
0b57cec5SDimitry Andric                result = sourceIllegal;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric                /*
0b57cec5SDimitry Andric                 * Replace the maximal subpart of ill-formed sequence with
0b57cec5SDimitry Andric                 * replacement character.
0b57cec5SDimitry Andric                 */
0b57cec5SDimitry Andric                source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
0b57cec5SDimitry Andric                                                                    sourceEnd);
0b57cec5SDimitry Andric                *target++ = UNI_REPLACEMENT_CHAR;
0b57cec5SDimitry Andric                continue;
0b57cec5SDimitry Andric            }
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric        if (target >= targetEnd) {
0b57cec5SDimitry Andric            result = targetExhausted; break;
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric        /* Do this check whether lenient or strict */
0b57cec5SDimitry Andric        if (!isLegalUTF8(source, extraBytesToRead+1)) {
0b57cec5SDimitry Andric            result = sourceIllegal;
0b57cec5SDimitry Andric            if (flags == strictConversion) {
0b57cec5SDimitry Andric                /* Abort conversion. */
0b57cec5SDimitry Andric                break;
0b57cec5SDimitry Andric            } else {
0b57cec5SDimitry Andric                /*
0b57cec5SDimitry Andric                 * Replace the maximal subpart of ill-formed sequence with
0b57cec5SDimitry Andric                 * replacement character.
0b57cec5SDimitry Andric                 */
0b57cec5SDimitry Andric                source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
0b57cec5SDimitry Andric                                                                    sourceEnd);
0b57cec5SDimitry Andric                *target++ = UNI_REPLACEMENT_CHAR;
0b57cec5SDimitry Andric                continue;
0b57cec5SDimitry Andric            }
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric        /*
0b57cec5SDimitry Andric         * The cases all fall through. See "Note A" below.
0b57cec5SDimitry Andric         */
0b57cec5SDimitry Andric        switch (extraBytesToRead) {
0b57cec5SDimitry Andric            case 5: ch += *source++; ch <<= 6;
0b57cec5SDimitry Andric            case 4: ch += *source++; ch <<= 6;
0b57cec5SDimitry Andric            case 3: ch += *source++; ch <<= 6;
0b57cec5SDimitry Andric            case 2: ch += *source++; ch <<= 6;
0b57cec5SDimitry Andric            case 1: ch += *source++; ch <<= 6;
0b57cec5SDimitry Andric            case 0: ch += *source++;
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric        ch -= offsetsFromUTF8[extraBytesToRead];
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric        if (ch <= UNI_MAX_LEGAL_UTF32) {
0b57cec5SDimitry Andric            /*
0b57cec5SDimitry Andric             * UTF-16 surrogate values are illegal in UTF-32, and anything
0b57cec5SDimitry Andric             * over Plane 17 (> 0x10FFFF) is illegal.
0b57cec5SDimitry Andric             */
0b57cec5SDimitry Andric            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
0b57cec5SDimitry Andric                if (flags == strictConversion) {
0b57cec5SDimitry Andric                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
0b57cec5SDimitry Andric                    result = sourceIllegal;
0b57cec5SDimitry Andric                    break;
0b57cec5SDimitry Andric                } else {
0b57cec5SDimitry Andric                    *target++ = UNI_REPLACEMENT_CHAR;
0b57cec5SDimitry Andric                }
0b57cec5SDimitry Andric            } else {
0b57cec5SDimitry Andric                *target++ = ch;
0b57cec5SDimitry Andric            }
0b57cec5SDimitry Andric        } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
0b57cec5SDimitry Andric            result = sourceIllegal;
0b57cec5SDimitry Andric            *target++ = UNI_REPLACEMENT_CHAR;
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric    *sourceStart = source;
0b57cec5SDimitry Andric    *targetStart = target;
0b57cec5SDimitry Andric    return result;
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry AndricConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
0b57cec5SDimitry Andric                                           const UTF8 *sourceEnd,
0b57cec5SDimitry Andric                                           UTF32 **targetStart,
0b57cec5SDimitry Andric                                           UTF32 *targetEnd,
0b57cec5SDimitry Andric                                           ConversionFlags flags) {
0b57cec5SDimitry Andric  return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
0b57cec5SDimitry Andric                                flags, /*InputIsPartial=*/true);
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry AndricConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
0b57cec5SDimitry Andric                                    const UTF8 *sourceEnd, UTF32 **targetStart,
0b57cec5SDimitry Andric                                    UTF32 *targetEnd, ConversionFlags flags) {
0b57cec5SDimitry Andric  return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
0b57cec5SDimitry Andric                                flags, /*InputIsPartial=*/false);
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric/* ---------------------------------------------------------------------
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric    Note A.
0b57cec5SDimitry Andric    The fall-through switches in UTF-8 reading code save a
0b57cec5SDimitry Andric    temp variable, some decrements & conditionals.  The switches
0b57cec5SDimitry Andric    are equivalent to the following loop:
0b57cec5SDimitry Andric        {
0b57cec5SDimitry Andric            int tmpBytesToRead = extraBytesToRead+1;
0b57cec5SDimitry Andric            do {
0b57cec5SDimitry Andric                ch += *source++;
0b57cec5SDimitry Andric                --tmpBytesToRead;
0b57cec5SDimitry Andric                if (tmpBytesToRead) ch <<= 6;
0b57cec5SDimitry Andric            } while (tmpBytesToRead > 0);
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric    In UTF-8 writing code, the switches on "bytesToWrite" are
0b57cec5SDimitry Andric    similarly unrolled loops.
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric   --------------------------------------------------------------------- */
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric} // namespace llvm
0b57cec5SDimitry Andric
0b57cec5SDimitry AndricConvertUTF_RESTORE_WARNINGS