poppler-21.12.0/poppler/UTF.cc

//========================================================================
//
// UTF.cc
//
// Copyright 2001-2003 Glyph & Cog, LLC
//
//========================================================================

//========================================================================
//
// Modified under the Poppler project - http://poppler.freedesktop.org
//
// All changes made under the Poppler project to this file are licensed
// under GPL version 2 or later
//
// Copyright (C) 2008 Koji Otani <sho@bbr.jp>
// Copyright (C) 2012, 2017, 2021 Adrian Johnson <ajohnson@redneon.com>
// Copyright (C) 2012 Hib Eris <hib@hiberis.nl>
// Copyright (C) 2016, 2018-2021 Albert Astals Cid <aacid@kde.org>
// Copyright (C) 2016 Jason Crain <jason@aquaticape.us>
// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
// Copyright (C) 2018, 2020 Nelson Benítez León <nbenitezl@gmail.com>
// Copyright (C) 2021 Georgiy Sgibnev <georgiy@sgibnev.com>. Work sponsored by lab50.net.
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
//
//========================================================================

#include "goo/gmem.h"
#include "PDFDocEncoding.h"
#include "GlobalParams.h"
#include "UnicodeMap.h"
#include "UTF.h"
#include "UnicodeMapFuncs.h"
#include <algorithm>

#include <config.h>

bool UnicodeIsValid(Unicode ucs4)
{
    return (ucs4 < 0x110000) && ((ucs4 & 0xfffff800) != 0xd800) && (ucs4 < 0xfdd0 || ucs4 > 0xfdef) && ((ucs4 & 0xfffe) != 0xfffe);
}

int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4_out)
{
    int i, n, len;
    Unicode *u;

    // count characters
    len = 0;
    for (i = 0; i < utf16Len; i++) {
        if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00 && i + 1 < utf16Len && utf16[i + 1] >= 0xdc00 && utf16[i + 1] < 0xe000) {
            i++; /* surrogate pair */
        }
        len++;
    }
    if (ucs4_out == nullptr)
        return len;

    u = (Unicode *)gmallocn(len, sizeof(Unicode));
    n = 0;
    // convert string
    for (i = 0; i < utf16Len; i++) {
        if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00) { /* surrogate pair */
            if (i + 1 < utf16Len && utf16[i + 1] >= 0xdc00 && utf16[i + 1] < 0xe000) {
                /* next code is a low surrogate */
                u[n] = (((utf16[i] & 0x3ff) << 10) | (utf16[i + 1] & 0x3ff)) + 0x10000;
                ++i;
            } else {
                /* missing low surrogate
                   replace it with REPLACEMENT CHARACTER (U+FFFD) */
                u[n] = 0xfffd;
            }
        } else if (utf16[i] >= 0xdc00 && utf16[i] < 0xe000) {
            /* invalid low surrogate
               replace it with REPLACEMENT CHARACTER (U+FFFD) */
            u[n] = 0xfffd;
        } else {
            u[n] = utf16[i];
        }
        if (!UnicodeIsValid(u[n])) {
            u[n] = 0xfffd;
        }
        n++;
    }
    *ucs4_out = u;
    return len;
}

int TextStringToUCS4(const std::string &textStr, Unicode **ucs4)
{
    int i, len;
    const char *s;
    Unicode *u;
    bool isUnicode, isUnicodeLE;

    len = textStr.size();
    s = textStr.c_str();
    if (len == 0) {
        *ucs4 = nullptr;
        return 0;
    }

    if (GooString::hasUnicodeMarker(textStr)) {
        isUnicode = true;
        isUnicodeLE = false;
    } else if (GooString::hasUnicodeMarkerLE(textStr)) {
        isUnicode = false;
        isUnicodeLE = true;
    } else {
        isUnicode = false;
        isUnicodeLE = false;
    }

    if (isUnicode || isUnicodeLE) {
        Unicode *utf16;
        len = len / 2 - 1;
        if (len > 0) {
            utf16 = new Unicode[len];
            for (i = 0; i < len; i++) {
                if (isUnicode)
                    utf16[i] = (s[2 + i * 2] & 0xff) << 8 | (s[3 + i * 2] & 0xff);
                else // UnicodeLE
                    utf16[i] = (s[3 + i * 2] & 0xff) << 8 | (s[2 + i * 2] & 0xff);
            }
            len = UTF16toUCS4(utf16, len, &u);
            delete[] utf16;
        } else {
            u = nullptr;
        }
    } else {
        u = (Unicode *)gmallocn(len, sizeof(Unicode));
        for (i = 0; i < len; i++) {
            u[i] = pdfDocEncoding[s[i] & 0xff];
        }
    }
    *ucs4 = u;
    return len;
}

bool UnicodeIsWhitespace(Unicode ucs4)
{
    static Unicode const spaces[] = { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x0085, 0x00A0, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000 };
    Unicode const *end = spaces + sizeof(spaces) / sizeof(spaces[0]);
    Unicode const *i = std::lower_bound(spaces, end, ucs4);
    return (i != end && *i == ucs4);
}

//
// decodeUtf8() and decodeUtf8Table are:
//
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use, copy,
// modify, merge, publish, distribute, sublicense, and/or sell copies
// of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:

// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
//
static const uint32_t UTF8_ACCEPT = 0;
static const uint32_t UTF8_REJECT = 12;
static const uint32_t UCS4_MAX = 0x10FFFF;
static const Unicode REPLACEMENT_CHAR = 0xFFFD;

// clang-format off
static const uint8_t decodeUtf8Table[] = {
  // The first part of the table maps bytes to character classes
  // to reduce the size of the transition table and create bitmasks.
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // e0..ff

  // The second part is a transition table that maps a combination
  // of a state of the automaton and a character class to a state.
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12,
};
// clang-format on

// Decode utf8 state machine for fast UTF-8 decoding. Initialise state
// to 0 and call decodeUtf8() for each byte of UTF-8. Return value
// (and state) is UTF8_ACCEPT when it has found a valid codepoint
// (codepoint returned in codep), UTF8_REJECT when the byte is not
// allowed to occur at its position, and some other positive value if
// more bytes have to be read.  Reset state to 0 to recover from
// errors.
inline uint32_t decodeUtf8(uint32_t *state, uint32_t *codep, char byte)
{
    uint32_t b = (unsigned char)byte;
    uint32_t type = decodeUtf8Table[b];

    *codep = (*state != UTF8_ACCEPT) ? (b & 0x3fu) | (*codep << 6) : (0xff >> type) & (b);

    *state = decodeUtf8Table[256 + *state + type];
    return *state;
}

int utf8CountUCS4(const char *utf8)
{
    uint32_t codepoint;
    uint32_t state = 0;
    int count = 0;

    while (*utf8) {
        decodeUtf8(&state, &codepoint, *utf8);
        if (state == UTF8_ACCEPT) {
            count++;
        } else if (state == UTF8_REJECT) {
            count++; // replace with REPLACEMENT_CHAR
            state = 0;
        }
        utf8++;
    }
    if (state != UTF8_ACCEPT && state != UTF8_REJECT)
        count++; // replace with REPLACEMENT_CHAR

    return count;
}

int utf8ToUCS4(const char *utf8, Unicode **ucs4_out)
{
    int len = utf8CountUCS4(utf8);
    Unicode *u = (Unicode *)gmallocn(len, sizeof(Unicode));
    int n = 0;
    uint32_t codepoint;
    uint32_t state = 0;

    while (*utf8 && n < len) {
        decodeUtf8(&state, &codepoint, *utf8);
        if (state == UTF8_ACCEPT) {
            u[n++] = codepoint;
        } else if (state == UTF8_REJECT) {
            u[n++] = REPLACEMENT_CHAR; // invalid byte for this position
            state = 0;
        }
        utf8++;
    }
    if (state != UTF8_ACCEPT && state != UTF8_REJECT)
        u[n] = REPLACEMENT_CHAR; // invalid byte for this position

    *ucs4_out = u;
    return len;
}

// Count number of UTF-16 code units required to convert a UTF-8 string
// (excluding terminating NULL). Each invalid byte is counted as a
// code point since the UTF-8 conversion functions will replace it with
// REPLACEMENT_CHAR.
int utf8CountUtf16CodeUnits(const char *utf8)
{
    uint32_t codepoint;
    uint32_t state = 0;
    int count = 0;

    while (*utf8) {
        decodeUtf8(&state, &codepoint, *utf8);
        if (state == UTF8_ACCEPT) {
            if (codepoint < 0x10000)
                count++;
            else if (codepoint <= UCS4_MAX)
                count += 2;
            else
                count++; // replace with REPLACEMENT_CHAR
        } else if (state == UTF8_REJECT) {
            count++; // replace with REPLACEMENT_CHAR
            state = 0;
        }
        utf8++;
    }
    if (state != UTF8_ACCEPT && state != UTF8_REJECT)
        count++; // replace with REPLACEMENT_CHAR

    return count;
}

// Convert UTF-8 to UTF-16
//  utf8- UTF-8 string to convert. If not null terminated, set maxUtf8 to num
//        bytes to convert
//  utf16 - output buffer to write UTF-16 to. Output will always be null terminated.
//  maxUtf16 - maximum size of output buffer including space for null.
//  maxUtf8 - maximum number of UTF-8 bytes to convert. Conversion stops when
//            either this count is reached or a null is encountered.
// Returns number of UTF-16 code units written (excluding NULL).
int utf8ToUtf16(const char *utf8, uint16_t *utf16, int maxUtf16, int maxUtf8)
{
    uint16_t *p = utf16;
    uint32_t codepoint;
    uint32_t state = 0;
    int nIn = 0;
    int nOut = 0;
    while (*utf8 && nIn < maxUtf8 && nOut < maxUtf16 - 1) {
        decodeUtf8(&state, &codepoint, *utf8);
        if (state == UTF8_ACCEPT) {
            if (codepoint < 0x10000) {
                *p++ = (uint16_t)codepoint;
                nOut++;
            } else if (codepoint <= UCS4_MAX) {
                *p++ = (uint16_t)(0xD7C0 + (codepoint >> 10));
                *p++ = (uint16_t)(0xDC00 + (codepoint & 0x3FF));
                nOut += 2;
            } else {
                *p++ = REPLACEMENT_CHAR;
                nOut++;
                state = 0;
            }
        } else if (state == UTF8_REJECT) {
            *p++ = REPLACEMENT_CHAR; // invalid byte for this position
            nOut++;
        }
        utf8++;
        nIn++;
    }
    // replace any trailing bytes too short for a valid UTF-8 with a replacement char
    if (state != UTF8_ACCEPT && state != UTF8_REJECT && nOut < maxUtf16 - 1) {
        *p++ = REPLACEMENT_CHAR;
        nOut++;
    }
    if (nOut > maxUtf16 - 1)
        nOut = maxUtf16 - 1;
    utf16[nOut] = 0;
    return nOut;
}

// Allocate utf16 string and convert utf8 into it.
uint16_t *utf8ToUtf16(const char *utf8, int *len)
{
    int n = utf8CountUtf16CodeUnits(utf8);
    if (len)
        *len = n;
    uint16_t *utf16 = (uint16_t *)gmallocn(n + 1, sizeof(uint16_t));
    utf8ToUtf16(utf8, utf16);
    return utf16;
}

GooString *utf8ToUtf16WithBom(const std::string &utf8)
{
    GooString *result = new GooString();
    if (utf8.empty()) {
        return result;
    }
    int tmp_length; // Number of UTF-16 symbols.
    char *tmp_str = (char *)utf8ToUtf16(utf8.c_str(), &tmp_length);
#ifndef WORDS_BIGENDIAN
    for (int i = 0; i < tmp_length; i++) {
        std::swap(tmp_str[i * 2], tmp_str[i * 2 + 1]);
    }
#endif

    result->prependUnicodeMarker();
    result->append(tmp_str, tmp_length * 2);
    gfree(tmp_str);
    return result;
}

static const uint32_t UTF16_ACCEPT = 0;
static const uint32_t UTF16_REJECT = -1;

// Initialise state to 0. Returns UTF16_ACCEPT when a valid code point
// has been found, UTF16_REJECT when invalid code unit for this state,
// some other valid if another code unit needs to be read.
inline uint32_t decodeUtf16(uint32_t *state, uint32_t *codePoint, uint16_t codeUnit)
{
    if (*state == 0) {
        if (codeUnit >= 0xd800 && codeUnit < 0xdc00) { /* surrogate pair */
            *state = codeUnit;
            return *state;
        } else if (codeUnit >= 0xdc00 && codeUnit < 0xe000) {
            /* invalid low surrogate */
            return UTF16_REJECT;
        } else {
            *codePoint = codeUnit;
            return UTF16_ACCEPT;
        }
    } else {
        if (codeUnit >= 0xdc00 && codeUnit < 0xe000) {
            *codePoint = (((*state & 0x3ff) << 10) | (codeUnit & 0x3ff)) + 0x10000;
            *state = 0;
            return UTF16_ACCEPT;
        } else {
            /* invalid high surrogate */
            return UTF16_REJECT;
        }
    }
}

// Count number of UTF-8 bytes required to convert a UTF-16 string to
// UTF-8 (excluding terminating NULL).
int utf16CountUtf8Bytes(const uint16_t *utf16)
{
    uint32_t codepoint = 0;
    uint32_t state = 0;
    int count = 0;

    while (*utf16) {
        decodeUtf16(&state, &codepoint, *utf16);
        if (state == UTF16_ACCEPT) {
            if (codepoint < 0x80)
                count++;
            else if (codepoint < 0x800)
                count += 2;
            else if (codepoint < 0x10000)
                count += 3;
            else if (codepoint <= UCS4_MAX)
                count += 4;
            else
                count += 3; // replace with REPLACEMENT_CHAR
        } else if (state == UTF16_REJECT) {
            count += 3; // replace with REPLACEMENT_CHAR
            state = 0;
        }
        utf16++;
    }
    if (state != UTF8_ACCEPT && state != UTF8_REJECT)
        count++; // replace with REPLACEMENT_CHAR

    return count;
}

// Convert UTF-16 to UTF-8
//  utf16- UTF-16 string to convert. If not null terminated, set maxUtf16 to num
//        code units to convert
//  utf8 - output buffer to write UTF-8 to. Output will always be null terminated.
//  maxUtf8 - maximum size of output buffer including space for null.
//  maxUtf16 - maximum number of UTF-16 code units to convert. Conversion stops when
//            either this count is reached or a null is encountered.
// Returns number of UTF-8 bytes written (excluding NULL).
int utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8, int maxUtf16)
{
    uint32_t codepoint = 0;
    uint32_t state = 0;
    int nIn = 0;
    int nOut = 0;
    char *p = utf8;
    while (*utf16 && nIn < maxUtf16 && nOut < maxUtf8 - 1) {
        decodeUtf16(&state, &codepoint, *utf16);
        if (state == UTF16_ACCEPT || state == UTF16_REJECT) {
            if (state == UTF16_REJECT || codepoint > UCS4_MAX) {
                codepoint = REPLACEMENT_CHAR;
                state = 0;
            }

            int bufSize = maxUtf8 - nOut;
            int count = mapUTF8(codepoint, p, bufSize);
            p += count;
            nOut += count;
        }
        utf16++;
        nIn++;
    }
    // replace any trailing bytes too short for a valid UTF-8 with a replacement char
    if (state != UTF16_ACCEPT && state != UTF16_REJECT && nOut < maxUtf8 - 1) {
        int bufSize = maxUtf8 - nOut;
        int count = mapUTF8(REPLACEMENT_CHAR, p, bufSize);
        p += count;
        nOut += count;
        nOut++;
    }
    if (nOut > maxUtf8 - 1)
        nOut = maxUtf8 - 1;
    utf8[nOut] = 0;
    return nOut;
}

// Allocate utf8 string and convert utf16 into it.
char *utf16ToUtf8(const uint16_t *utf16, int *len)
{
    int n = utf16CountUtf8Bytes(utf16);
    if (len)
        *len = n;
    char *utf8 = (char *)gmalloc(n + 1);
    utf16ToUtf8(utf16, utf8);
    return utf8;
}

void unicodeToAscii7(const Unicode *in, int len, Unicode **ucs4_out, int *out_len, const int *in_idx, int **indices)
{
    const UnicodeMap *uMap = globalParams->getUnicodeMap("ASCII7");
    int *idx = nullptr;

    if (!len) {
        *ucs4_out = nullptr;
        *out_len = 0;
        return;
    }

    if (indices) {
        if (!in_idx)
            indices = nullptr;
        else
            idx = (int *)gmallocn(len * 8 + 1, sizeof(int));
    }

    std::string str;

    char buf[8]; // 8 is enough for mapping an unicode char to a string
    int i, n, k;

    for (i = k = 0; i < len; ++i) {
        n = uMap->mapUnicode(in[i], buf, sizeof(buf));
        if (!n) {
            // the Unicode char could not be converted to ascii7 counterpart
            // so just fill with a non-printable ascii char
            buf[0] = 31;
            n = 1;
        }
        str.append(buf, n);
        if (indices) {
            for (; n > 0; n--)
                idx[k++] = in_idx[i];
        }
    }

    *out_len = TextStringToUCS4(str, ucs4_out);

    if (indices) {
        idx[k] = in_idx[len];
        *indices = idx;
    }
}