1 //======================================================================== 2 // 3 // UTF.h 4 // 5 // This file is licensed under the GPLv2 or later 6 // 7 // Copyright (C) 2012, 2017, 2021 Adrian Johnson <ajohnson@redneon.com> 8 // Copyright (C) 2016 Jason Crain <jason@aquaticape.us> 9 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich 10 // Copyright (C) 2018 Nelson Benítez León <nbenitezl@gmail.com> 11 // Copyright (C) 2019-2021 Albert Astals Cid <aacid@kde.org> 12 // Copyright (C) 2021 Georgiy Sgibnev <georgiy@sgibnev.com>. Work sponsored by lab50.net. 13 // 14 //======================================================================== 15 16 #ifndef UTF_H 17 #define UTF_H 18 19 #include <cstdint> 20 #include <climits> 21 22 #include "goo/GooString.h" 23 #include "CharTypes.h" 24 #include "poppler_private_export.h" 25 26 // Convert a UTF-16 string to a UCS-4 27 // utf16 - utf16 bytes 28 // utf16_len - number of UTF-16 characters 29 // ucs4_out - if not NULL, allocates and returns UCS-4 string. Free with gfree. 30 // returns number of UCS-4 characters 31 int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4_out); 32 33 // Convert a PDF Text String to UCS-4 34 // s - PDF text string 35 // ucs4 - if the number of UCS-4 characters is > 0, allocates and 36 // returns UCS-4 string. Free with gfree. 37 // returns number of UCS-4 characters 38 int POPPLER_PRIVATE_EXPORT TextStringToUCS4(const std::string &textStr, Unicode **ucs4); 39 40 // check if UCS-4 character is valid 41 bool UnicodeIsValid(Unicode ucs4); 42 43 // is a unicode whitespace character 44 bool UnicodeIsWhitespace(Unicode ucs4); 45 46 // Count number of UCS-4 characters required to convert a UTF-8 string to 47 // UCS-4 (excluding terminating NULL). 48 int POPPLER_PRIVATE_EXPORT utf8CountUCS4(const char *utf8); 49 50 // Convert a UTF-8 string to a UCS-4 51 // utf8 - utf8 bytes 52 // ucs4_out - if not NULL, allocates and returns UCS-4 string. Free with gfree. 53 // returns number of UCS-4 characters 54 int POPPLER_PRIVATE_EXPORT utf8ToUCS4(const char *utf8, Unicode **ucs4_out); 55 56 // Count number of UTF-16 code units required to convert a UTF-8 string 57 // (excluding terminating NULL). Each invalid byte is counted as a 58 // code point since the UTF-8 conversion functions will replace it with 59 // REPLACEMENT_CHAR. 60 int POPPLER_PRIVATE_EXPORT utf8CountUtf16CodeUnits(const char *utf8); 61 62 // Convert UTF-8 to UTF-16 63 // utf8- UTF-8 string to convert. If not null terminated, set maxUtf8 to num 64 // bytes to convert 65 // utf16 - output buffer to write UTF-16 to. Output will always be null terminated. 66 // maxUtf16 - maximum size of output buffer including space for null. 67 // maxUtf8 - maximum number of UTF-8 bytes to convert. Conversion stops when 68 // either this count is reached or a null is encountered. 69 // Returns number of UTF-16 code units written (excluding NULL). 70 int POPPLER_PRIVATE_EXPORT utf8ToUtf16(const char *utf8, uint16_t *utf16, int maxUtf16 = INT_MAX, int maxUtf8 = INT_MAX); 71 72 // Allocate utf16 string and convert utf8 into it. 73 uint16_t POPPLER_PRIVATE_EXPORT *utf8ToUtf16(const char *utf8, int *len = nullptr); 74 75 // Converts a UTF-8 string to a big endian UTF-16 string with BOM. 76 // The caller owns the returned pointer. 77 // utf8 - UTF-8 string to convert. An empty string is acceptable. 78 // Returns a big endian UTF-16 string with BOM or an empty string without BOM. 79 GooString POPPLER_PRIVATE_EXPORT *utf8ToUtf16WithBom(const std::string &utf8); 80 81 // Count number of UTF-8 bytes required to convert a UTF-16 string to 82 // UTF-8 (excluding terminating NULL). 83 int POPPLER_PRIVATE_EXPORT utf16CountUtf8Bytes(const uint16_t *utf16); 84 85 // Convert UTF-16 to UTF-8 86 // utf16- UTF-16 string to convert. If not null terminated, set maxUtf16 to num 87 // code units to convert 88 // utf8 - output buffer to write UTF-8 to. Output will always be null terminated. 89 // maxUtf8 - maximum size of output buffer including space for null. 90 // maxUtf16 - maximum number of UTF-16 code units to convert. Conversion stops when 91 // either this count is reached or a null is encountered. 92 // Returns number of UTF-8 bytes written (excluding NULL). 93 int POPPLER_PRIVATE_EXPORT utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8 = INT_MAX, int maxUtf16 = INT_MAX); 94 95 // Allocate utf8 string and convert utf16 into it. 96 char POPPLER_PRIVATE_EXPORT *utf16ToUtf8(const uint16_t *utf16, int *len = nullptr); 97 98 // Convert a UCS-4 string to pure ASCII (7bit) 99 // in - UCS-4 string bytes 100 // len - number of UCS-4 characters 101 // ucs4_out - if not NULL, allocates and returns UCS-4 string. Free with gfree. 102 // out_len - number of UCS-4 characters in ucs4_out. 103 // in_idx - if not NULL, the int array returned by the out fourth parameter of 104 // unicodeNormalizeNFKC() function. Optional, needed for @indices out parameter. 105 // indices - if not NULL, @indices is assigned the location of a newly-allocated array 106 // of length @out_len + 1, for each character in the ascii string giving the index 107 // of the corresponding character in the text of the line (thanks to this info 108 // being passed in @in_idx parameter). 109 void POPPLER_PRIVATE_EXPORT unicodeToAscii7(const Unicode *in, int len, Unicode **ucs4_out, int *out_len, const int *in_idx, int **indices); 110 111 #endif 112