1 //========================================================================
2 //
3 // UTF.cc
4 //
5 // Copyright 2001-2003 Glyph & Cog, LLC
6 //
7 //========================================================================
8 
9 //========================================================================
10 //
11 // Modified under the Poppler project - http://poppler.freedesktop.org
12 //
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
15 //
16 // Copyright (C) 2008 Koji Otani <sho@bbr.jp>
17 // Copyright (C) 2012, 2017, 2021 Adrian Johnson <ajohnson@redneon.com>
18 // Copyright (C) 2012 Hib Eris <hib@hiberis.nl>
19 // Copyright (C) 2016, 2018-2021 Albert Astals Cid <aacid@kde.org>
20 // Copyright (C) 2016 Jason Crain <jason@aquaticape.us>
21 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
22 // Copyright (C) 2018, 2020 Nelson Benítez León <nbenitezl@gmail.com>
23 // Copyright (C) 2021 Georgiy Sgibnev <georgiy@sgibnev.com>. Work sponsored by lab50.net.
24 //
25 // To see a description of the changes please see the Changelog file that
26 // came with your tarball or type make ChangeLog if you are building from git
27 //
28 //========================================================================
29 
30 #include "goo/gmem.h"
31 #include "PDFDocEncoding.h"
32 #include "GlobalParams.h"
33 #include "UnicodeMap.h"
34 #include "UTF.h"
35 #include "UnicodeMapFuncs.h"
36 #include <algorithm>
37 
38 #include <config.h>
39 
UnicodeIsValid(Unicode ucs4)40 bool UnicodeIsValid(Unicode ucs4)
41 {
42     return (ucs4 < 0x110000) && ((ucs4 & 0xfffff800) != 0xd800) && (ucs4 < 0xfdd0 || ucs4 > 0xfdef) && ((ucs4 & 0xfffe) != 0xfffe);
43 }
44 
UTF16toUCS4(const Unicode * utf16,int utf16Len,Unicode ** ucs4_out)45 int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4_out)
46 {
47     int i, n, len;
48     Unicode *u;
49 
50     // count characters
51     len = 0;
52     for (i = 0; i < utf16Len; i++) {
53         if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00 && i + 1 < utf16Len && utf16[i + 1] >= 0xdc00 && utf16[i + 1] < 0xe000) {
54             i++; /* surrogate pair */
55         }
56         len++;
57     }
58     if (ucs4_out == nullptr)
59         return len;
60 
61     u = (Unicode *)gmallocn(len, sizeof(Unicode));
62     n = 0;
63     // convert string
64     for (i = 0; i < utf16Len; i++) {
65         if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00) { /* surrogate pair */
66             if (i + 1 < utf16Len && utf16[i + 1] >= 0xdc00 && utf16[i + 1] < 0xe000) {
67                 /* next code is a low surrogate */
68                 u[n] = (((utf16[i] & 0x3ff) << 10) | (utf16[i + 1] & 0x3ff)) + 0x10000;
69                 ++i;
70             } else {
71                 /* missing low surrogate
72                    replace it with REPLACEMENT CHARACTER (U+FFFD) */
73                 u[n] = 0xfffd;
74             }
75         } else if (utf16[i] >= 0xdc00 && utf16[i] < 0xe000) {
76             /* invalid low surrogate
77                replace it with REPLACEMENT CHARACTER (U+FFFD) */
78             u[n] = 0xfffd;
79         } else {
80             u[n] = utf16[i];
81         }
82         if (!UnicodeIsValid(u[n])) {
83             u[n] = 0xfffd;
84         }
85         n++;
86     }
87     *ucs4_out = u;
88     return len;
89 }
90 
TextStringToUCS4(const std::string & textStr,Unicode ** ucs4)91 int TextStringToUCS4(const std::string &textStr, Unicode **ucs4)
92 {
93     int i, len;
94     const char *s;
95     Unicode *u;
96     bool isUnicode, isUnicodeLE;
97 
98     len = textStr.size();
99     s = textStr.c_str();
100     if (len == 0) {
101         *ucs4 = nullptr;
102         return 0;
103     }
104 
105     if (GooString::hasUnicodeMarker(textStr)) {
106         isUnicode = true;
107         isUnicodeLE = false;
108     } else if (GooString::hasUnicodeMarkerLE(textStr)) {
109         isUnicode = false;
110         isUnicodeLE = true;
111     } else {
112         isUnicode = false;
113         isUnicodeLE = false;
114     }
115 
116     if (isUnicode || isUnicodeLE) {
117         Unicode *utf16;
118         len = len / 2 - 1;
119         if (len > 0) {
120             utf16 = new Unicode[len];
121             for (i = 0; i < len; i++) {
122                 if (isUnicode)
123                     utf16[i] = (s[2 + i * 2] & 0xff) << 8 | (s[3 + i * 2] & 0xff);
124                 else // UnicodeLE
125                     utf16[i] = (s[3 + i * 2] & 0xff) << 8 | (s[2 + i * 2] & 0xff);
126             }
127             len = UTF16toUCS4(utf16, len, &u);
128             delete[] utf16;
129         } else {
130             u = nullptr;
131         }
132     } else {
133         u = (Unicode *)gmallocn(len, sizeof(Unicode));
134         for (i = 0; i < len; i++) {
135             u[i] = pdfDocEncoding[s[i] & 0xff];
136         }
137     }
138     *ucs4 = u;
139     return len;
140 }
141 
UnicodeIsWhitespace(Unicode ucs4)142 bool UnicodeIsWhitespace(Unicode ucs4)
143 {
144     static Unicode const spaces[] = { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x0085, 0x00A0, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000 };
145     Unicode const *end = spaces + sizeof(spaces) / sizeof(spaces[0]);
146     Unicode const *i = std::lower_bound(spaces, end, ucs4);
147     return (i != end && *i == ucs4);
148 }
149 
150 //
151 // decodeUtf8() and decodeUtf8Table are:
152 //
153 // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
154 //
155 // Permission is hereby granted, free of charge, to any person
156 // obtaining a copy of this software and associated documentation
157 // files (the "Software"), to deal in the Software without
158 // restriction, including without limitation the rights to use, copy,
159 // modify, merge, publish, distribute, sublicense, and/or sell copies
160 // of the Software, and to permit persons to whom the Software is
161 // furnished to do so, subject to the following conditions:
162 
163 // The above copyright notice and this permission notice shall be
164 // included in all copies or substantial portions of the Software.
165 //
166 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
167 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
168 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
169 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
170 // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
171 // ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
172 // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
173 // SOFTWARE.
174 //
175 // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
176 //
177 static const uint32_t UTF8_ACCEPT = 0;
178 static const uint32_t UTF8_REJECT = 12;
179 static const uint32_t UCS4_MAX = 0x10FFFF;
180 static const Unicode REPLACEMENT_CHAR = 0xFFFD;
181 
// clang-format off
static const uint8_t decodeUtf8Table[] = {
  // Part 1 (256 entries): character class of every possible byte value.
  // The class selects a column of the transition table and, for lead
  // bytes, also the number of payload bits to keep (see decodeUtf8()).
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..0f
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10..1f
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..2f
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30..3f
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4f
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50..5f
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6f
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70..7f
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80..8f
   9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 90..9f
   7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..af
   7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // b0..bf
   8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..cf
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // d0..df
  10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, // e0..ef
  11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // f0..ff

  // Part 2 (9 states x 12 classes): next state, indexed by
  // 256 + state + class. States are stored pre-multiplied by 12.
  // class:
  // 0   1   2   3   4   5   6   7   8   9  10  11
     0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, // state  0 (accept / start)
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // state 12 (reject, never left)
    12,  0, 12, 12, 12, 12, 12,  0, 12,  0, 12, 12, // state 24
    12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, // state 36
    12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, // state 48
    12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, // state 60
    12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // state 72
    12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // state 84
    12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // state 96
};
// clang-format on
204 
205 // Decode utf8 state machine for fast UTF-8 decoding. Initialise state
206 // to 0 and call decodeUtf8() for each byte of UTF-8. Return value
207 // (and state) is UTF8_ACCEPT when it has found a valid codepoint
208 // (codepoint returned in codep), UTF8_REJECT when the byte is not
209 // allowed to occur at its position, and some other positive value if
210 // more bytes have to be read.  Reset state to 0 to recover from
211 // errors.
decodeUtf8(uint32_t * state,uint32_t * codep,char byte)212 inline uint32_t decodeUtf8(uint32_t *state, uint32_t *codep, char byte)
213 {
214     uint32_t b = (unsigned char)byte;
215     uint32_t type = decodeUtf8Table[b];
216 
217     *codep = (*state != UTF8_ACCEPT) ? (b & 0x3fu) | (*codep << 6) : (0xff >> type) & (b);
218 
219     *state = decodeUtf8Table[256 + *state + type];
220     return *state;
221 }
222 
utf8CountUCS4(const char * utf8)223 int utf8CountUCS4(const char *utf8)
224 {
225     uint32_t codepoint;
226     uint32_t state = 0;
227     int count = 0;
228 
229     while (*utf8) {
230         decodeUtf8(&state, &codepoint, *utf8);
231         if (state == UTF8_ACCEPT) {
232             count++;
233         } else if (state == UTF8_REJECT) {
234             count++; // replace with REPLACEMENT_CHAR
235             state = 0;
236         }
237         utf8++;
238     }
239     if (state != UTF8_ACCEPT && state != UTF8_REJECT)
240         count++; // replace with REPLACEMENT_CHAR
241 
242     return count;
243 }
244 
utf8ToUCS4(const char * utf8,Unicode ** ucs4_out)245 int utf8ToUCS4(const char *utf8, Unicode **ucs4_out)
246 {
247     int len = utf8CountUCS4(utf8);
248     Unicode *u = (Unicode *)gmallocn(len, sizeof(Unicode));
249     int n = 0;
250     uint32_t codepoint;
251     uint32_t state = 0;
252 
253     while (*utf8 && n < len) {
254         decodeUtf8(&state, &codepoint, *utf8);
255         if (state == UTF8_ACCEPT) {
256             u[n++] = codepoint;
257         } else if (state == UTF8_REJECT) {
258             u[n++] = REPLACEMENT_CHAR; // invalid byte for this position
259             state = 0;
260         }
261         utf8++;
262     }
263     if (state != UTF8_ACCEPT && state != UTF8_REJECT)
264         u[n] = REPLACEMENT_CHAR; // invalid byte for this position
265 
266     *ucs4_out = u;
267     return len;
268 }
269 
270 // Count number of UTF-16 code units required to convert a UTF-8 string
271 // (excluding terminating NULL). Each invalid byte is counted as a
272 // code point since the UTF-8 conversion functions will replace it with
273 // REPLACEMENT_CHAR.
utf8CountUtf16CodeUnits(const char * utf8)274 int utf8CountUtf16CodeUnits(const char *utf8)
275 {
276     uint32_t codepoint;
277     uint32_t state = 0;
278     int count = 0;
279 
280     while (*utf8) {
281         decodeUtf8(&state, &codepoint, *utf8);
282         if (state == UTF8_ACCEPT) {
283             if (codepoint < 0x10000)
284                 count++;
285             else if (codepoint <= UCS4_MAX)
286                 count += 2;
287             else
288                 count++; // replace with REPLACEMENT_CHAR
289         } else if (state == UTF8_REJECT) {
290             count++; // replace with REPLACEMENT_CHAR
291             state = 0;
292         }
293         utf8++;
294     }
295     if (state != UTF8_ACCEPT && state != UTF8_REJECT)
296         count++; // replace with REPLACEMENT_CHAR
297 
298     return count;
299 }
300 
301 // Convert UTF-8 to UTF-16
302 //  utf8- UTF-8 string to convert. If not null terminated, set maxUtf8 to num
303 //        bytes to convert
304 //  utf16 - output buffer to write UTF-16 to. Output will always be null terminated.
305 //  maxUtf16 - maximum size of output buffer including space for null.
306 //  maxUtf8 - maximum number of UTF-8 bytes to convert. Conversion stops when
307 //            either this count is reached or a null is encountered.
308 // Returns number of UTF-16 code units written (excluding NULL).
utf8ToUtf16(const char * utf8,uint16_t * utf16,int maxUtf16,int maxUtf8)309 int utf8ToUtf16(const char *utf8, uint16_t *utf16, int maxUtf16, int maxUtf8)
310 {
311     uint16_t *p = utf16;
312     uint32_t codepoint;
313     uint32_t state = 0;
314     int nIn = 0;
315     int nOut = 0;
316     while (*utf8 && nIn < maxUtf8 && nOut < maxUtf16 - 1) {
317         decodeUtf8(&state, &codepoint, *utf8);
318         if (state == UTF8_ACCEPT) {
319             if (codepoint < 0x10000) {
320                 *p++ = (uint16_t)codepoint;
321                 nOut++;
322             } else if (codepoint <= UCS4_MAX) {
323                 *p++ = (uint16_t)(0xD7C0 + (codepoint >> 10));
324                 *p++ = (uint16_t)(0xDC00 + (codepoint & 0x3FF));
325                 nOut += 2;
326             } else {
327                 *p++ = REPLACEMENT_CHAR;
328                 nOut++;
329                 state = 0;
330             }
331         } else if (state == UTF8_REJECT) {
332             *p++ = REPLACEMENT_CHAR; // invalid byte for this position
333             nOut++;
334         }
335         utf8++;
336         nIn++;
337     }
338     // replace any trailing bytes too short for a valid UTF-8 with a replacement char
339     if (state != UTF8_ACCEPT && state != UTF8_REJECT && nOut < maxUtf16 - 1) {
340         *p++ = REPLACEMENT_CHAR;
341         nOut++;
342     }
343     if (nOut > maxUtf16 - 1)
344         nOut = maxUtf16 - 1;
345     utf16[nOut] = 0;
346     return nOut;
347 }
348 
349 // Allocate utf16 string and convert utf8 into it.
utf8ToUtf16(const char * utf8,int * len)350 uint16_t *utf8ToUtf16(const char *utf8, int *len)
351 {
352     int n = utf8CountUtf16CodeUnits(utf8);
353     if (len)
354         *len = n;
355     uint16_t *utf16 = (uint16_t *)gmallocn(n + 1, sizeof(uint16_t));
356     utf8ToUtf16(utf8, utf16);
357     return utf16;
358 }
359 
utf8ToUtf16WithBom(const std::string & utf8)360 GooString *utf8ToUtf16WithBom(const std::string &utf8)
361 {
362     GooString *result = new GooString();
363     if (utf8.empty()) {
364         return result;
365     }
366     int tmp_length; // Number of UTF-16 symbols.
367     char *tmp_str = (char *)utf8ToUtf16(utf8.c_str(), &tmp_length);
368 #ifndef WORDS_BIGENDIAN
369     for (int i = 0; i < tmp_length; i++) {
370         std::swap(tmp_str[i * 2], tmp_str[i * 2 + 1]);
371     }
372 #endif
373 
374     result->prependUnicodeMarker();
375     result->append(tmp_str, tmp_length * 2);
376     gfree(tmp_str);
377     return result;
378 }
379 
static const uint32_t UTF16_ACCEPT = 0;
static const uint32_t UTF16_REJECT = -1;

// One step of the UTF-16 decoder.
//  state     - decoder state; initialise to 0 and pass the same variable for
//              every code unit
//  codePoint - receives the decoded code point when UTF16_ACCEPT is returned
//  codeUnit  - next input code unit
// Returns the new state, which is also stored in *state: UTF16_ACCEPT when a
// valid code point has been found, UTF16_REJECT when the code unit is invalid
// for the current state, some other value (the pending high surrogate) if
// another code unit needs to be read.
// As with decodeUtf8(), the decoder stays in UTF16_REJECT until the caller
// resets *state to 0. The code unit that triggered the rejection is consumed.
inline uint32_t decodeUtf16(uint32_t *state, uint32_t *codePoint, uint16_t codeUnit)
{
    if (*state == UTF16_REJECT) {
        // caller has not recovered from a previous error yet
        return UTF16_REJECT;
    }

    const bool isHighSurrogate = codeUnit >= 0xd800 && codeUnit < 0xdc00;
    const bool isLowSurrogate = codeUnit >= 0xdc00 && codeUnit < 0xe000;

    if (*state == UTF16_ACCEPT) {
        if (isHighSurrogate) {
            /* first half of a surrogate pair: remember it */
            *state = codeUnit;
            return *state;
        }
        if (isLowSurrogate) {
            /* low surrogate without a preceding high surrogate */
            *state = UTF16_REJECT; // callers test *state, not only the return value
            return UTF16_REJECT;
        }
        *codePoint = codeUnit;
        return UTF16_ACCEPT;
    }

    /* a high surrogate is pending in *state */
    if (isLowSurrogate) {
        *codePoint = (((*state & 0x3ff) << 10) | (codeUnit & 0x3ff)) + 0x10000;
        *state = UTF16_ACCEPT;
        return UTF16_ACCEPT;
    }

    /* high surrogate not followed by a low surrogate */
    *state = UTF16_REJECT;
    return UTF16_REJECT;
}
410 
411 // Count number of UTF-8 bytes required to convert a UTF-16 string to
412 // UTF-8 (excluding terminating NULL).
utf16CountUtf8Bytes(const uint16_t * utf16)413 int utf16CountUtf8Bytes(const uint16_t *utf16)
414 {
415     uint32_t codepoint = 0;
416     uint32_t state = 0;
417     int count = 0;
418 
419     while (*utf16) {
420         decodeUtf16(&state, &codepoint, *utf16);
421         if (state == UTF16_ACCEPT) {
422             if (codepoint < 0x80)
423                 count++;
424             else if (codepoint < 0x800)
425                 count += 2;
426             else if (codepoint < 0x10000)
427                 count += 3;
428             else if (codepoint <= UCS4_MAX)
429                 count += 4;
430             else
431                 count += 3; // replace with REPLACEMENT_CHAR
432         } else if (state == UTF16_REJECT) {
433             count += 3; // replace with REPLACEMENT_CHAR
434             state = 0;
435         }
436         utf16++;
437     }
438     if (state != UTF8_ACCEPT && state != UTF8_REJECT)
439         count++; // replace with REPLACEMENT_CHAR
440 
441     return count;
442 }
443 
444 // Convert UTF-16 to UTF-8
445 //  utf16- UTF-16 string to convert. If not null terminated, set maxUtf16 to num
446 //        code units to convert
447 //  utf8 - output buffer to write UTF-8 to. Output will always be null terminated.
448 //  maxUtf8 - maximum size of output buffer including space for null.
449 //  maxUtf16 - maximum number of UTF-16 code units to convert. Conversion stops when
450 //            either this count is reached or a null is encountered.
451 // Returns number of UTF-8 bytes written (excluding NULL).
utf16ToUtf8(const uint16_t * utf16,char * utf8,int maxUtf8,int maxUtf16)452 int utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8, int maxUtf16)
453 {
454     uint32_t codepoint = 0;
455     uint32_t state = 0;
456     int nIn = 0;
457     int nOut = 0;
458     char *p = utf8;
459     while (*utf16 && nIn < maxUtf16 && nOut < maxUtf8 - 1) {
460         decodeUtf16(&state, &codepoint, *utf16);
461         if (state == UTF16_ACCEPT || state == UTF16_REJECT) {
462             if (state == UTF16_REJECT || codepoint > UCS4_MAX) {
463                 codepoint = REPLACEMENT_CHAR;
464                 state = 0;
465             }
466 
467             int bufSize = maxUtf8 - nOut;
468             int count = mapUTF8(codepoint, p, bufSize);
469             p += count;
470             nOut += count;
471         }
472         utf16++;
473         nIn++;
474     }
475     // replace any trailing bytes too short for a valid UTF-8 with a replacement char
476     if (state != UTF16_ACCEPT && state != UTF16_REJECT && nOut < maxUtf8 - 1) {
477         int bufSize = maxUtf8 - nOut;
478         int count = mapUTF8(REPLACEMENT_CHAR, p, bufSize);
479         p += count;
480         nOut += count;
481         nOut++;
482     }
483     if (nOut > maxUtf8 - 1)
484         nOut = maxUtf8 - 1;
485     utf8[nOut] = 0;
486     return nOut;
487 }
488 
489 // Allocate utf8 string and convert utf16 into it.
utf16ToUtf8(const uint16_t * utf16,int * len)490 char *utf16ToUtf8(const uint16_t *utf16, int *len)
491 {
492     int n = utf16CountUtf8Bytes(utf16);
493     if (len)
494         *len = n;
495     char *utf8 = (char *)gmalloc(n + 1);
496     utf16ToUtf8(utf16, utf8);
497     return utf8;
498 }
499 
unicodeToAscii7(const Unicode * in,int len,Unicode ** ucs4_out,int * out_len,const int * in_idx,int ** indices)500 void unicodeToAscii7(const Unicode *in, int len, Unicode **ucs4_out, int *out_len, const int *in_idx, int **indices)
501 {
502     const UnicodeMap *uMap = globalParams->getUnicodeMap("ASCII7");
503     int *idx = nullptr;
504 
505     if (!len) {
506         *ucs4_out = nullptr;
507         *out_len = 0;
508         return;
509     }
510 
511     if (indices) {
512         if (!in_idx)
513             indices = nullptr;
514         else
515             idx = (int *)gmallocn(len * 8 + 1, sizeof(int));
516     }
517 
518     std::string str;
519 
520     char buf[8]; // 8 is enough for mapping an unicode char to a string
521     int i, n, k;
522 
523     for (i = k = 0; i < len; ++i) {
524         n = uMap->mapUnicode(in[i], buf, sizeof(buf));
525         if (!n) {
526             // the Unicode char could not be converted to ascii7 counterpart
527             // so just fill with a non-printable ascii char
528             buf[0] = 31;
529             n = 1;
530         }
531         str.append(buf, n);
532         if (indices) {
533             for (; n > 0; n--)
534                 idx[k++] = in_idx[i];
535         }
536     }
537 
538     *out_len = TextStringToUCS4(str, ucs4_out);
539 
540     if (indices) {
541         idx[k] = in_idx[len];
542         *indices = idx;
543     }
544 }
545