1 //========================================================================
2 //
3 // UTF.h
4 //
5 // Copyright 2001-2003 Glyph & Cog, LLC
6 //
7 //========================================================================
8 
9 //========================================================================
10 //
11 // Modified under the Poppler project - http://poppler.freedesktop.org
12 //
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
15 //
16 // Copyright (C) 2008 Koji Otani <sho@bbr.jp>
17 // Copyright (C) 2012 Adrian Johnson <ajohnson@redneon.com>
18 // Copyright (C) 2012 Hib Eris <hib@hiberis.nl>
19 // Copyright (C) 2016 Albert Astals Cid <aacid@kde.org>
20 //
21 // To see a description of the changes please see the Changelog file that
22 // came with your tarball or type make ChangeLog if you are building from git
23 //
24 //========================================================================
25 
26 #include "goo/gmem.h"
27 #include "PDFDocEncoding.h"
28 #include "UTF.h"
29 
UnicodeIsValid(Unicode ucs4)30 bool UnicodeIsValid(Unicode ucs4)
31 {
32   return (ucs4 < 0x110000) &&
33     ((ucs4 & 0xfffff800) != 0xd800) &&
34     (ucs4 < 0xfdd0 || ucs4 > 0xfdef) &&
35     ((ucs4 & 0xfffe) != 0xfffe);
36 }
37 
UTF16toUCS4(const Unicode * utf16,int utf16Len,Unicode ** ucs4)38 int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4)
39 {
40   int i, n, len;
41   Unicode *u;
42 
43   // count characters
44   len = 0;
45   for (i = 0; i < utf16Len; i++) {
46     if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00 && i + 1 < utf16Len &&
47         utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) {
48       i++; /* surrogate pair */
49     }
50     len++;
51   }
52   if (ucs4 == NULL)
53     return len;
54 
55   u = (Unicode*)gmallocn(len, sizeof(Unicode));
56   n = 0;
57   // convert string
58   for (i = 0; i < utf16Len; i++) {
59     if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00) { /* surrogate pair */
60       if (i + 1 < utf16Len && utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) {
61 	/* next code is a low surrogate */
62 	u[n] = (((utf16[i] & 0x3ff) << 10) | (utf16[i+1] & 0x3ff)) + 0x10000;
63 	++i;
64       } else {
65 	/* missing low surrogate
66 	   replace it with REPLACEMENT CHARACTER (U+FFFD) */
67 	u[n] = 0xfffd;
68       }
69     } else if (utf16[i] >= 0xdc00 && utf16[i] < 0xe000) {
70       /* invalid low surrogate
71 	 replace it with REPLACEMENT CHARACTER (U+FFFD) */
72       u[n] = 0xfffd;
73     } else {
74       u[n] = utf16[i];
75     }
76     if (!UnicodeIsValid(u[n])) {
77       u[n] = 0xfffd;
78     }
79     n++;
80   }
81   *ucs4 = u;
82   return len;
83 }
84 
TextStringToUCS4(GooString * textStr,Unicode ** ucs4)85 int TextStringToUCS4(GooString *textStr, Unicode **ucs4)
86 {
87   int i, len;
88   const char *s;
89   Unicode *u;
90 
91   len = textStr->getLength();
92   s = textStr->getCString();
93   if (len == 0) {
94     *ucs4 = 0;
95     return 0;
96   }
97 
98   if (textStr->hasUnicodeMarker()) {
99     Unicode *utf16;
100     len = len/2 - 1;
101     if (len > 0) {
102       utf16 = new Unicode[len];
103       for (i = 0 ; i < len; i++) {
104         utf16[i] = (s[2 + i*2] & 0xff) << 8 | (s[3 + i*2] & 0xff);
105       }
106       len = UTF16toUCS4(utf16, len, &u);
107       delete[] utf16;
108     } else {
109       u = NULL;
110     }
111   } else {
112     u = (Unicode*)gmallocn(len, sizeof(Unicode));
113     for (i = 0 ; i < len; i++) {
114       u[i] = pdfDocEncoding[s[i] & 0xff];
115     }
116   }
117   *ucs4 = u;
118   return len;
119 }
120