1 /*
2 Source File : UnicodeString.cpp
3
4
5 Copyright 2011 Gal Kahana PDFWriter
6
7 Licensed under the Apache License, Version 2.0 (the "License");
8 you may not use this file except in compliance with the License.
9 You may obtain a copy of the License at
10
11 http://www.apache.org/licenses/LICENSE-2.0
12
13 Unless required by applicable law or agreed to in writing, software
14 distributed under the License is distributed on an "AS IS" BASIS,
15 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 See the License for the specific language governing permissions and
17 limitations under the License.
18
19
20 */
21 #include "UnicodeString.h"
22 #include "Trace.h"
23 #include <sstream>
24
25 using namespace PDFHummus;
26
UnicodeString(void)27 UnicodeString::UnicodeString(void)
28 {
29 }
30
~UnicodeString(void)31 UnicodeString::~UnicodeString(void)
32 {
33 }
34
// Copy constructor: the new object gets its own copy of the other string's code point list.
UnicodeString::UnicodeString(const UnicodeString& inOtherString)
	: mUnicodeCharacters(inOtherString.mUnicodeCharacters)
{
}
39
UnicodeString(const ULongList & inOtherList)40 UnicodeString::UnicodeString(const ULongList& inOtherList)
41 {
42 mUnicodeCharacters = inOtherList;
43 }
44
// Copy assignment: replaces this string's code points with a copy of the other string's.
// Returns *this so that assignments can be chained.
UnicodeString& UnicodeString::operator =(const UnicodeString& inOtherString)
{
	const ULongList& sourceCharacters = inOtherString.mUnicodeCharacters;

	mUnicodeCharacters = sourceCharacters;
	return *this;
}
50
operator =(const ULongList & inOtherList)51 UnicodeString& UnicodeString::operator =(const ULongList& inOtherList)
52 {
53 mUnicodeCharacters = inOtherList;
54 return *this;
55 }
56
operator ==(const UnicodeString & inOtherString) const57 bool UnicodeString::operator==(const UnicodeString& inOtherString) const
58 {
59 return mUnicodeCharacters == inOtherString.mUnicodeCharacters;
60 }
61
62
GetUnicodeList() const63 const ULongList& UnicodeString::GetUnicodeList() const
64 {
65 return mUnicodeCharacters;
66 }
67
GetUnicodeList()68 ULongList& UnicodeString::GetUnicodeList()
69 {
70 return mUnicodeCharacters;
71 }
72
FromUTF8(const std::string & inString)73 EStatusCode UnicodeString::FromUTF8(const std::string& inString)
74 {
75 mUnicodeCharacters.clear();
76 std::string::const_iterator it = inString.begin();
77 EStatusCode status = PDFHummus::eSuccess;
78 unsigned long unicodeCharacter;
79
80
81 for(; it != inString.end() && PDFHummus::eSuccess == status; ++it)
82 {
83 if((unsigned char)*it <= 0x7F)
84 {
85 unicodeCharacter = (unsigned char)*it;
86 }
87 else if(((unsigned char)*it>>5) == 0x6) // 2 bytes encoding
88 {
89 unicodeCharacter = (unsigned char)*it & 0x1F;
90 ++it;
91 if(it == inString.end() || ((unsigned char)*it>>6 != 0x2))
92 {
93 status = PDFHummus::eFailure;
94 break;
95 }
96
97 unicodeCharacter = (unicodeCharacter << 6) | ((unsigned char)*it & 0x3F);
98 }
99 else if(((unsigned char)*it>>4) == 0xE) // 3 bytes encoding
100 {
101 unicodeCharacter = (unsigned char)*it & 0xF;
102 for(int i =0 ; i < 2 && PDFHummus::eSuccess == status; ++i)
103 {
104 ++it;
105 if(it == inString.end() || ((unsigned char)*it>>6 != 0x2))
106 {
107 status = PDFHummus::eFailure;
108 break;
109 }
110 unicodeCharacter = (unicodeCharacter << 6) | ((unsigned char)*it & 0x3F);
111 }
112 if(status != PDFHummus::eSuccess)
113 break;
114 }
115 else if(((unsigned char)*it>>3) == 0x1E) // 4 bytes encoding
116 {
117 unicodeCharacter = (unsigned char)*it & 0x7;
118 for(int i =0 ; i < 3 && PDFHummus::eSuccess == status; ++i)
119 {
120 ++it;
121 if(it == inString.end() || ((unsigned char)*it>>6 != 0x2))
122 {
123 status = PDFHummus::eFailure;
124 break;
125 }
126 unicodeCharacter = (unicodeCharacter << 6) | ((unsigned char)*it & 0x3F);
127 }
128 if(status != PDFHummus::eSuccess)
129 break;
130 }
131 else
132 {
133 status = PDFHummus::eFailure;
134 break;
135 }
136
137 mUnicodeCharacters.push_back(unicodeCharacter);
138 }
139
140 return status;
141 }
142
ToUTF8() const143 EStatusCodeAndString UnicodeString::ToUTF8() const
144 {
145 ULongList::const_iterator it = mUnicodeCharacters.begin();
146 EStatusCode status = PDFHummus::eSuccess;
147 std::stringstream result;
148
149 for(; it != mUnicodeCharacters.end() && PDFHummus::eSuccess == status; ++it)
150 {
151 // Encode Unicode to UTF8
152 if(*it <= 0x7F)
153 {
154 result.put((unsigned char)*it);
155 }
156 else if(0x7F < *it && *it <= 0x7FF)
157 {
158 result.put((unsigned char)((0xC0 | (*it>>6))));
159 result.put((unsigned char)(0x80 | (*it & 0x3F)));
160 }
161 else if(0x7FF < *it && *it <= 0xFFFF)
162 {
163 result.put((unsigned char)(0xE0 | (*it>>12)));
164 result.put((unsigned char)(0x80 | ((*it>>6) & 0x3F)));
165 result.put((unsigned char)(0x80 | (*it & 0x3F)));
166 }
167 else if(0xFFFF < *it && *it <= 0x10FFFF)
168 {
169 result.put((unsigned char)(0xF0 | (*it>>18)));
170 result.put((unsigned char)(0x80 | ((*it>>12) & 0x3F)));
171 result.put((unsigned char)(0x80 | ((*it>>6) & 0x3F)));
172 result.put((unsigned char)(0x80 | (*it & 0x3F)));
173 }
174 else
175 {
176 TRACE_LOG("UnicodeString::ToUTF8, contains unicode characters that cannot be coded into UTF8");
177 status = PDFHummus::eFailure;
178 }
179 }
180
181 return EStatusCodeAndString(status,result.str());
182 }
183
FromUTF16(const std::string & inString)184 EStatusCode UnicodeString::FromUTF16(const std::string& inString)
185 {
186 return FromUTF16((const unsigned char*)inString.c_str(),(unsigned long)inString.length());
187 }
188
FromUTF16(const unsigned char * inString,unsigned long inLength)189 EStatusCode UnicodeString::FromUTF16(const unsigned char* inString, unsigned long inLength)
190 {
191 // Read BOM
192 if(inLength < 2)
193 return PDFHummus::eFailure;
194
195 if(inString[0] == 0xFE && inString[1] == 0xFF)
196 return FromUTF16BE(inString+2,inLength-2);
197 else if(inString[0] == 0xFF && inString[1] == 0xFE)
198 return FromUTF16LE(inString+2,inLength-2);
199 else
200 return PDFHummus::eFailure; // no bom
201 }
202
FromUTF16BE(const std::string & inString)203 EStatusCode UnicodeString::FromUTF16BE(const std::string& inString)
204 {
205 return FromUTF16BE((const unsigned char*)inString.c_str(),(unsigned long)inString.length());
206 }
207
FromUTF16BE(const unsigned char * inString,unsigned long inLength)208 EStatusCode UnicodeString::FromUTF16BE(const unsigned char* inString, unsigned long inLength)
209 {
210 mUnicodeCharacters.clear();
211 EStatusCode status = PDFHummus::eSuccess;
212
213 if(inLength % 2 != 0)
214 {
215 TRACE_LOG("UnicodeString::FromUTF16BE, invalid UTF16 string, has odd numbers of characters");
216 return PDFHummus::eFailure;
217 }
218
219 for(unsigned long i = 0; i < inLength-1 && PDFHummus::eSuccess == status; i+=2)
220 {
221 unsigned short buffer = (((unsigned short)inString[i])<<8) + inString[i+1];
222
223 if(0xD800 <= buffer && buffer <= 0xDBFF)
224 {
225 // Aha! high surrogate! this means that this character requires 2 w_chars
226 unsigned short highSurrogate = buffer;
227 i+=2;
228 if(i>=inLength-1)
229 {
230 TRACE_LOG("UnicodeString::FromUTF16BE, fault string - high surrogat encountered without a low surrogate");
231 status = PDFHummus::eFailure;
232 break;
233 }
234
235 unsigned short buffer = (((unsigned short)inString[i])<<8) + inString[i+1];
236 if(0xDC00 > buffer|| buffer > 0xDFFF)
237 {
238 TRACE_LOG("UnicodeString::FromUTF16BE, fault string - high surrogat encountered without a low surrogate");
239 status = PDFHummus::eFailure;
240 break;
241 }
242
243 unsigned short lowSurrogate = buffer;
244
245 mUnicodeCharacters.push_back(0x10000 + ((highSurrogate - 0xD800) << 5) + (lowSurrogate - 0xDC00));
246 }
247 else
248 mUnicodeCharacters.push_back(buffer);
249 }
250
251 return status;
252 }
253
FromUTF16LE(const std::string & inString)254 EStatusCode UnicodeString::FromUTF16LE(const std::string& inString)
255 {
256 return FromUTF16LE((const unsigned char*)inString.c_str(),(unsigned long)inString.length());
257 }
258
259
FromUTF16LE(const unsigned char * inString,unsigned long inLength)260 EStatusCode UnicodeString::FromUTF16LE(const unsigned char* inString, unsigned long inLength)
261 {
262 mUnicodeCharacters.clear();
263 EStatusCode status = PDFHummus::eSuccess;
264
265 if(inLength % 2 != 0)
266 {
267 TRACE_LOG("UnicodeString::FromUTF16LE, invalid UTF16 string, has odd numbers of characters");
268 return PDFHummus::eFailure;
269 }
270
271 for(unsigned long i = 0; i < inLength-1 && PDFHummus::eSuccess == status; i+=2)
272 {
273 unsigned short buffer = (((unsigned short)inString[i+1])<<8) + inString[i];
274
275 if(0xD800 <= buffer && buffer <= 0xDBFF)
276 {
277 // Aha! high surrogate! this means that this character requires 2 w_chars
278 unsigned short highSurrogate = buffer;
279 i+=2;
280 if(i>=inLength-1)
281 {
282 TRACE_LOG("UnicodeString::FromUTF16LE, fault string - high surrogat encountered without a low surrogate");
283 status = PDFHummus::eFailure;
284 break;
285 }
286
287 unsigned short buffer = (((unsigned short)inString[i+1])<<8) + inString[i];
288 if(0xDC00 > buffer|| buffer > 0xDFFF)
289 {
290 TRACE_LOG("UnicodeString::FromUTF16LE, fault string - high surrogat encountered without a low surrogate");
291 status = PDFHummus::eFailure;
292 break;
293 }
294
295 unsigned short lowSurrogate = buffer;
296
297 mUnicodeCharacters.push_back(0x10000 + ((highSurrogate - 0xD800) << 5) + (lowSurrogate - 0xDC00));
298 }
299 else
300 mUnicodeCharacters.push_back(buffer);
301 }
302
303 return status;
304 }
305
FromUTF16UShort(const unsigned short * inShorts,unsigned long inLength)306 EStatusCode UnicodeString::FromUTF16UShort(const unsigned short* inShorts, unsigned long inLength)
307 {
308 mUnicodeCharacters.clear();
309 EStatusCode status = PDFHummus::eSuccess;
310
311 for(unsigned long i = 0; i < inLength && PDFHummus::eSuccess == status; ++i)
312 {
313 if(0xD800 <= inShorts[i] && inShorts[i] <= 0xDBFF)
314 {
315 // Aha! high surrogate! this means that this character requires 2 w_chars
316 ++i;
317 if(i>=inLength)
318 {
319 TRACE_LOG("UnicodeString::FromUTF16UShort, fault string - high surrogat encountered without a low surrogate");
320 status = PDFHummus::eFailure;
321 break;
322 }
323
324 if(0xDC00 > inShorts[i] || inShorts[i] > 0xDFFF)
325 {
326 TRACE_LOG("UnicodeString::FromUTF16UShort, fault string - high surrogat encountered without a low surrogate");
327 status = PDFHummus::eFailure;
328 break;
329 }
330
331 mUnicodeCharacters.push_back(0x10000 + ((inShorts[i-1] - 0xD800) << 5) + (inShorts[i] - 0xDC00));
332 }
333 else
334 mUnicodeCharacters.push_back(inShorts[i]);
335 }
336
337 return status;
338 }
339
340
ToUTF16BE(bool inPrependWithBom) const341 EStatusCodeAndString UnicodeString::ToUTF16BE(bool inPrependWithBom) const
342 {
343 ULongList::const_iterator it = mUnicodeCharacters.begin();
344 EStatusCode status = PDFHummus::eSuccess;
345 std::stringstream result;
346
347 if(inPrependWithBom)
348 {
349 result.put((unsigned char)0xFE);
350 result.put((unsigned char)0xFF);
351 }
352
353 for(; it != mUnicodeCharacters.end() && PDFHummus::eSuccess == status; ++it)
354 {
355 if(*it < 0xD7FF || (0xE000 < *it && *it < 0xFFFF))
356 {
357 result.put((unsigned char)(*it>>8));
358 result.put((unsigned char)(*it & 0xFF));
359 }
360 else if(0xFFFF < *it && *it <= 0x10FFFF)
361 {
362 unsigned short highSurrogate = (unsigned short)(((*it - 0x10000) >> 10) + 0xD800);
363 unsigned short lowSurrogate = (unsigned short)(((*it - 0x10000) & 0x3FF) + 0xDC00);
364
365 result.put((unsigned char)(highSurrogate>>8));
366 result.put((unsigned char)(highSurrogate & 0xFF));
367 result.put((unsigned char)(lowSurrogate>>8));
368 result.put((unsigned char)(lowSurrogate & 0xFF));
369 }
370 else
371 {
372 status = PDFHummus::eFailure;
373 break;
374 }
375 }
376
377 return EStatusCodeAndString(status,result.str());
378 }
379
ToUTF16LE(bool inPrependWithBom) const380 EStatusCodeAndString UnicodeString::ToUTF16LE(bool inPrependWithBom) const
381 {
382 ULongList::const_iterator it = mUnicodeCharacters.begin();
383 EStatusCode status = PDFHummus::eSuccess;
384 std::stringstream result;
385
386 if(inPrependWithBom)
387 {
388 result.put((unsigned char)0xFF);
389 result.put((unsigned char)0xFE);
390 }
391
392 for(; it != mUnicodeCharacters.end() && PDFHummus::eSuccess == status; ++it)
393 {
394 if(*it < 0xD7FF || (0xE000 < *it && *it < 0xFFFF))
395 {
396 result.put((unsigned char)(*it & 0xFF));
397 result.put((unsigned char)(*it>>8));
398 }
399 else if(0xFFFF < *it && *it <= 0x10FFFF)
400 {
401 unsigned short highSurrogate = (unsigned short)(((*it - 0x10000) >> 10) + 0xD800);
402 unsigned short lowSurrogate = (unsigned short)(((*it - 0x10000) & 0x3FF) + 0xDC00);
403
404 result.put((unsigned char)(highSurrogate & 0xFF));
405 result.put((unsigned char)(highSurrogate>>8));
406 result.put((unsigned char)(lowSurrogate & 0xFF));
407 result.put((unsigned char)(lowSurrogate>>8));
408 }
409 else
410 {
411 status = PDFHummus::eFailure;
412 break;
413 }
414 }
415
416 return EStatusCodeAndString(status,result.str());
417 }
418
ToUTF16UShort() const419 EStatusCodeAndUShortList UnicodeString::ToUTF16UShort() const
420 {
421 ULongList::const_iterator it = mUnicodeCharacters.begin();
422 EStatusCode status = PDFHummus::eSuccess;
423 UShortList result;
424
425 for(; it != mUnicodeCharacters.end() && PDFHummus::eSuccess == status; ++it)
426 {
427 if(*it < 0xD7FF || (0xE000 < *it && *it < 0xFFFF))
428 {
429 result.push_back((unsigned short)*it);
430 }
431 else if(0xFFFF < *it && *it <= 0x10FFFF)
432 {
433 unsigned short highSurrogate = (unsigned short)(((*it - 0x10000) >> 10) + 0xD800);
434 unsigned short lowSurrogate = (unsigned short)(((*it - 0x10000) & 0x3FF) + 0xDC00);
435
436 result.push_back(highSurrogate);
437 result.push_back(lowSurrogate);
438 }
439 else
440 {
441 status = PDFHummus::eFailure;
442 break;
443 }
444 }
445
446 return EStatusCodeAndUShortList(status,result);
447 }
448
449
450