1 /*
2    Source File : UnicodeString.cpp
3 
4 
5    Copyright 2011 Gal Kahana PDFWriter
6 
7    Licensed under the Apache License, Version 2.0 (the "License");
8    you may not use this file except in compliance with the License.
9    You may obtain a copy of the License at
10 
11        http://www.apache.org/licenses/LICENSE-2.0
12 
13    Unless required by applicable law or agreed to in writing, software
14    distributed under the License is distributed on an "AS IS" BASIS,
15    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16    See the License for the specific language governing permissions and
17    limitations under the License.
18 
19 
20 */
21 #include "UnicodeString.h"
22 #include "Trace.h"
23 #include <sstream>
24 
25 using namespace PDFHummus;
26 
UnicodeString(void)27 UnicodeString::UnicodeString(void)
28 {
29 }
30 
~UnicodeString(void)31 UnicodeString::~UnicodeString(void)
32 {
33 }
34 
// Copy constructor: takes a copy of the other string's unicode character list.
UnicodeString::UnicodeString(const UnicodeString& inOtherString)
	: mUnicodeCharacters(inOtherString.mUnicodeCharacters)
{
}
39 
UnicodeString(const ULongList & inOtherList)40 UnicodeString::UnicodeString(const ULongList& inOtherList)
41 {
42 	mUnicodeCharacters = inOtherList;
43 }
44 
// Copy assignment: replaces the held characters with a copy of inOtherString's.
// Returns a reference to this object so assignments can be chained.
UnicodeString& UnicodeString::operator =(const UnicodeString& inOtherString)
{
	const ULongList& sourceCharacters = inOtherString.mUnicodeCharacters;

	mUnicodeCharacters = sourceCharacters;
	return *this;
}
50 
operator =(const ULongList & inOtherList)51 UnicodeString& UnicodeString::operator =(const ULongList& inOtherList)
52 {
53 	mUnicodeCharacters = inOtherList;
54 	return *this;
55 }
56 
operator ==(const UnicodeString & inOtherString) const57 bool UnicodeString::operator==(const UnicodeString& inOtherString) const
58 {
59 	return mUnicodeCharacters == inOtherString.mUnicodeCharacters;
60 }
61 
62 
GetUnicodeList() const63 const ULongList& UnicodeString::GetUnicodeList() const
64 {
65 	return mUnicodeCharacters;
66 }
67 
GetUnicodeList()68 ULongList& UnicodeString::GetUnicodeList()
69 {
70 	return mUnicodeCharacters;
71 }
72 
FromUTF8(const std::string & inString)73 EStatusCode UnicodeString::FromUTF8(const std::string& inString)
74 {
75 	mUnicodeCharacters.clear();
76 	std::string::const_iterator it = inString.begin();
77 	EStatusCode status = PDFHummus::eSuccess;
78 	unsigned long unicodeCharacter;
79 
80 
81 	for(; it != inString.end() && PDFHummus::eSuccess == status; ++it)
82 	{
83 		if((unsigned char)*it <= 0x7F)
84 		{
85 			unicodeCharacter = (unsigned char)*it;
86 		}
87 		else if(((unsigned char)*it>>5) == 0x6) // 2 bytes encoding
88 		{
89 			unicodeCharacter = (unsigned char)*it & 0x1F;
90 			++it;
91 			if(it == inString.end() || ((unsigned char)*it>>6 != 0x2))
92 			{
93 				status = PDFHummus::eFailure;
94 				break;
95 			}
96 
97 			unicodeCharacter = (unicodeCharacter << 6) | ((unsigned char)*it & 0x3F);
98 		}
99 		else if(((unsigned char)*it>>4) == 0xE) // 3 bytes encoding
100 		{
101 			unicodeCharacter = (unsigned char)*it & 0xF;
102 			for(int i =0 ; i < 2 && PDFHummus::eSuccess == status; ++i)
103 			{
104 				++it;
105 				if(it == inString.end() || ((unsigned char)*it>>6 != 0x2))
106 				{
107 					status = PDFHummus::eFailure;
108 					break;
109 				}
110 				unicodeCharacter = (unicodeCharacter << 6) | ((unsigned char)*it & 0x3F);
111 			}
112 			if(status != PDFHummus::eSuccess)
113 				break;
114 		}
115 		else if(((unsigned char)*it>>3) == 0x1E) // 4 bytes encoding
116 		{
117 			unicodeCharacter = (unsigned char)*it & 0x7;
118 			for(int i =0 ; i < 3 && PDFHummus::eSuccess == status; ++i)
119 			{
120 				++it;
121 				if(it == inString.end() || ((unsigned char)*it>>6 != 0x2))
122 				{
123 					status = PDFHummus::eFailure;
124 					break;
125 				}
126 				unicodeCharacter = (unicodeCharacter << 6) | ((unsigned char)*it & 0x3F);
127 			}
128 			if(status != PDFHummus::eSuccess)
129 				break;
130 		}
131 		else
132 		{
133 			status = PDFHummus::eFailure;
134 			break;
135 		}
136 
137 		mUnicodeCharacters.push_back(unicodeCharacter);
138 	}
139 
140 	return status;
141 }
142 
ToUTF8() const143 EStatusCodeAndString UnicodeString::ToUTF8() const
144 {
145 	ULongList::const_iterator it = mUnicodeCharacters.begin();
146 	EStatusCode status = PDFHummus::eSuccess;
147 	std::stringstream result;
148 
149 	for(; it != mUnicodeCharacters.end() && PDFHummus::eSuccess == status; ++it)
150 	{
151 		// Encode Unicode to UTF8
152 		if(*it <= 0x7F)
153 		{
154 			result.put((unsigned char)*it);
155 		}
156 		else if(0x7F < *it && *it <= 0x7FF)
157 		{
158 			result.put((unsigned char)((0xC0 | (*it>>6))));
159 			result.put((unsigned char)(0x80 | (*it & 0x3F)));
160 		}
161 		else if(0x7FF < *it && *it <= 0xFFFF)
162 		{
163 			result.put((unsigned char)(0xE0 | (*it>>12)));
164 			result.put((unsigned char)(0x80 | ((*it>>6) & 0x3F)));
165 			result.put((unsigned char)(0x80 | (*it & 0x3F)));
166 		}
167 		else if(0xFFFF < *it && *it <= 0x10FFFF)
168 		{
169 			result.put((unsigned char)(0xF0 | (*it>>18)));
170 			result.put((unsigned char)(0x80 | ((*it>>12) & 0x3F)));
171 			result.put((unsigned char)(0x80 | ((*it>>6) & 0x3F)));
172 			result.put((unsigned char)(0x80 | (*it & 0x3F)));
173 		}
174 		else
175 		{
176 			TRACE_LOG("UnicodeString::ToUTF8, contains unicode characters that cannot be coded into UTF8");
177 			status = PDFHummus::eFailure;
178 		}
179 	}
180 
181 	return EStatusCodeAndString(status,result.str());
182 }
183 
FromUTF16(const std::string & inString)184 EStatusCode UnicodeString::FromUTF16(const std::string& inString)
185 {
186 	return FromUTF16((const unsigned char*)inString.c_str(),(unsigned long)inString.length());
187 }
188 
FromUTF16(const unsigned char * inString,unsigned long inLength)189 EStatusCode UnicodeString::FromUTF16(const unsigned char* inString, unsigned long inLength)
190 {
191 	// Read BOM
192 	if(inLength < 2)
193 		return PDFHummus::eFailure;
194 
195 	if(inString[0] == 0xFE && inString[1] == 0xFF)
196 		return FromUTF16BE(inString+2,inLength-2);
197 	else if(inString[0] == 0xFF && inString[1] == 0xFE)
198 		return FromUTF16LE(inString+2,inLength-2);
199 	else
200 		return PDFHummus::eFailure; // no bom
201 }
202 
FromUTF16BE(const std::string & inString)203 EStatusCode UnicodeString::FromUTF16BE(const std::string& inString)
204 {
205 	return FromUTF16BE((const unsigned char*)inString.c_str(),(unsigned long)inString.length());
206 }
207 
FromUTF16BE(const unsigned char * inString,unsigned long inLength)208 EStatusCode UnicodeString::FromUTF16BE(const unsigned char* inString, unsigned long inLength)
209 {
210 	mUnicodeCharacters.clear();
211 	EStatusCode status = PDFHummus::eSuccess;
212 
213 	if(inLength % 2 != 0)
214 	{
215 		TRACE_LOG("UnicodeString::FromUTF16BE, invalid UTF16 string, has odd numbers of characters");
216 		return PDFHummus::eFailure;
217 	}
218 
219 	for(unsigned long i = 0; i < inLength-1 && PDFHummus::eSuccess == status; i+=2)
220 	{
221 		unsigned short buffer = (((unsigned short)inString[i])<<8) + inString[i+1];
222 
223 		if(0xD800 <= buffer && buffer <= 0xDBFF)
224 		{
225 			// Aha! high surrogate! this means that this character requires 2 w_chars
226 			unsigned short highSurrogate = buffer;
227 			i+=2;
228 			if(i>=inLength-1)
229 			{
230 				TRACE_LOG("UnicodeString::FromUTF16BE, fault string - high surrogat encountered without a low surrogate");
231 				status = PDFHummus::eFailure;
232 				break;
233 			}
234 
235 			unsigned short buffer = (((unsigned short)inString[i])<<8) + inString[i+1];
236 			if(0xDC00 > buffer|| buffer > 0xDFFF)
237 			{
238 				TRACE_LOG("UnicodeString::FromUTF16BE, fault string - high surrogat encountered without a low surrogate");
239 				status = PDFHummus::eFailure;
240 				break;
241 			}
242 
243 			unsigned short lowSurrogate = buffer;
244 
245 			mUnicodeCharacters.push_back(0x10000 + ((highSurrogate - 0xD800) << 5) + (lowSurrogate - 0xDC00));
246 		}
247 		else
248 			mUnicodeCharacters.push_back(buffer);
249 	}
250 
251 	return status;
252 }
253 
FromUTF16LE(const std::string & inString)254 EStatusCode UnicodeString::FromUTF16LE(const std::string& inString)
255 {
256 	return FromUTF16LE((const unsigned char*)inString.c_str(),(unsigned long)inString.length());
257 }
258 
259 
FromUTF16LE(const unsigned char * inString,unsigned long inLength)260 EStatusCode UnicodeString::FromUTF16LE(const unsigned char* inString, unsigned long inLength)
261 {
262 	mUnicodeCharacters.clear();
263 	EStatusCode status = PDFHummus::eSuccess;
264 
265 	if(inLength % 2 != 0)
266 	{
267 		TRACE_LOG("UnicodeString::FromUTF16LE, invalid UTF16 string, has odd numbers of characters");
268 		return PDFHummus::eFailure;
269 	}
270 
271 	for(unsigned long i = 0; i < inLength-1 && PDFHummus::eSuccess == status; i+=2)
272 	{
273 		unsigned short buffer = (((unsigned short)inString[i+1])<<8) + inString[i];
274 
275 		if(0xD800 <= buffer && buffer <= 0xDBFF)
276 		{
277 			// Aha! high surrogate! this means that this character requires 2 w_chars
278 			unsigned short highSurrogate = buffer;
279 			i+=2;
280 			if(i>=inLength-1)
281 			{
282 				TRACE_LOG("UnicodeString::FromUTF16LE, fault string - high surrogat encountered without a low surrogate");
283 				status = PDFHummus::eFailure;
284 				break;
285 			}
286 
287 			unsigned short buffer = (((unsigned short)inString[i+1])<<8) + inString[i];
288 			if(0xDC00 > buffer|| buffer > 0xDFFF)
289 			{
290 				TRACE_LOG("UnicodeString::FromUTF16LE, fault string - high surrogat encountered without a low surrogate");
291 				status = PDFHummus::eFailure;
292 				break;
293 			}
294 
295 			unsigned short lowSurrogate = buffer;
296 
297 			mUnicodeCharacters.push_back(0x10000 + ((highSurrogate - 0xD800) << 5) + (lowSurrogate - 0xDC00));
298 		}
299 		else
300 			mUnicodeCharacters.push_back(buffer);
301 	}
302 
303 	return status;
304 }
305 
FromUTF16UShort(const unsigned short * inShorts,unsigned long inLength)306 EStatusCode UnicodeString::FromUTF16UShort(const unsigned short* inShorts, unsigned long inLength)
307 {
308 	mUnicodeCharacters.clear();
309 	EStatusCode status = PDFHummus::eSuccess;
310 
311 	for(unsigned long i = 0; i < inLength && PDFHummus::eSuccess == status; ++i)
312 	{
313 		if(0xD800 <= inShorts[i] && inShorts[i] <= 0xDBFF)
314 		{
315 			// Aha! high surrogate! this means that this character requires 2 w_chars
316 			++i;
317 			if(i>=inLength)
318 			{
319 				TRACE_LOG("UnicodeString::FromUTF16UShort, fault string - high surrogat encountered without a low surrogate");
320 				status = PDFHummus::eFailure;
321 				break;
322 			}
323 
324 			if(0xDC00 > inShorts[i] || inShorts[i] > 0xDFFF)
325 			{
326 				TRACE_LOG("UnicodeString::FromUTF16UShort, fault string - high surrogat encountered without a low surrogate");
327 				status = PDFHummus::eFailure;
328 				break;
329 			}
330 
331 			mUnicodeCharacters.push_back(0x10000 + ((inShorts[i-1] - 0xD800) << 5) + (inShorts[i] - 0xDC00));
332 		}
333 		else
334 			mUnicodeCharacters.push_back(inShorts[i]);
335 	}
336 
337 	return status;
338 }
339 
340 
ToUTF16BE(bool inPrependWithBom) const341 EStatusCodeAndString UnicodeString::ToUTF16BE(bool inPrependWithBom) const
342 {
343 	ULongList::const_iterator it = mUnicodeCharacters.begin();
344 	EStatusCode status = PDFHummus::eSuccess;
345 	std::stringstream result;
346 
347 	if(inPrependWithBom)
348 	{
349 		result.put((unsigned char)0xFE);
350 		result.put((unsigned char)0xFF);
351 	}
352 
353 	for(; it != mUnicodeCharacters.end() && PDFHummus::eSuccess == status; ++it)
354 	{
355 		if(*it < 0xD7FF || (0xE000 < *it && *it < 0xFFFF))
356 		{
357 			result.put((unsigned char)(*it>>8));
358 			result.put((unsigned char)(*it & 0xFF));
359 		}
360 		else if(0xFFFF < *it && *it <= 0x10FFFF)
361 		{
362 			unsigned short highSurrogate = (unsigned short)(((*it - 0x10000) >> 10) + 0xD800);
363 			unsigned short lowSurrogate = (unsigned short)(((*it - 0x10000) & 0x3FF) + 0xDC00);
364 
365 			result.put((unsigned char)(highSurrogate>>8));
366 			result.put((unsigned char)(highSurrogate & 0xFF));
367 			result.put((unsigned char)(lowSurrogate>>8));
368 			result.put((unsigned char)(lowSurrogate & 0xFF));
369 		}
370 		else
371 		{
372 			status = PDFHummus::eFailure;
373 			break;
374 		}
375 	}
376 
377 	return EStatusCodeAndString(status,result.str());
378 }
379 
ToUTF16LE(bool inPrependWithBom) const380 EStatusCodeAndString UnicodeString::ToUTF16LE(bool inPrependWithBom) const
381 {
382 	ULongList::const_iterator it = mUnicodeCharacters.begin();
383 	EStatusCode status = PDFHummus::eSuccess;
384 	std::stringstream result;
385 
386 	if(inPrependWithBom)
387 	{
388 		result.put((unsigned char)0xFF);
389 		result.put((unsigned char)0xFE);
390 	}
391 
392 	for(; it != mUnicodeCharacters.end() && PDFHummus::eSuccess == status; ++it)
393 	{
394 		if(*it < 0xD7FF || (0xE000 < *it && *it < 0xFFFF))
395 		{
396 			result.put((unsigned char)(*it & 0xFF));
397 			result.put((unsigned char)(*it>>8));
398 		}
399 		else if(0xFFFF < *it && *it <= 0x10FFFF)
400 		{
401 			unsigned short highSurrogate = (unsigned short)(((*it - 0x10000) >> 10) + 0xD800);
402 			unsigned short lowSurrogate = (unsigned short)(((*it - 0x10000) & 0x3FF) + 0xDC00);
403 
404 			result.put((unsigned char)(highSurrogate & 0xFF));
405 			result.put((unsigned char)(highSurrogate>>8));
406 			result.put((unsigned char)(lowSurrogate & 0xFF));
407 			result.put((unsigned char)(lowSurrogate>>8));
408 		}
409 		else
410 		{
411 			status = PDFHummus::eFailure;
412 			break;
413 		}
414 	}
415 
416 	return EStatusCodeAndString(status,result.str());
417 }
418 
ToUTF16UShort() const419 EStatusCodeAndUShortList UnicodeString::ToUTF16UShort() const
420 {
421 	ULongList::const_iterator it = mUnicodeCharacters.begin();
422 	EStatusCode status = PDFHummus::eSuccess;
423 	UShortList result;
424 
425 	for(; it != mUnicodeCharacters.end() && PDFHummus::eSuccess == status; ++it)
426 	{
427 		if(*it < 0xD7FF || (0xE000 < *it && *it < 0xFFFF))
428 		{
429 			result.push_back((unsigned short)*it);
430 		}
431 		else if(0xFFFF < *it && *it <= 0x10FFFF)
432 		{
433 			unsigned short highSurrogate = (unsigned short)(((*it - 0x10000) >> 10) + 0xD800);
434 			unsigned short lowSurrogate = (unsigned short)(((*it - 0x10000) & 0x3FF) + 0xDC00);
435 
436 			result.push_back(highSurrogate);
437 			result.push_back(lowSurrogate);
438 		}
439 		else
440 		{
441 			status = PDFHummus::eFailure;
442 			break;
443 		}
444 	}
445 
446 	return EStatusCodeAndUShortList(status,result);
447 }
448 
449 
450