1 //
2 // VMime library (http://www.vmime.org)
3 // Copyright (C) 2002-2013 Vincent Richard <vincent@vmime.org>
4 //
5 // This program is free software; you can redistribute it and/or
6 // modify it under the terms of the GNU General Public License as
7 // published by the Free Software Foundation; either version 3 of
8 // the License, or (at your option) any later version.
9 //
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 // General Public License for more details.
14 //
15 // You should have received a copy of the GNU General Public License along
16 // with this program; if not, write to the Free Software Foundation, Inc.,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 //
19 // Linking this library statically or dynamically with other modules is making
20 // a combined work based on this library.  Thus, the terms and conditions of
21 // the GNU General Public License cover the whole combination.
22 //
23 
24 #include "vmime/config.hpp"
25 
26 
27 #if VMIME_CHARSETCONV_LIB_IS_ICU
28 
29 
30 #include "vmime/charsetConverter_icu.hpp"
31 
32 #include "vmime/exception.hpp"
33 #include "vmime/utility/inputStreamStringAdapter.hpp"
34 #include "vmime/utility/outputStreamStringAdapter.hpp"
35 
36 
37 extern "C"
38 {
39 #ifndef VMIME_BUILDING_DOC
40 
41 	#include <unicode/ucnv.h>
42 	#include <unicode/ucnv_err.h>
43 
44 #endif // VMIME_BUILDING_DOC
45 }
46 
47 
48 #include <unicode/unistr.h>
49 
50 
51 namespace vmime
52 {
53 
54 
55 // static
createGenericConverter(const charset & source,const charset & dest,const charsetConverterOptions & opts)56 shared_ptr <charsetConverter> charsetConverter::createGenericConverter
57 	(const charset& source, const charset& dest,
58 	 const charsetConverterOptions& opts)
59 {
60 	return make_shared <charsetConverter_icu>(source, dest, opts);
61 }
62 
63 
charsetConverter_icu(const charset & source,const charset & dest,const charsetConverterOptions & opts)64 charsetConverter_icu::charsetConverter_icu
65 	(const charset& source, const charset& dest, const charsetConverterOptions& opts)
66 	: m_from(NULL), m_to(NULL), m_source(source), m_dest(dest), m_options(opts)
67 {
68 	UErrorCode err = U_ZERO_ERROR;
69 	m_from = ucnv_open(source.getName().c_str(), &err);
70 
71 	if (!U_SUCCESS(err))
72 	{
73 		throw exceptions::charset_conv_error
74 			("Cannot initialize ICU converter for source charset '" + source.getName() + "' (error code: " + u_errorName(err) + ".");
75 	}
76 
77 	m_to = ucnv_open(dest.getName().c_str(), &err);
78 
79 	if (!U_SUCCESS(err))
80 	{
81 		throw exceptions::charset_conv_error
82 			("Cannot initialize ICU converter for destination charset '" + dest.getName() + "' (error code: " + u_errorName(err) + ".");
83 	}
84 }
85 
86 
~charsetConverter_icu()87 charsetConverter_icu::~charsetConverter_icu()
88 {
89 	if (m_from) ucnv_close(m_from);
90 	if (m_to) ucnv_close(m_to);
91 }
92 
93 
convert(utility::inputStream & in,utility::outputStream & out,status * st)94 void charsetConverter_icu::convert
95 	(utility::inputStream& in, utility::outputStream& out, status* st)
96 {
97 	UErrorCode err = U_ZERO_ERROR;
98 
99 	ucnv_reset(m_from);
100 	ucnv_reset(m_to);
101 
102 	if (st)
103 		new (st) status();
104 
105 	// From buffers
106 	byte_t cpInBuffer[16]; // stream data put here
107 	const size_t outSize = ucnv_getMinCharSize(m_from) * sizeof(cpInBuffer) * sizeof(UChar);
108 	std::vector <UChar> uOutBuffer(outSize); // Unicode chars end up here
109 
110 	// To buffers
111 	// converted (char) data end up here
112 	const size_t cpOutBufferSz = ucnv_getMaxCharSize(m_to) * outSize;
113 	std::vector <char> cpOutBuffer(cpOutBufferSz);
114 
115 	// Tell ICU what to do when encountering an illegal byte sequence
116 	if (m_options.silentlyReplaceInvalidSequences)
117 	{
118 		// Set replacement chars for when converting from Unicode to codepage
119 		icu::UnicodeString substString(m_options.invalidSequence.c_str());
120 		ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);
121 
122 		if (U_FAILURE(err))
123 			throw exceptions::charset_conv_error("[ICU] Error when setting substitution string.");
124 	}
125 	else
126 	{
127 		// Tell ICU top stop (and return an error) on illegal byte sequences
128 		ucnv_setToUCallBack
129 			(m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);
130 
131 		if (U_FAILURE(err))
132 			throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback.");
133 
134 		ucnv_setFromUCallBack
135 			(m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);
136 
137 		if (U_FAILURE(err))
138 			throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback.");
139 	}
140 
141 	// Input data available
142 	while (!in.eof())
143 	{
144 		// Read input data into buffer
145 		size_t inLength = in.read(cpInBuffer, sizeof(cpInBuffer));
146 
147 		// Beginning of read data
148 		const char* source = reinterpret_cast <const char*>(&cpInBuffer[0]);
149 		const char* sourceLimit = source + inLength; // end + 1
150 
151 		UBool flush = in.eof();  // is this last run?
152 
153 		UErrorCode toErr;
154 
155 		// Loop until all source has been processed
156 		do
157 		{
158 			// Set up target pointers
159 			UChar* target = &uOutBuffer[0];
160 			UChar* targetLimit = &target[0] + outSize;
161 
162 			toErr = U_ZERO_ERROR;
163 			ucnv_toUnicode(m_from, &target, targetLimit,
164 			               &source, sourceLimit, NULL, flush, &toErr);
165 
166 			if (st)
167 				st->inputBytesRead += (source - reinterpret_cast <const char*>(&cpInBuffer[0]));
168 
169 			if (toErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(toErr))
170 			{
171 				if (toErr == U_INVALID_CHAR_FOUND ||
172 				    toErr == U_TRUNCATED_CHAR_FOUND ||
173 				    toErr == U_ILLEGAL_CHAR_FOUND)
174 				{
175 					// Error will be thrown later (*)
176 				}
177 				else
178 				{
179 					throw exceptions::charset_conv_error("[ICU] Error converting to Unicode from " + m_source.getName());
180 				}
181 			}
182 
183 			// The Unicode source is the buffer just written and the limit
184 			// is where the previous conversion stopped (target is moved in the conversion)
185 			const UChar* uSource = &uOutBuffer[0];
186 			UChar* uSourceLimit = &target[0];
187 			UErrorCode fromErr;
188 
189 			// Loop until converted chars are fully written
190 			do
191 			{
192 				char* cpTarget = &cpOutBuffer[0];
193 				const char* cpTargetLimit = &cpOutBuffer[0] + cpOutBufferSz;
194 
195 				fromErr = U_ZERO_ERROR;
196 
197 				// Write converted bytes (Unicode) to destination codepage
198 				ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit,
199 				                 &uSource, uSourceLimit, NULL, flush, &fromErr);
200 
201 				if (st)
202 				{
203 					// Decrement input bytes count by the number of input bytes in error
204 					char errBytes[16];
205 					int8_t errBytesLen = sizeof(errBytes);
206 					UErrorCode errBytesErr = U_ZERO_ERROR;
207 
208 	 				ucnv_getInvalidChars(m_from, errBytes, &errBytesLen, &errBytesErr);
209 
210 					st->inputBytesRead -= errBytesLen;
211 					st->outputBytesWritten += cpTarget - &cpOutBuffer[0];
212 				}
213 
214 				// (*) If an error occured while converting from input charset, throw it now
215 				if (toErr == U_INVALID_CHAR_FOUND ||
216 				    toErr == U_TRUNCATED_CHAR_FOUND ||
217 				    toErr == U_ILLEGAL_CHAR_FOUND)
218 				{
219 					throw exceptions::illegal_byte_sequence_for_charset();
220 				}
221 
222 				if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr))
223 				{
224 					if (fromErr == U_INVALID_CHAR_FOUND ||
225 					    fromErr == U_TRUNCATED_CHAR_FOUND ||
226 					    fromErr == U_ILLEGAL_CHAR_FOUND)
227 					{
228 						throw exceptions::illegal_byte_sequence_for_charset();
229 					}
230 					else
231 					{
232 						throw exceptions::charset_conv_error("[ICU] Error converting from Unicode to " + m_dest.getName());
233 					}
234 				}
235 
236 				// Write to destination stream
237 				out.write(&cpOutBuffer[0], (cpTarget - &cpOutBuffer[0]));
238 
239 			} while (fromErr == U_BUFFER_OVERFLOW_ERROR);
240 
241 		} while (toErr == U_BUFFER_OVERFLOW_ERROR);
242 	}
243 }
244 
245 
convert(const string & in,string & out,status * st)246 void charsetConverter_icu::convert(const string& in, string& out, status* st)
247 {
248 	if (st)
249 		new (st) status();
250 
251 	out.clear();
252 
253 	utility::inputStreamStringAdapter is(in);
254 	utility::outputStreamStringAdapter os(out);
255 
256 	convert(is, os, st);
257 
258 	os.flush();
259 }
260 
261 
262 shared_ptr <utility::charsetFilteredOutputStream>
getFilteredOutputStream(utility::outputStream & os,const charsetConverterOptions & opts)263 	charsetConverter_icu::getFilteredOutputStream
264 		(utility::outputStream& os, const charsetConverterOptions& opts)
265 {
266 	return make_shared <utility::charsetFilteredOutputStream_icu>(m_source, m_dest, &os, opts);
267 }
268 
269 
270 
271 // charsetFilteredOutputStream_icu
272 
273 namespace utility {
274 
275 
charsetFilteredOutputStream_icu(const charset & source,const charset & dest,outputStream * os,const charsetConverterOptions & opts)276 charsetFilteredOutputStream_icu::charsetFilteredOutputStream_icu
277 	(const charset& source, const charset& dest, outputStream* os,
278 	 const charsetConverterOptions& opts)
279 	: m_from(NULL), m_to(NULL), m_sourceCharset(source),
280 	  m_destCharset(dest), m_stream(*os), m_options(opts)
281 {
282 	UErrorCode err = U_ZERO_ERROR;
283 	m_from = ucnv_open(source.getName().c_str(), &err);
284 
285 	if (!U_SUCCESS(err))
286 	{
287 		throw exceptions::charset_conv_error
288 			("Cannot initialize ICU converter for source charset '" + source.getName() + "' (error code: " + u_errorName(err) + ".");
289 	}
290 
291 	m_to = ucnv_open(dest.getName().c_str(), &err);
292 
293 	if (!U_SUCCESS(err))
294 	{
295 		throw exceptions::charset_conv_error
296 			("Cannot initialize ICU converter for destination charset '" + dest.getName() + "' (error code: " + u_errorName(err) + ".");
297 	}
298 
299 	// Tell ICU what to do when encountering an illegal byte sequence
300 	if (m_options.silentlyReplaceInvalidSequences)
301 	{
302 		// Set replacement chars for when converting from Unicode to codepage
303 		icu::UnicodeString substString(m_options.invalidSequence.c_str());
304 		ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);
305 
306 		if (U_FAILURE(err))
307 			throw exceptions::charset_conv_error("[ICU] Error when setting substitution string.");
308 	}
309 	else
310 	{
311 		// Tell ICU top stop (and return an error) on illegal byte sequences
312 		ucnv_setToUCallBack
313 			(m_to, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);
314 
315 		if (U_FAILURE(err))
316 			throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback.");
317 
318 		ucnv_setFromUCallBack
319 			(m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);
320 
321 		if (U_FAILURE(err))
322 			throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback.");
323 	}
324 }
325 
326 
~charsetFilteredOutputStream_icu()327 charsetFilteredOutputStream_icu::~charsetFilteredOutputStream_icu()
328 {
329 	if (m_from) ucnv_close(m_from);
330 	if (m_to) ucnv_close(m_to);
331 }
332 
333 
getNextOutputStream()334 outputStream& charsetFilteredOutputStream_icu::getNextOutputStream()
335 {
336 	return m_stream;
337 }
338 
339 
writeImpl(const byte_t * const data,const size_t count)340 void charsetFilteredOutputStream_icu::writeImpl
341 	(const byte_t* const data, const size_t count)
342 {
343 	if (m_from == NULL || m_to == NULL)
344 		throw exceptions::charset_conv_error("Cannot initialize converters.");
345 
346 	// Allocate buffer for Unicode chars
347 	const size_t uniSize = ucnv_getMinCharSize(m_from) * count * sizeof(UChar);
348 	std::vector <UChar> uniBuffer(uniSize);
349 
350 	// Conversion loop
351 	UErrorCode toErr = U_ZERO_ERROR;
352 
353 	const char* uniSource = reinterpret_cast <const char*>(data);
354 	const char* uniSourceLimit = uniSource + count;
355 
356 	do
357 	{
358 		// Convert from source charset to Unicode
359 		UChar* uniTarget = &uniBuffer[0];
360 		UChar* uniTargetLimit = &uniBuffer[0] + uniSize;
361 
362 		toErr = U_ZERO_ERROR;
363 
364 		ucnv_toUnicode(m_from, &uniTarget, uniTargetLimit,
365 		               &uniSource, uniSourceLimit, NULL, /* flush */ FALSE, &toErr);
366 
367 		if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR)
368 		{
369 			if (toErr == U_INVALID_CHAR_FOUND ||
370 			    toErr == U_TRUNCATED_CHAR_FOUND ||
371 			    toErr == U_ILLEGAL_CHAR_FOUND)
372 			{
373 				throw exceptions::illegal_byte_sequence_for_charset();
374 			}
375 			else
376 			{
377 				throw exceptions::charset_conv_error
378 					("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'.");
379 			}
380 		}
381 
382 		const size_t uniLength = uniTarget - &uniBuffer[0];
383 
384 		// Allocate buffer for destination charset
385 		const size_t cpSize = ucnv_getMinCharSize(m_to) * uniLength;
386 		std::vector <char> cpBuffer(cpSize);
387 
388 		// Convert from Unicode to destination charset
389 		UErrorCode fromErr = U_ZERO_ERROR;
390 
391 		const UChar* cpSource = &uniBuffer[0];
392 		const UChar* cpSourceLimit = &uniBuffer[0] + uniLength;
393 
394 		do
395 		{
396 			char* cpTarget = &cpBuffer[0];
397 			char* cpTargetLimit = &cpBuffer[0] + cpSize;
398 
399 			fromErr = U_ZERO_ERROR;
400 
401 			ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit,
402 							 &cpSource, cpSourceLimit, NULL, /* flush */ FALSE, &fromErr);
403 
404 			if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr))
405 			{
406 				if (fromErr == U_INVALID_CHAR_FOUND ||
407 				    fromErr == U_TRUNCATED_CHAR_FOUND ||
408 				    fromErr == U_ILLEGAL_CHAR_FOUND)
409 				{
410 					throw exceptions::illegal_byte_sequence_for_charset();
411 				}
412 				else
413 				{
414 					throw exceptions::charset_conv_error
415 						("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'.");
416 				}
417 			}
418 
419 			const size_t cpLength = cpTarget - &cpBuffer[0];
420 
421 			// Write successfully converted bytes
422 			m_stream.write(&cpBuffer[0], cpLength);
423 
424 		} while (fromErr == U_BUFFER_OVERFLOW_ERROR);
425 
426 	} while (toErr == U_BUFFER_OVERFLOW_ERROR);
427 }
428 
429 
flush()430 void charsetFilteredOutputStream_icu::flush()
431 {
432 	if (m_from == NULL || m_to == NULL)
433 		throw exceptions::charset_conv_error("Cannot initialize converters.");
434 
435 	// Allocate buffer for Unicode chars
436 	const size_t uniSize = ucnv_getMinCharSize(m_from) * 1024 * sizeof(UChar);
437 	std::vector <UChar> uniBuffer(uniSize);
438 
439 	// Conversion loop (with flushing)
440 	UErrorCode toErr = U_ZERO_ERROR;
441 
442 	const char* uniSource = 0;
443 	const char* uniSourceLimit = 0;
444 
445 	do
446 	{
447 		// Convert from source charset to Unicode
448 		UChar* uniTarget = &uniBuffer[0];
449 		UChar* uniTargetLimit = &uniBuffer[0] + uniSize;
450 
451 		toErr = U_ZERO_ERROR;
452 
453 		ucnv_toUnicode(m_from, &uniTarget, uniTargetLimit,
454 		               &uniSource, uniSourceLimit, NULL, /* flush */ TRUE, &toErr);
455 
456 		if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR)
457 		{
458 			throw exceptions::charset_conv_error
459 				("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'.");
460 		}
461 
462 		const size_t uniLength = uniTarget - &uniBuffer[0];
463 
464 		// Allocate buffer for destination charset
465 		const size_t cpSize = ucnv_getMinCharSize(m_to) * uniLength;
466 		std::vector <char> cpBuffer(cpSize);
467 
468 		// Convert from Unicode to destination charset
469 		UErrorCode fromErr = U_ZERO_ERROR;
470 
471 		const UChar* cpSource = &uniBuffer[0];
472 		const UChar* cpSourceLimit = &uniBuffer[0] + uniLength;
473 
474 		do
475 		{
476 			char* cpTarget = &cpBuffer[0];
477 			char* cpTargetLimit = &cpBuffer[0] + cpSize;
478 
479 			fromErr = U_ZERO_ERROR;
480 
481 			ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit,
482 							 &cpSource, cpSourceLimit, NULL, /* flush */ TRUE, &fromErr);
483 
484 			if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr))
485 			{
486 				throw exceptions::charset_conv_error
487 					("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'.");
488 			}
489 
490 			const size_t cpLength = cpTarget - &cpBuffer[0];
491 
492 			// Write successfully converted bytes
493 			m_stream.write(&cpBuffer[0], cpLength);
494 
495 		} while (fromErr == U_BUFFER_OVERFLOW_ERROR);
496 
497 	} while (toErr == U_BUFFER_OVERFLOW_ERROR);
498 
499 	m_stream.flush();
500 }
501 
502 
503 } // utility
504 
505 
506 } // vmime
507 
508 
509 #endif // VMIME_CHARSETCONV_LIB_IS_ICU
510