1 //
2 // VMime library (http://www.vmime.org)
3 // Copyright (C) 2002-2013 Vincent Richard <vincent@vmime.org>
4 //
5 // This program is free software; you can redistribute it and/or
6 // modify it under the terms of the GNU General Public License as
7 // published by the Free Software Foundation; either version 3 of
8 // the License, or (at your option) any later version.
9 //
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 // General Public License for more details.
14 //
15 // You should have received a copy of the GNU General Public License along
16 // with this program; if not, write to the Free Software Foundation, Inc.,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 //
19 // Linking this library statically or dynamically with other modules is making
20 // a combined work based on this library. Thus, the terms and conditions of
21 // the GNU General Public License cover the whole combination.
22 //
23
24 #include "vmime/config.hpp"
25
26
27 #if VMIME_CHARSETCONV_LIB_IS_ICU
28
29
30 #include "vmime/charsetConverter_icu.hpp"
31
32 #include "vmime/exception.hpp"
33 #include "vmime/utility/inputStreamStringAdapter.hpp"
34 #include "vmime/utility/outputStreamStringAdapter.hpp"
35
36
37 extern "C"
38 {
39 #ifndef VMIME_BUILDING_DOC
40
41 #include <unicode/ucnv.h>
42 #include <unicode/ucnv_err.h>
43
44 #endif // VMIME_BUILDING_DOC
45 }
46
47
48 #include <unicode/unistr.h>
49
50
51 namespace vmime
52 {
53
54
55 // static
createGenericConverter(const charset & source,const charset & dest,const charsetConverterOptions & opts)56 shared_ptr <charsetConverter> charsetConverter::createGenericConverter
57 (const charset& source, const charset& dest,
58 const charsetConverterOptions& opts)
59 {
60 return make_shared <charsetConverter_icu>(source, dest, opts);
61 }
62
63
charsetConverter_icu(const charset & source,const charset & dest,const charsetConverterOptions & opts)64 charsetConverter_icu::charsetConverter_icu
65 (const charset& source, const charset& dest, const charsetConverterOptions& opts)
66 : m_from(NULL), m_to(NULL), m_source(source), m_dest(dest), m_options(opts)
67 {
68 UErrorCode err = U_ZERO_ERROR;
69 m_from = ucnv_open(source.getName().c_str(), &err);
70
71 if (!U_SUCCESS(err))
72 {
73 throw exceptions::charset_conv_error
74 ("Cannot initialize ICU converter for source charset '" + source.getName() + "' (error code: " + u_errorName(err) + ".");
75 }
76
77 m_to = ucnv_open(dest.getName().c_str(), &err);
78
79 if (!U_SUCCESS(err))
80 {
81 throw exceptions::charset_conv_error
82 ("Cannot initialize ICU converter for destination charset '" + dest.getName() + "' (error code: " + u_errorName(err) + ".");
83 }
84 }
85
86
~charsetConverter_icu()87 charsetConverter_icu::~charsetConverter_icu()
88 {
89 if (m_from) ucnv_close(m_from);
90 if (m_to) ucnv_close(m_to);
91 }
92
93
convert(utility::inputStream & in,utility::outputStream & out,status * st)94 void charsetConverter_icu::convert
95 (utility::inputStream& in, utility::outputStream& out, status* st)
96 {
97 UErrorCode err = U_ZERO_ERROR;
98
99 ucnv_reset(m_from);
100 ucnv_reset(m_to);
101
102 if (st)
103 new (st) status();
104
105 // From buffers
106 byte_t cpInBuffer[16]; // stream data put here
107 const size_t outSize = ucnv_getMinCharSize(m_from) * sizeof(cpInBuffer) * sizeof(UChar);
108 std::vector <UChar> uOutBuffer(outSize); // Unicode chars end up here
109
110 // To buffers
111 // converted (char) data end up here
112 const size_t cpOutBufferSz = ucnv_getMaxCharSize(m_to) * outSize;
113 std::vector <char> cpOutBuffer(cpOutBufferSz);
114
115 // Tell ICU what to do when encountering an illegal byte sequence
116 if (m_options.silentlyReplaceInvalidSequences)
117 {
118 // Set replacement chars for when converting from Unicode to codepage
119 icu::UnicodeString substString(m_options.invalidSequence.c_str());
120 ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);
121
122 if (U_FAILURE(err))
123 throw exceptions::charset_conv_error("[ICU] Error when setting substitution string.");
124 }
125 else
126 {
127 // Tell ICU top stop (and return an error) on illegal byte sequences
128 ucnv_setToUCallBack
129 (m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);
130
131 if (U_FAILURE(err))
132 throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback.");
133
134 ucnv_setFromUCallBack
135 (m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);
136
137 if (U_FAILURE(err))
138 throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback.");
139 }
140
141 // Input data available
142 while (!in.eof())
143 {
144 // Read input data into buffer
145 size_t inLength = in.read(cpInBuffer, sizeof(cpInBuffer));
146
147 // Beginning of read data
148 const char* source = reinterpret_cast <const char*>(&cpInBuffer[0]);
149 const char* sourceLimit = source + inLength; // end + 1
150
151 UBool flush = in.eof(); // is this last run?
152
153 UErrorCode toErr;
154
155 // Loop until all source has been processed
156 do
157 {
158 // Set up target pointers
159 UChar* target = &uOutBuffer[0];
160 UChar* targetLimit = &target[0] + outSize;
161
162 toErr = U_ZERO_ERROR;
163 ucnv_toUnicode(m_from, &target, targetLimit,
164 &source, sourceLimit, NULL, flush, &toErr);
165
166 if (st)
167 st->inputBytesRead += (source - reinterpret_cast <const char*>(&cpInBuffer[0]));
168
169 if (toErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(toErr))
170 {
171 if (toErr == U_INVALID_CHAR_FOUND ||
172 toErr == U_TRUNCATED_CHAR_FOUND ||
173 toErr == U_ILLEGAL_CHAR_FOUND)
174 {
175 // Error will be thrown later (*)
176 }
177 else
178 {
179 throw exceptions::charset_conv_error("[ICU] Error converting to Unicode from " + m_source.getName());
180 }
181 }
182
183 // The Unicode source is the buffer just written and the limit
184 // is where the previous conversion stopped (target is moved in the conversion)
185 const UChar* uSource = &uOutBuffer[0];
186 UChar* uSourceLimit = &target[0];
187 UErrorCode fromErr;
188
189 // Loop until converted chars are fully written
190 do
191 {
192 char* cpTarget = &cpOutBuffer[0];
193 const char* cpTargetLimit = &cpOutBuffer[0] + cpOutBufferSz;
194
195 fromErr = U_ZERO_ERROR;
196
197 // Write converted bytes (Unicode) to destination codepage
198 ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit,
199 &uSource, uSourceLimit, NULL, flush, &fromErr);
200
201 if (st)
202 {
203 // Decrement input bytes count by the number of input bytes in error
204 char errBytes[16];
205 int8_t errBytesLen = sizeof(errBytes);
206 UErrorCode errBytesErr = U_ZERO_ERROR;
207
208 ucnv_getInvalidChars(m_from, errBytes, &errBytesLen, &errBytesErr);
209
210 st->inputBytesRead -= errBytesLen;
211 st->outputBytesWritten += cpTarget - &cpOutBuffer[0];
212 }
213
214 // (*) If an error occured while converting from input charset, throw it now
215 if (toErr == U_INVALID_CHAR_FOUND ||
216 toErr == U_TRUNCATED_CHAR_FOUND ||
217 toErr == U_ILLEGAL_CHAR_FOUND)
218 {
219 throw exceptions::illegal_byte_sequence_for_charset();
220 }
221
222 if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr))
223 {
224 if (fromErr == U_INVALID_CHAR_FOUND ||
225 fromErr == U_TRUNCATED_CHAR_FOUND ||
226 fromErr == U_ILLEGAL_CHAR_FOUND)
227 {
228 throw exceptions::illegal_byte_sequence_for_charset();
229 }
230 else
231 {
232 throw exceptions::charset_conv_error("[ICU] Error converting from Unicode to " + m_dest.getName());
233 }
234 }
235
236 // Write to destination stream
237 out.write(&cpOutBuffer[0], (cpTarget - &cpOutBuffer[0]));
238
239 } while (fromErr == U_BUFFER_OVERFLOW_ERROR);
240
241 } while (toErr == U_BUFFER_OVERFLOW_ERROR);
242 }
243 }
244
245
convert(const string & in,string & out,status * st)246 void charsetConverter_icu::convert(const string& in, string& out, status* st)
247 {
248 if (st)
249 new (st) status();
250
251 out.clear();
252
253 utility::inputStreamStringAdapter is(in);
254 utility::outputStreamStringAdapter os(out);
255
256 convert(is, os, st);
257
258 os.flush();
259 }
260
261
262 shared_ptr <utility::charsetFilteredOutputStream>
getFilteredOutputStream(utility::outputStream & os,const charsetConverterOptions & opts)263 charsetConverter_icu::getFilteredOutputStream
264 (utility::outputStream& os, const charsetConverterOptions& opts)
265 {
266 return make_shared <utility::charsetFilteredOutputStream_icu>(m_source, m_dest, &os, opts);
267 }
268
269
270
271 // charsetFilteredOutputStream_icu
272
273 namespace utility {
274
275
charsetFilteredOutputStream_icu(const charset & source,const charset & dest,outputStream * os,const charsetConverterOptions & opts)276 charsetFilteredOutputStream_icu::charsetFilteredOutputStream_icu
277 (const charset& source, const charset& dest, outputStream* os,
278 const charsetConverterOptions& opts)
279 : m_from(NULL), m_to(NULL), m_sourceCharset(source),
280 m_destCharset(dest), m_stream(*os), m_options(opts)
281 {
282 UErrorCode err = U_ZERO_ERROR;
283 m_from = ucnv_open(source.getName().c_str(), &err);
284
285 if (!U_SUCCESS(err))
286 {
287 throw exceptions::charset_conv_error
288 ("Cannot initialize ICU converter for source charset '" + source.getName() + "' (error code: " + u_errorName(err) + ".");
289 }
290
291 m_to = ucnv_open(dest.getName().c_str(), &err);
292
293 if (!U_SUCCESS(err))
294 {
295 throw exceptions::charset_conv_error
296 ("Cannot initialize ICU converter for destination charset '" + dest.getName() + "' (error code: " + u_errorName(err) + ".");
297 }
298
299 // Tell ICU what to do when encountering an illegal byte sequence
300 if (m_options.silentlyReplaceInvalidSequences)
301 {
302 // Set replacement chars for when converting from Unicode to codepage
303 icu::UnicodeString substString(m_options.invalidSequence.c_str());
304 ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);
305
306 if (U_FAILURE(err))
307 throw exceptions::charset_conv_error("[ICU] Error when setting substitution string.");
308 }
309 else
310 {
311 // Tell ICU top stop (and return an error) on illegal byte sequences
312 ucnv_setToUCallBack
313 (m_to, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);
314
315 if (U_FAILURE(err))
316 throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback.");
317
318 ucnv_setFromUCallBack
319 (m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);
320
321 if (U_FAILURE(err))
322 throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback.");
323 }
324 }
325
326
~charsetFilteredOutputStream_icu()327 charsetFilteredOutputStream_icu::~charsetFilteredOutputStream_icu()
328 {
329 if (m_from) ucnv_close(m_from);
330 if (m_to) ucnv_close(m_to);
331 }
332
333
getNextOutputStream()334 outputStream& charsetFilteredOutputStream_icu::getNextOutputStream()
335 {
336 return m_stream;
337 }
338
339
writeImpl(const byte_t * const data,const size_t count)340 void charsetFilteredOutputStream_icu::writeImpl
341 (const byte_t* const data, const size_t count)
342 {
343 if (m_from == NULL || m_to == NULL)
344 throw exceptions::charset_conv_error("Cannot initialize converters.");
345
346 // Allocate buffer for Unicode chars
347 const size_t uniSize = ucnv_getMinCharSize(m_from) * count * sizeof(UChar);
348 std::vector <UChar> uniBuffer(uniSize);
349
350 // Conversion loop
351 UErrorCode toErr = U_ZERO_ERROR;
352
353 const char* uniSource = reinterpret_cast <const char*>(data);
354 const char* uniSourceLimit = uniSource + count;
355
356 do
357 {
358 // Convert from source charset to Unicode
359 UChar* uniTarget = &uniBuffer[0];
360 UChar* uniTargetLimit = &uniBuffer[0] + uniSize;
361
362 toErr = U_ZERO_ERROR;
363
364 ucnv_toUnicode(m_from, &uniTarget, uniTargetLimit,
365 &uniSource, uniSourceLimit, NULL, /* flush */ FALSE, &toErr);
366
367 if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR)
368 {
369 if (toErr == U_INVALID_CHAR_FOUND ||
370 toErr == U_TRUNCATED_CHAR_FOUND ||
371 toErr == U_ILLEGAL_CHAR_FOUND)
372 {
373 throw exceptions::illegal_byte_sequence_for_charset();
374 }
375 else
376 {
377 throw exceptions::charset_conv_error
378 ("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'.");
379 }
380 }
381
382 const size_t uniLength = uniTarget - &uniBuffer[0];
383
384 // Allocate buffer for destination charset
385 const size_t cpSize = ucnv_getMinCharSize(m_to) * uniLength;
386 std::vector <char> cpBuffer(cpSize);
387
388 // Convert from Unicode to destination charset
389 UErrorCode fromErr = U_ZERO_ERROR;
390
391 const UChar* cpSource = &uniBuffer[0];
392 const UChar* cpSourceLimit = &uniBuffer[0] + uniLength;
393
394 do
395 {
396 char* cpTarget = &cpBuffer[0];
397 char* cpTargetLimit = &cpBuffer[0] + cpSize;
398
399 fromErr = U_ZERO_ERROR;
400
401 ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit,
402 &cpSource, cpSourceLimit, NULL, /* flush */ FALSE, &fromErr);
403
404 if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr))
405 {
406 if (fromErr == U_INVALID_CHAR_FOUND ||
407 fromErr == U_TRUNCATED_CHAR_FOUND ||
408 fromErr == U_ILLEGAL_CHAR_FOUND)
409 {
410 throw exceptions::illegal_byte_sequence_for_charset();
411 }
412 else
413 {
414 throw exceptions::charset_conv_error
415 ("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'.");
416 }
417 }
418
419 const size_t cpLength = cpTarget - &cpBuffer[0];
420
421 // Write successfully converted bytes
422 m_stream.write(&cpBuffer[0], cpLength);
423
424 } while (fromErr == U_BUFFER_OVERFLOW_ERROR);
425
426 } while (toErr == U_BUFFER_OVERFLOW_ERROR);
427 }
428
429
flush()430 void charsetFilteredOutputStream_icu::flush()
431 {
432 if (m_from == NULL || m_to == NULL)
433 throw exceptions::charset_conv_error("Cannot initialize converters.");
434
435 // Allocate buffer for Unicode chars
436 const size_t uniSize = ucnv_getMinCharSize(m_from) * 1024 * sizeof(UChar);
437 std::vector <UChar> uniBuffer(uniSize);
438
439 // Conversion loop (with flushing)
440 UErrorCode toErr = U_ZERO_ERROR;
441
442 const char* uniSource = 0;
443 const char* uniSourceLimit = 0;
444
445 do
446 {
447 // Convert from source charset to Unicode
448 UChar* uniTarget = &uniBuffer[0];
449 UChar* uniTargetLimit = &uniBuffer[0] + uniSize;
450
451 toErr = U_ZERO_ERROR;
452
453 ucnv_toUnicode(m_from, &uniTarget, uniTargetLimit,
454 &uniSource, uniSourceLimit, NULL, /* flush */ TRUE, &toErr);
455
456 if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR)
457 {
458 throw exceptions::charset_conv_error
459 ("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'.");
460 }
461
462 const size_t uniLength = uniTarget - &uniBuffer[0];
463
464 // Allocate buffer for destination charset
465 const size_t cpSize = ucnv_getMinCharSize(m_to) * uniLength;
466 std::vector <char> cpBuffer(cpSize);
467
468 // Convert from Unicode to destination charset
469 UErrorCode fromErr = U_ZERO_ERROR;
470
471 const UChar* cpSource = &uniBuffer[0];
472 const UChar* cpSourceLimit = &uniBuffer[0] + uniLength;
473
474 do
475 {
476 char* cpTarget = &cpBuffer[0];
477 char* cpTargetLimit = &cpBuffer[0] + cpSize;
478
479 fromErr = U_ZERO_ERROR;
480
481 ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit,
482 &cpSource, cpSourceLimit, NULL, /* flush */ TRUE, &fromErr);
483
484 if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr))
485 {
486 throw exceptions::charset_conv_error
487 ("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'.");
488 }
489
490 const size_t cpLength = cpTarget - &cpBuffer[0];
491
492 // Write successfully converted bytes
493 m_stream.write(&cpBuffer[0], cpLength);
494
495 } while (fromErr == U_BUFFER_OVERFLOW_ERROR);
496
497 } while (toErr == U_BUFFER_OVERFLOW_ERROR);
498
499 m_stream.flush();
500 }
501
502
503 } // utility
504
505
506 } // vmime
507
508
509 #endif // VMIME_CHARSETCONV_LIB_IS_ICU
510