1 /* ====================================================================
2 * The Kannel Software License, Version 1.0
3 *
4 * Copyright (c) 2001-2014 Kannel Group
5 * Copyright (c) 1998-2001 WapIT Ltd.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 *
20 * 3. The end-user documentation included with the redistribution,
21 * if any, must include the following acknowledgment:
22 * "This product includes software developed by the
23 * Kannel Group (http://www.kannel.org/)."
24 * Alternately, this acknowledgment may appear in the software itself,
25 * if and wherever such third-party acknowledgments normally appear.
26 *
27 * 4. The names "Kannel" and "Kannel Group" must not be used to
28 * endorse or promote products derived from this software without
29 * prior written permission. For written permission, please
30 * contact org@kannel.org.
31 *
32 * 5. Products derived from this software may not be called "Kannel",
33 * nor may "Kannel" appear in their name, without prior written
34 * permission of the Kannel Group.
35 *
36 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39 * DISCLAIMED. IN NO EVENT SHALL THE KANNEL GROUP OR ITS CONTRIBUTORS
40 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
41 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
42 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
43 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
44 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
45 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
46 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
47 * ====================================================================
48 *
49 * This software consists of voluntary contributions made by many
50 * individuals on behalf of the Kannel Group. For more information on
51 * the Kannel Group, please see <http://www.kannel.org/>.
52 *
53 * Portions of this software are based upon software originally written at
54 * WapIT Ltd., Helsinki, Finland for the Kannel project.
55 */
56
57 /*
58 * gwlib/charset.c - character set conversions
59 *
60 * This file implements the character set conversions declared in charset.h.
61 *
62 * Richard Braakman
63 */
64
65 #include "gwlib/gwlib.h"
66
67 #if HAVE_ICONV
68 #include <errno.h>
69 #include <iconv.h>
70 #endif
71
72 /* Code used for non-representable characters */
73 #define NRP '?'
74
75 #include "gwlib/latin1_to_gsm.h"
76
77
/*
 * GSM 03.38 extension table, Latin-1 flavour.
 *
 * Each entry maps the septet that follows a GSM escape (27) to the byte
 * that replaces the whole escape sequence. The list is terminated by an
 * entry whose fields are both -1. Escaped characters that are not listed
 * are handled by the caller.
 *
 * The euro sign (escaped 'e', GSM code 101) has no ISO-Latin-1 code point;
 * it is mapped to 128 here.
 * NOTE(review): 128 is presumably the Windows-1252 euro position and the
 * counterpart of the entry in latin1_to_gsm.h -- confirm.
 */
static const struct {
    int gsmesc;     /* septet following the escape character */
    int latin1;     /* replacement byte */
} gsm_esctolatin1[] = {
    { 0x0A, 0x0C }, /* ASCII page break (form feed) */
    { 0x14, '^' },
    { 0x28, '{' },
    { 0x29, '}' },
    { 0x2F, '\\' },
    { 0x3C, '[' },
    { 0x3D, '~' },
    { 0x3E, ']' },
    { 0x40, '|' },
    { 0x65, 0x80 }, /* escaped 'e' (euro sign) */
    { -1, -1 }      /* end marker */
};
99
100
/**
 * GSM 03.38 extension table, Unicode flavour.
 *
 * Maps the septet that follows a GSM escape (27) to the Unicode code
 * position of the character the escape sequence stands for. The list is
 * terminated by an entry whose fields are both -1.
 */
static const struct {
    int gsmesc;     /* septet following the escape character */
    int unichar;    /* Unicode code position */
} gsm_esctouni[] = {
    { 0x0A, 0x0C },     /* ASCII page break (form feed) */
    { 0x14, '^' },
    { 0x28, '{' },
    { 0x29, '}' },
    { 0x2F, '\\' },
    { 0x3C, '[' },
    { 0x3D, '~' },
    { 0x3E, ']' },
    { 0x40, '|' },
    { 0x65, 0x20AC },   /* escaped 'e': euro symbol */
    { -1, -1 }          /* end marker */
};
120
121
/*
 * GSM default alphabet -> ISO-Latin-1, indexed by the 7-bit GSM code.
 *
 * The Greek capitals at positions 16 and 18 through 26 have no Latin-1
 * equivalent and are stored as '?'. Position 27 is the GSM escape
 * character; it is stored as a space, although the code that walks this
 * table normally treats the escape specially before looking it up.
 */
static const unsigned char gsm_to_latin1[128] = {
    '@', 0xA3, '$', 0xA5, 0xE8, 0xE9, 0xF9, 0xEC,    /*   0 -   7 */
    0xF2, 0xC7, 0x0A, 0xD8, 0xF8, 0x0D, 0xC5, 0xE5,  /*   8 -  15 */
    '?', '_', '?', '?', '?', '?', '?', '?',          /*  16 -  23 */
    '?', '?', '?', ' ', 0xC6, 0xE6, 0xDF, 0xC9,      /*  24 -  31 */
    ' ', '!', '"', '#', 0xA4, '%', '&', '\'',        /*  32 -  39 */
    '(', ')', '*', '+', ',', '-', '.', '/',          /*  40 -  47 */
    '0', '1', '2', '3', '4', '5', '6', '7',          /*  48 -  55 */
    '8', '9', ':', ';', '<', '=', '>', '?',          /*  56 -  63 */
    0xA1, 'A', 'B', 'C', 'D', 'E', 'F', 'G',         /*  64 -  71 */
    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',          /*  72 -  79 */
    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',          /*  80 -  87 */
    'X', 'Y', 'Z', 0xC4, 0xD6, 0xD1, 0xDC, 0xA7,     /*  88 -  95 */
    0xBF, 'a', 'b', 'c', 'd', 'e', 'f', 'g',         /*  96 - 103 */
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',          /* 104 - 111 */
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',          /* 112 - 119 */
    'x', 'y', 'z', 0xE4, 0xF6, 0xF1, 0xFC, 0xE0      /* 120 - 127 */
};
146
147 /**
148 * Map GSM default alphabet characters to unicode codeposition.
149 * The escape character, at position 27, is mapped to a NRP,
150 * though normally the function that indexes into this table will
151 * treat it specially.
152 */
153 static const int gsm_to_unicode[128] = {
154 '@', 0xA3, '$', 0xA5, 0xE8, 0xE9, 0xF9, 0xEC, /* 0 - 7 */
155 0xF2, 0xC7, 10, 0xd8, 0xF8, 13, 0xC5, 0xE5, /* 8 - 15 */
156 0x394, '_', 0x3A6, 0x393, 0x39B, 0x3A9, 0x3A0, 0x3A8, /* 16 - 23 */
157 0x3A3, 0x398, 0x39E, NRP, 0xC6, 0xE6, 0xDF, 0xC9, /* 24 - 31 */
158 ' ', '!', '"', '#', 0xA4, '%', '&', '\'', /* 32 - 39 */
159 '(', ')', '*', '+', ',', '-', '.', '/', /* 40 - 47 */
160 '0', '1', '2', '3', '4', '5', '6', '7', /* 48 - 55 */
161 '8', '9', ':', ';', '<', '=', '>', '?', /* 56 - 63 */
162 0xA1, 'A', 'B', 'C', 'D', 'E', 'F', 'G', /* 64 - 71 */
163 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', /* 73 - 79 */
164 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', /* 80 - 87 */
165 'X', 'Y', 'Z', 0xC4, 0xD6, 0xD1, 0xDC, 0xA7, /* 88 - 95 */
166 0xBF, 'a', 'b', 'c', 'd', 'e', 'f', 'g', /* 96 - 103 */
167 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', /* 104 - 111 */
168 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', /* 112 - 119 */
169 'x', 'y', 'z', 0xE4, 0xF6, 0xF1, 0xFC, 0xE0 /* 120 - 127 */
170 };
171
/*
 * Aliases for Windows character sets, registered so that libxml/libiconv
 * can recognise them. Each entry pairs the name the library knows
 * ("real") with an additional spelling ("alias"). The list ends with an
 * entry whose "real" field is NULL.
 */
typedef struct alias_t {
    char *real;     /* charset name known to the library */
    char *alias;    /* extra spelling to accept for it */
} alias_t;

alias_t chars_aliases[] = {
    { "CP1250", "WIN-1250" },
    { "CP1250", "WINDOWS-1250" },
    { "CP1251", "WIN-1251" },
    { "CP1251", "WINDOWS-1251" },
    { "CP1252", "WIN-1252" },
    { "CP1252", "WINDOWS-1252" },
    { "CP1253", "WIN-1253" },
    { "CP1253", "WINDOWS-1253" },
    { "CP1254", "WIN-1254" },
    { "CP1254", "WINDOWS-1254" },
    { "CP1257", "WIN-1257" },
    { "CP1257", "WINDOWS-1257" },
    { NULL, NULL }  /* end marker */
};
199
charset_init()200 void charset_init()
201 {
202 int i;
203
204 for (i = 0; chars_aliases[i].real != NULL; i++) {
205 xmlAddEncodingAlias(chars_aliases[i].real,chars_aliases[i].alias);
206 /*debug("encoding",0,"Add encoding for %s",chars_aliases[i].alias);*/
207 }
208 }
209
/*
 * Counterpart of charset_init(): calls xmlCleanupEncodingAliases() so the
 * aliases registered at start-up are released again.
 */
void charset_shutdown()
{
    xmlCleanupEncodingAliases();
}
214
215 /**
216 * Convert octet string in GSM format to UTF-8.
217 * Every GSM character can be represented with unicode, hence nothing will
218 * be lost. Escaped charaters will be translated into appropriate UTF-8 character.
219 */
charset_gsm_to_utf8(Octstr * ostr)220 void charset_gsm_to_utf8(Octstr *ostr)
221 {
222 long pos, len;
223 Octstr *newostr;
224
225 if (ostr == NULL)
226 return;
227
228 newostr = octstr_create("");
229 len = octstr_len(ostr);
230
231 for (pos = 0; pos < len; pos++) {
232 int c, i;
233
234 c = octstr_get_char(ostr, pos);
235 if (c > 127) {
236 warning(0, "Could not convert GSM (0x%02x) to Unicode.", c);
237 continue;
238 }
239
240 if(c == 27 && pos + 1 < len) {
241 c = octstr_get_char(ostr, ++pos);
242 for (i = 0; gsm_esctouni[i].gsmesc >= 0; i++) {
243 if (gsm_esctouni[i].gsmesc == c)
244 break;
245 }
246 if (gsm_esctouni[i].gsmesc == c) {
247 /* found a value for escaped char */
248 c = gsm_esctouni[i].unichar;
249 } else {
250 /* nothing found, look esc in our table */
251 c = gsm_to_unicode[27];
252 pos--;
253 }
254 } else if (c < 128) {
255 c = gsm_to_unicode[c];
256 }
257 /* unicode to utf-8 */
258 if(c < 128) {
259 /* 0-127 are ASCII chars that need no conversion */
260 octstr_append_char(newostr, c);
261 } else {
262 /* test if it can be converterd into a two byte char */
263 if(c < 0x0800) {
264 octstr_append_char(newostr, ((c >> 6) | 0xC0) & 0xFF); /* add 110xxxxx */
265 octstr_append_char(newostr, (c & 0x3F) | 0x80); /* add 10xxxxxx */
266 } else {
267 /* else we encode with 3 bytes. This only happens in case of euro symbol */
268 octstr_append_char(newostr, ((c >> 12) | 0xE0) & 0xFF); /* add 1110xxxx */
269 octstr_append_char(newostr, (((c >> 6) & 0x3F) | 0x80) & 0xFF); /* add 10xxxxxx */
270 octstr_append_char(newostr, ((c & 0x3F) | 0x80) & 0xFF); /* add 10xxxxxx */
271 }
272 /* There are no 4 bytes encoded characters in GSM charset */
273 }
274 }
275
276 octstr_truncate(ostr, 0);
277 octstr_append(ostr, newostr);
278 octstr_destroy(newostr);
279 }
280
281 /**
282 * Convert octet string in UTF-8 format to GSM 03.38.
283 * Because not all UTF-8 charater can be converted to GSM 03.38 non
284 * convertable character replaces with NRP character (see define above).
285 * Special characters will be formed into escape sequences.
286 * Incomplete UTF-8 characters at the end of the string will be skipped.
287 */
charset_utf8_to_gsm(Octstr * ostr)288 void charset_utf8_to_gsm(Octstr *ostr)
289 {
290 long pos, len;
291 int val1, val2;
292 Octstr *newostr;
293
294 if (ostr == NULL)
295 return;
296
297 newostr = octstr_create("");
298 len = octstr_len(ostr);
299
300 for (pos = 0; pos < len; pos++) {
301 val1 = octstr_get_char(ostr, pos);
302
303 /* check range */
304 if (val1 < 0 || val1 > 255) {
305 warning(0, "Char (0x%02x) in UTF-8 string not in the range (0, 255). Skipped.", val1);
306 continue;
307 }
308
309 /* Convert UTF-8 to unicode code */
310
311 /* test if two byte utf8 char */
312 if ((val1 & 0xE0) == 0xC0) {
313 /* test if incomplete utf char */
314 if(pos + 1 < len) {
315 val2 = octstr_get_char(ostr, ++pos);
316 val1 = (((val1 & ~0xC0) << 6) | (val2 & 0x3F));
317 } else {
318 /* incomplete, ignore it */
319 warning(0, "Incomplete UTF-8 char discovered, skipped. 1");
320 pos += 1;
321 continue;
322 }
323 } else if ((val1 & 0xF0) == 0xE0) { /* test for three byte utf8 char */
324 if(pos + 2 < len) {
325 val2 = octstr_get_char(ostr, ++pos);
326 val1 = (((val1 & ~0xE0) << 6) | (val2 & 0x3F));
327 val2 = octstr_get_char(ostr, ++pos);
328 val1 = (val1 << 6) | (val2 & 0x3F);
329 } else {
330 /* incomplete, ignore it */
331 warning(0, "Incomplete UTF-8 char discovered, skipped. 2");
332 pos += 2;
333 continue;
334 }
335 }
336
337 /* test Latin code page 1 char */
338 if(val1 <= 255) {
339 val1 = latin1_to_gsm[val1];
340 /* needs to be escaped ? */
341 if(val1 < 0) {
342 octstr_append_char(newostr, 27);
343 val1 *= -1;
344 }
345 } else {
346 /* Its not a Latin1 char, test for allowed GSM chars */
347 switch(val1) {
348 case 0x394:
349 val1 = 0x10; /* GREEK CAPITAL LETTER DELTA */
350 break;
351 case 0x3A6:
352 val1 = 0x12; /* GREEK CAPITAL LETTER PHI */
353 break;
354 case 0x393:
355 val1 = 0x13; /* GREEK CAPITAL LETTER GAMMA */
356 break;
357 case 0x39B:
358 val1 = 0x14; /* GREEK CAPITAL LETTER LAMBDA */
359 break;
360 case 0x3A9:
361 val1 = 0x15; /* GREEK CAPITAL LETTER OMEGA */
362 break;
363 case 0x3A0:
364 val1 = 0x16; /* GREEK CAPITAL LETTER PI */
365 break;
366 case 0x3A8:
367 val1 = 0x17; /* GREEK CAPITAL LETTER PSI */
368 break;
369 case 0x3A3:
370 val1 = 0x18; /* GREEK CAPITAL LETTER SIGMA */
371 break;
372 case 0x398:
373 val1 = 0x19; /* GREEK CAPITAL LETTER THETA */
374 break;
375 case 0x39E:
376 val1 = 0x1A; /* GREEK CAPITAL LETTER XI */
377 break;
378 case 0x20AC:
379 val1 = 'e'; /* EURO SIGN */
380 octstr_append_char(newostr, 27);
381 break;
382 default: val1 = NRP; /* character cannot be represented in GSM 03.38 */
383 }
384 }
385 octstr_append_char(newostr, val1);
386 }
387
388 octstr_truncate(ostr, 0);
389 octstr_append(ostr, newostr);
390 octstr_destroy(newostr);
391 }
392
393
/*
 * Convert an octet string from the GSM default alphabet to ISO-Latin-1,
 * in place.
 *
 * A GSM escape (27) that is not the last octet is deleted and the octet
 * behind it is translated through gsm_esctolatin1; if it is not listed
 * there it falls back to gsm_to_latin1. All other octets below 128 are
 * translated through gsm_to_latin1. Octets of 128 and above are left
 * untouched.
 */
void charset_gsm_to_latin1(Octstr *ostr)
{
    long idx, total;

    total = octstr_len(ostr);
    for (idx = 0; idx < total; idx++) {
        int ch, mapped, k;
        int in_esc_table = 0;

        ch = octstr_get_char(ostr, idx);
        if (ch == 27 && idx + 1 < total) {
            /* Drop the escape itself; the escaped octet slides into
             * position idx and the string gets one octet shorter. */
            octstr_delete(ostr, idx, 1);
            total--;
            ch = octstr_get_char(ostr, idx);
            for (k = 0; gsm_esctolatin1[k].gsmesc >= 0; k++) {
                if (gsm_esctolatin1[k].gsmesc == ch) {
                    in_esc_table = 1;
                    break;
                }
            }
        }

        if (in_esc_table)
            mapped = gsm_esctolatin1[k].latin1;
        else if (ch < 128)
            mapped = gsm_to_latin1[ch];
        else
            continue;   /* not a GSM septet, keep as is */

        if (mapped != ch)
            octstr_set_char(ostr, idx, mapped);
    }
}
428
429
/*
 * Convert an octet string from ISO-Latin-1 to the GSM default alphabet,
 * in place.
 *
 * Every octet is translated through latin1_to_gsm. A negative table entry
 * marks a character of the GSM extension table: an escape (27) is inserted
 * in front of it and the negated entry is stored as the escaped code, so
 * the string may grow.
 */
void charset_latin1_to_gsm(Octstr *ostr)
{
    long pos, len;
    int c, new;
    unsigned char esc = 27;

    len = octstr_len(ostr);
    for (pos = 0; pos < len; pos++) {
        c = octstr_get_char(ostr, pos);
        /* c is used as an index into latin1_to_gsm, so it has to be a
         * valid octet value. The upper bound used to be 256, one past the
         * largest value the rest of this file ever uses as an index.
         * NOTE(review): assumes latin1_to_gsm has 256 entries -- confirm
         * against latin1_to_gsm.h. */
        gw_assert(c >= 0);
        gw_assert(c <= 255);
        new = latin1_to_gsm[c];
        if (new < 0) {
            /* Escaped GSM code: put the escape in front and step over it,
             * the string is now one octet longer */
            octstr_insert_data(ostr, pos, (char*) &esc, 1);
            pos++;
            len++;
            new = -new;
        }
        if (new != c)
            octstr_set_char(ostr, pos, new);
    }
}
453
454
455 /*
456 * This function is a wrapper arround charset_latin1_to_gsm()
457 * which implements the mapping of a NRCs (national reprentation codes)
458 * ISO 21 German.
459 */
charset_gsm_to_nrc_iso_21_german(Octstr * ostr)460 void charset_gsm_to_nrc_iso_21_german(Octstr *ostr)
461 {
462 long pos, len;
463 int c, new;
464
465 len = octstr_len(ostr);
466
467 for (pos = 0; pos < len; pos++) {
468 c = octstr_get_char(ostr, pos);
469 switch (c) {
470 /* GSM value; NRC value */
471 case 0x5b: new = 0x5b; break; /* � */
472 case 0x5c: new = 0x5c; break; /* � */
473 case 0x5e: new = 0x5d; break; /* � */
474 case 0x7b: new = 0x7b; break; /* � */
475 case 0x7c: new = 0x7c; break; /* � */
476 case 0x7e: new = 0x7d; break; /* � */
477 case 0x1e: new = 0x7e; break; /* � */
478 case 0x5f: new = 0x5e; break; /* � */
479 default: new = c;
480 }
481 if (new != c)
482 octstr_set_char(ostr, pos, new);
483 }
484 }
485
/*
 * Convert an octet string from the NRC (national replacement codes)
 * ISO 21 German to the GSM default alphabet, in place. This is the
 * reverse of charset_gsm_to_nrc_iso_21_german().
 *
 * Only the codes of the German special characters differ between the two
 * sets; every other octet is left unchanged. The character named for each
 * GSM code below is the one gsm_to_latin1 holds at that position.
 *
 *   NRC    GSM    character
 *   0x5b   0x5b   A with diaeresis (identical, nothing to do)
 *   0x5c   0x5c   O with diaeresis (identical, nothing to do)
 *   0x5d   0x5e   U with diaeresis
 *   0x7b   0x7b   a with diaeresis (identical, nothing to do)
 *   0x7c   0x7c   o with diaeresis (identical, nothing to do)
 *   0x7d   0x7e   u with diaeresis
 *   0x7e   0x1e   sharp s
 *   0x5e   0x5f   section sign
 */
void charset_nrc_iso_21_german_to_gsm(Octstr *ostr)
{
    long idx, total;

    total = octstr_len(ostr);

    for (idx = 0; idx < total; idx++) {
        switch (octstr_get_char(ostr, idx)) {
        case 0x5d:  /* U with diaeresis */
            octstr_set_char(ostr, idx, 0x5e);
            break;
        case 0x7d:  /* u with diaeresis */
            octstr_set_char(ostr, idx, 0x7e);
            break;
        case 0x7e:  /* sharp s */
            octstr_set_char(ostr, idx, 0x1e);
            break;
        case 0x5e:  /* section sign */
            octstr_set_char(ostr, idx, 0x5f);
            break;
        default:    /* same code in both sets */
            break;
        }
    }
}
511
/*
 * Shorten a GSM encoded string to at most max octets without cutting an
 * escape sequence in half: if the last octet that would be kept is a GSM
 * escape (27), it is chopped off as well.
 *
 * Returns 1 if the string was truncated, 0 if it already fit.
 */
int charset_gsm_truncate(Octstr *gsm, long max)
{
    long keep;

    if (octstr_len(gsm) <= max)
        return 0;

    /* An escape in the last kept position would lose its escaped
     * character, so drop it too. */
    keep = (octstr_get_char(gsm, max - 1) == 27) ? max - 1 : max;
    octstr_truncate(gsm, keep);
    return 1;
}
525
/*
 * Convert the string "from", encoded in the character set named by
 * "charset_from", to UTF-8 using the libxml encoding handlers.
 *
 * The result is stored in a newly created Octstr in *to. If charset_from
 * is exactly "UTF-8" the input is simply duplicated.
 *
 * Returns 0 for the UTF-8 shortcut, -2 if libxml has no handler for
 * charset_from (*to is not touched), -1 if the libxml buffers could not be
 * created (*to is not touched), otherwise the return value of
 * xmlCharEncInFunc().
 */
int charset_to_utf8(Octstr *from, Octstr **to, Octstr *charset_from)
{
    int ret;
    xmlCharEncodingHandlerPtr handler = NULL;
    xmlBufferPtr frombuffer = NULL;
    xmlBufferPtr tobuffer = NULL;

    if (octstr_compare(charset_from, octstr_imm("UTF-8")) == 0) {
        *to = octstr_duplicate(from);
        return 0;
    }

    handler = xmlFindCharEncodingHandler(octstr_get_cstr(charset_from));
    if (handler == NULL)
        return -2;

    /* Build the libxml buffers for the transcoding. */
    tobuffer = xmlBufferCreate();
    frombuffer = xmlBufferCreate();
    if (tobuffer == NULL || frombuffer == NULL) {
        /* tobuffer->content is dereferenced below, so bail out here */
        if (tobuffer != NULL)
            xmlBufferFree(tobuffer);
        if (frombuffer != NULL)
            xmlBufferFree(frombuffer);
        xmlCharEncCloseFunc(handler);
        return -1;
    }
    xmlBufferAdd(frombuffer, (unsigned char*)octstr_get_cstr(from), octstr_len(from));

    ret = xmlCharEncInFunc(handler, tobuffer, frombuffer);

    *to = octstr_create_from_data((char*)tobuffer->content, tobuffer->use);

    /* Memory cleanup. The handler has to be closed as well, otherwise
     * handlers that libxml created on demand (iconv based ones) leak. */
    xmlBufferFree(tobuffer);
    xmlBufferFree(frombuffer);
    xmlCharEncCloseFunc(handler);

    return ret;
}
557
/*
 * Convert the UTF-8 string "utf8" to the character set named by
 * "charset_to" using the libxml encoding handlers.
 *
 * The result is stored in a newly created Octstr in *to.
 *
 * Returns -2 if libxml has no handler for charset_to (*to is not touched),
 * -1 if the libxml buffers could not be created (*to is not touched),
 * otherwise the return value of xmlCharEncOutFunc(), with every value
 * below -2 folded into -1.
 */
int charset_from_utf8(Octstr *utf8, Octstr **to, Octstr *charset_to)
{
    int ret;
    xmlCharEncodingHandlerPtr handler = NULL;
    xmlBufferPtr frombuffer = NULL;
    xmlBufferPtr tobuffer = NULL;

    handler = xmlFindCharEncodingHandler(octstr_get_cstr(charset_to));
    if (handler == NULL)
        return -2;

    /* Build the libxml buffers for the transcoding. */
    tobuffer = xmlBufferCreate();
    frombuffer = xmlBufferCreate();
    if (tobuffer == NULL || frombuffer == NULL) {
        /* tobuffer->content is dereferenced below, so bail out here */
        if (tobuffer != NULL)
            xmlBufferFree(tobuffer);
        if (frombuffer != NULL)
            xmlBufferFree(frombuffer);
        xmlCharEncCloseFunc(handler);
        return -1;
    }
    xmlBufferAdd(frombuffer, (unsigned char*)octstr_get_cstr(utf8), octstr_len(utf8));

    ret = xmlCharEncOutFunc(handler, tobuffer, frombuffer);
    if (ret < -2)
        /* Libxml seems to be here a little uncertain what would be the
         * return code -3, so let's make it -1. Ugly thing, indeed. --tuo */
        ret = -1;

    *to = octstr_create_from_data((char*)tobuffer->content, tobuffer->use);

    /* Memory cleanup. The handler has to be closed as well, otherwise
     * handlers that libxml created on demand (iconv based ones) leak. */
    xmlBufferFree(tobuffer);
    xmlBufferFree(frombuffer);
    xmlCharEncCloseFunc(handler);

    return ret;
}
588
#if HAVE_ICONV
/*
 * Enlarge the iconv output buffer by "extra" bytes.
 *
 * *buf is the start of the buffer, *cursor the current write position
 * inside it and *left the number of bytes still free behind the cursor.
 * All three are updated, so that afterwards exactly *left bytes are
 * writable at *cursor.
 */
static void charset_grow_outbuf(char **buf, char **cursor, size_t *left, size_t extra)
{
    size_t used = (size_t) (*cursor - *buf);

    *buf = gw_realloc(*buf, used + *left + extra);
    *cursor = *buf + used;
    *left += extra;
}
#endif

/*
 * Convert "string" in place from the character set "charset_from" to
 * "charset_to" using iconv.
 *
 * Bytes that iconv rejects as an invalid or incomplete multibyte sequence
 * are copied to the output unchanged and the conversion carries on behind
 * them. On any other iconv failure the string is left as it was.
 *
 * Returns 0 on success (also when there was nothing to convert) and -1 on
 * error: a NULL argument, charset names iconv_open() does not accept, an
 * unrecoverable iconv failure, or a build without iconv support.
 */
int charset_convert(Octstr *string, char *charset_from, char *charset_to)
{
#if HAVE_ICONV
    char *from_buf, *to_buf, *pointer;
    size_t inbytesleft, outbytesleft, rc;
    int result;
    int saved_errno = 0;    /* errno of the failure that ended the loop */
    long bad_pos = -1;      /* input offset of the last EILSEQ, if any */
    iconv_t cd;

    if (!charset_from || !charset_to || !string) /* sanity check */
        return -1;

    if (octstr_len(string) < 1 || strcasecmp(charset_from, charset_to) == 0)
        return 0; /* we are done, nothing to convert */

    cd = iconv_open(charset_to, charset_from);
    /* Did I succeed in getting a conversion descriptor ? */
    if (cd == (iconv_t)(-1)) {
        /* I guess not */
        error(0,"Failed to convert string from <%s> to <%s> - probably broken type names.",
              charset_from, charset_to);
        return -1;
    }

    from_buf = octstr_get_cstr(string);
    inbytesleft = octstr_len(string);
    /* allocate max sized buffer, assuming target encoding may be 4 byte unicode */
    outbytesleft = inbytesleft * 4;
    pointer = to_buf = gw_malloc(outbytesleft);

    do {
        rc = iconv(cd, (ICONV_CONST char**) &from_buf, &inbytesleft, &pointer, &outbytesleft);
        if (rc == (size_t) -1) {
            /* the conversion failed somewhere; grab errno before any
             * other call can overwrite it */
            int err = errno;
            long in_pos = (long) (from_buf - octstr_get_cstr(string));

            switch (err) {
            case E2BIG: /* no space in output buffer */
                debug("charset", 0, "outbuf to small, realloc.");
                /* The helper keeps the buffer size and outbytesleft in
                 * step; adding to outbytesleft without also keeping the
                 * still unused tail would let iconv write past the end. */
                charset_grow_outbuf(&to_buf, &pointer, &outbytesleft, inbytesleft * 4);
                rc = 0;
                break;
            case EILSEQ: /* invalid multibyte sequence */
            case EINVAL: /* incomplete multibyte sequence */
                warning(0, "Invalid/Incomplete multibyte sequence at position %d, skeep it.",
                        (int) in_pos);
                if (err == EILSEQ)
                    bad_pos = in_pos;
                /* copy the offending byte as it is and try the next one */
                if (outbytesleft == 0) {
                    /* buffer too small */
                    charset_grow_outbuf(&to_buf, &pointer, &outbytesleft, inbytesleft * 4);
                }
                pointer[0] = from_buf[0];
                pointer++;
                from_buf++;
                inbytesleft--;
                outbytesleft--;
                rc = 0;
                break;
            default:
                /* not recoverable, rc stays (size_t) -1 and ends the loop */
                saved_errno = err;
                break;
            }
        }
    } while (inbytesleft && rc == 0); /* stop if error occurs and not handled above */

    iconv_close(cd);

    if (rc != (size_t) -1) {
        /* conversion succeeded */
        octstr_truncate(string, 0);
        octstr_append_data(string, to_buf, pointer - to_buf);
        if (rc)
            debug("charset", 0, "charset_convert did %ld non-reversible conversions", (long) rc);
        result = 0;
    } else {
        error(saved_errno,"Failed to convert string from <%s> to <%s>.", charset_from, charset_to);
        result = -1;
    }

    /* The offset was recorded while from_buf still pointed into the
     * original string data; by now the string may have been rewritten. */
    if (bad_pos >= 0) {
        debug("charset_convert", 0, "Found an invalid multibyte sequence at position <%d>",
              (int) bad_pos);
    }
    gw_free(to_buf);
    return result;
#else
    /* no convertion done due to not having iconv */
    return -1;
#endif
}
676