1 /* ====================================================================
2  * The Kannel Software License, Version 1.0
3  *
4  * Copyright (c) 2001-2014 Kannel Group
5  * Copyright (c) 1998-2001 WapIT Ltd.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  *
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  *
20  * 3. The end-user documentation included with the redistribution,
21  *    if any, must include the following acknowledgment:
22  *       "This product includes software developed by the
23  *        Kannel Group (http://www.kannel.org/)."
24  *    Alternately, this acknowledgment may appear in the software itself,
25  *    if and wherever such third-party acknowledgments normally appear.
26  *
27  * 4. The names "Kannel" and "Kannel Group" must not be used to
28  *    endorse or promote products derived from this software without
29  *    prior written permission. For written permission, please
30  *    contact org@kannel.org.
31  *
32  * 5. Products derived from this software may not be called "Kannel",
33  *    nor may "Kannel" appear in their name, without prior written
34  *    permission of the Kannel Group.
35  *
36  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39  * DISCLAIMED.  IN NO EVENT SHALL THE KANNEL GROUP OR ITS CONTRIBUTORS
40  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
41  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
42  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
43  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
44  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
45  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
46  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  *
49  * This software consists of voluntary contributions made by many
50  * individuals on behalf of the Kannel Group.  For more information on
51  * the Kannel Group, please see <http://www.kannel.org/>.
52  *
53  * Portions of this software are based upon software originally written at
54  * WapIT Ltd., Helsinki, Finland for the Kannel project.
55  */
56 
57 /*
58  * gwlib/charset.c - character set conversions
59  *
60  * This file implements the character set conversions declared in charset.h.
61  *
62  * Richard Braakman
63  */
64 
65 #include "gwlib/gwlib.h"
66 
67 #if HAVE_ICONV
68 #include <errno.h>
69 #include <iconv.h>
70 #endif
71 
72 /* Code used for non-representable characters */
73 #define NRP '?'
74 
75 #include "gwlib/latin1_to_gsm.h"
76 
77 
/* GSM 03.38 extension table, ISO-Latin-1 flavour.  It gives the
 * replacement for the character that follows a GSM 27 (Escape) character.
 * Escaped codes that are not listed here are converted like ordinary,
 * unescaped GSM characters.  The euro sign (escaped 'e') has no
 * ISO-Latin-1 code point; this table maps it to 128.  The list is
 * terminated by a { -1, -1 } entry. */
static const struct {
    int gsmesc;     /* GSM code found after the escape */
    int latin1;     /* ISO-Latin-1 code it stands for */
} gsm_esctolatin1[] = {
    { 0x0A, '\f' },   /* ASCII page break (form feed) */
    { 0x14, '^'  },
    { 0x28, '{'  },
    { 0x29, '}'  },
    { 0x2F, '\\' },
    { 0x3C, '['  },
    { 0x3D, '~'  },
    { 0x3E, ']'  },
    { 0x40, '|'  },
    { 0x65, 0x80 },   /* escaped 'e' (euro sign) */
    { -1,   -1   }    /* end of table */
};
99 
100 
/**
 * GSM 03.38 extension table, Unicode flavour: maps the character that
 * follows a GSM 27 (Escape) character to its Unicode code point.
 * The list is terminated by a { -1, -1 } entry.
 */
static const struct {
    int gsmesc;     /* GSM code found after the escape */
    int unichar;    /* Unicode code point it stands for */
} gsm_esctouni[] = {
    { 0x0A, '\f'   },   /* ASCII page break (form feed) */
    { 0x14, '^'    },
    { 0x28, '{'    },
    { 0x29, '}'    },
    { 0x2F, '\\'   },
    { 0x3C, '['    },
    { 0x3D, '~'    },
    { 0x3E, ']'    },
    { 0x40, '|'    },
    { 0x65, 0x20AC },   /* escaped 'e': euro symbol */
    { -1,   -1     }    /* end of table */
};
120 
121 
/* ISO-Latin-1 equivalents of the GSM default alphabet, indexed by GSM code.
 * The Greek capitals at positions 16 and 18 through 26 have no Latin-1
 * counterpart and are stored as '?'.  Position 27 is the escape code; it
 * is stored as a space, though normally the function that indexes into
 * this table will treat it specially. */
static const unsigned char gsm_to_latin1[128] = {
     '@', 0xA3,  '$', 0xA5, 0xE8, 0xE9, 0xF9, 0xEC,   /* 0x00 - 0x07 */
    0xF2, 0xC7, '\n', 0xD8, 0xF8, '\r', 0xC5, 0xE5,   /* 0x08 - 0x0F */
     '?',  '_',  '?',  '?',  '?',  '?',  '?',  '?',   /* 0x10 - 0x17 */
     '?',  '?',  '?',  ' ', 0xC6, 0xE6, 0xDF, 0xC9,   /* 0x18 - 0x1F */
     ' ',  '!',  '"',  '#', 0xA4,  '%',  '&', '\'',   /* 0x20 - 0x27 */
     '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',   /* 0x28 - 0x2F */
     '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',   /* 0x30 - 0x37 */
     '8',  '9',  ':',  ';',  '<',  '=',  '>',  '?',   /* 0x38 - 0x3F */
    0xA1,  'A',  'B',  'C',  'D',  'E',  'F',  'G',   /* 0x40 - 0x47 */
     'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',   /* 0x48 - 0x4F */
     'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',   /* 0x50 - 0x57 */
     'X',  'Y',  'Z', 0xC4, 0xD6, 0xD1, 0xDC, 0xA7,   /* 0x58 - 0x5F */
    0xBF,  'a',  'b',  'c',  'd',  'e',  'f',  'g',   /* 0x60 - 0x67 */
     'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',   /* 0x68 - 0x6F */
     'p',  'q',  'r',  's',  't',  'u',  'v',  'w',   /* 0x70 - 0x77 */
     'x',  'y',  'z', 0xE4, 0xF6, 0xF1, 0xFC, 0xE0    /* 0x78 - 0x7F */
};
146 
147 /**
148  * Map GSM default alphabet characters to unicode codeposition.
149  * The escape character, at position 27, is mapped to a NRP,
150  * though normally the function that indexes into this table will
151  * treat it specially.
152  */
153 static const int gsm_to_unicode[128] = {
154       '@',  0xA3,   '$',  0xA5,  0xE8,  0xE9,  0xF9,  0xEC,   /* 0 - 7 */
155      0xF2,  0xC7,    10,  0xd8,  0xF8,    13,  0xC5,  0xE5,   /* 8 - 15 */
156     0x394,   '_', 0x3A6, 0x393, 0x39B, 0x3A9, 0x3A0, 0x3A8,   /* 16 - 23 */
157     0x3A3, 0x398, 0x39E,   NRP,  0xC6,  0xE6,  0xDF,  0xC9,   /* 24 - 31 */
158       ' ',   '!',   '"',   '#',  0xA4,   '%',   '&',  '\'',   /* 32 - 39 */
159       '(',   ')',   '*',   '+',   ',',   '-',   '.',   '/',   /* 40 - 47 */
160       '0',   '1',   '2',   '3',   '4',   '5',   '6',   '7',   /* 48 - 55 */
161       '8',   '9',   ':',   ';',   '<',   '=',   '>',   '?',   /* 56 - 63 */
162       0xA1,  'A',   'B',   'C',   'D',   'E',   'F',   'G',   /* 64 - 71 */
163       'H',   'I',   'J',   'K',   'L',   'M',   'N',   'O',   /* 73 - 79 */
164       'P',   'Q',   'R',   'S',   'T',   'U',   'V',   'W',   /* 80 - 87 */
165       'X',   'Y',   'Z',  0xC4,  0xD6,  0xD1,  0xDC,  0xA7,   /* 88 - 95 */
166      0xBF,   'a',   'b',   'c',   'd',   'e',   'f',   'g',   /* 96 - 103 */
167       'h',   'i',   'j',   'k',   'l',   'm',   'n',   'o',   /* 104 - 111 */
168       'p',   'q',   'r',   's',   't',   'u',   'v',   'w',   /* 112 - 119 */
169       'x',   'y',   'z',  0xE4,  0xF6,  0xF1,  0xFC,  0xE0    /* 120 - 127 */
170 };
171 
/*
 * Register aliases for Windows character sets so that libxml/libiconv can
 * recognise them.
 */
176 
/* One character set alias: `alias' is an additional name under which the
 * character set called `real' is registered. */
typedef struct alias_t {
    char *real;     /* name of the existing character set */
    char *alias;    /* extra name registered for it */
} alias_t;
183 
184 alias_t chars_aliases[] = {
185     { "CP1250", "WIN-1250" },
186     { "CP1250", "WINDOWS-1250" },
187     { "CP1251", "WIN-1251" },
188     { "CP1251", "WINDOWS-1251" },
189     { "CP1252", "WIN-1252" },
190     { "CP1252", "WINDOWS-1252" },
191     { "CP1253", "WIN-1253" },
192     { "CP1253", "WINDOWS-1253" },
193     { "CP1254", "WIN-1254" },
194     { "CP1254", "WINDOWS-1254" },
195     { "CP1257", "WIN-1257" },
196     { "CP1257", "WINDOWS-1257" },
197     { NULL }
198 };
199 
charset_init()200 void charset_init()
201 {
202     int i;
203 
204     for (i = 0; chars_aliases[i].real != NULL; i++) {
205       xmlAddEncodingAlias(chars_aliases[i].real,chars_aliases[i].alias);
206       /*debug("encoding",0,"Add encoding for %s",chars_aliases[i].alias);*/
207     }
208 }
209 
/*
 * Drop all encoding aliases that were registered with libxml.
 * Declared with a (void) parameter list so that the definition is a real
 * prototype and calls with arguments are rejected by the compiler.
 */
void charset_shutdown(void)
{
    xmlCleanupEncodingAliases();
}
214 
215 /**
216  * Convert octet string in GSM format to UTF-8.
217  * Every GSM character can be represented with unicode, hence nothing will
218  * be lost. Escaped charaters will be translated into appropriate UTF-8 character.
219  */
charset_gsm_to_utf8(Octstr * ostr)220 void charset_gsm_to_utf8(Octstr *ostr)
221 {
222     long pos, len;
223     Octstr *newostr;
224 
225     if (ostr == NULL)
226         return;
227 
228     newostr = octstr_create("");
229     len = octstr_len(ostr);
230 
231     for (pos = 0; pos < len; pos++) {
232         int c, i;
233 
234         c = octstr_get_char(ostr, pos);
235         if (c > 127) {
236             warning(0, "Could not convert GSM (0x%02x) to Unicode.", c);
237             continue;
238         }
239 
240         if(c == 27 && pos + 1 < len) {
241             c = octstr_get_char(ostr, ++pos);
242             for (i = 0; gsm_esctouni[i].gsmesc >= 0; i++) {
243                 if (gsm_esctouni[i].gsmesc == c)
244                     break;
245             }
246             if (gsm_esctouni[i].gsmesc == c) {
247                 /* found a value for escaped char */
248                 c = gsm_esctouni[i].unichar;
249             } else {
250 	        /* nothing found, look esc in our table */
251 		c = gsm_to_unicode[27];
252                 pos--;
253 	    }
254         } else if (c < 128) {
255             c = gsm_to_unicode[c];
256         }
257         /* unicode to utf-8 */
258         if(c < 128) {
259             /* 0-127 are ASCII chars that need no conversion */
260             octstr_append_char(newostr, c);
261         } else {
262             /* test if it can be converterd into a two byte char */
263             if(c < 0x0800) {
264                 octstr_append_char(newostr, ((c >> 6) | 0xC0) & 0xFF); /* add 110xxxxx */
265                 octstr_append_char(newostr, (c & 0x3F) | 0x80); /* add 10xxxxxx */
266             } else {
267                 /* else we encode with 3 bytes. This only happens in case of euro symbol */
268                 octstr_append_char(newostr, ((c >> 12) | 0xE0) & 0xFF); /* add 1110xxxx */
269                 octstr_append_char(newostr, (((c >> 6) & 0x3F) | 0x80) & 0xFF); /* add 10xxxxxx */
270                 octstr_append_char(newostr, ((c  & 0x3F) | 0x80) & 0xFF); /* add 10xxxxxx */
271             }
272             /* There are no 4 bytes encoded characters in GSM charset */
273         }
274     }
275 
276     octstr_truncate(ostr, 0);
277     octstr_append(ostr, newostr);
278     octstr_destroy(newostr);
279 }
280 
281 /**
282  * Convert octet string in UTF-8 format to GSM 03.38.
283  * Because not all UTF-8 charater can be converted to GSM 03.38 non
284  * convertable character replaces with NRP character (see define above).
285  * Special characters will be formed into escape sequences.
286  * Incomplete UTF-8 characters at the end of the string will be skipped.
287  */
charset_utf8_to_gsm(Octstr * ostr)288 void charset_utf8_to_gsm(Octstr *ostr)
289 {
290     long pos, len;
291     int val1, val2;
292     Octstr *newostr;
293 
294     if (ostr == NULL)
295         return;
296 
297     newostr = octstr_create("");
298     len = octstr_len(ostr);
299 
300     for (pos = 0; pos < len; pos++) {
301         val1 = octstr_get_char(ostr, pos);
302 
303         /* check range */
304         if (val1 < 0 || val1 > 255) {
305             warning(0, "Char (0x%02x) in UTF-8 string not in the range (0, 255). Skipped.", val1);
306             continue;
307         }
308 
309         /* Convert UTF-8 to unicode code */
310 
311         /* test if two byte utf8 char */
312         if ((val1 & 0xE0) == 0xC0) {
313             /* test if incomplete utf char */
314             if(pos + 1 < len) {
315                 val2 = octstr_get_char(ostr, ++pos);
316                 val1 = (((val1 & ~0xC0) << 6) | (val2 & 0x3F));
317             } else {
318                 /* incomplete, ignore it */
319                 warning(0, "Incomplete UTF-8 char discovered, skipped. 1");
320                 pos += 1;
321                 continue;
322             }
323         } else if ((val1 & 0xF0) == 0xE0) { /* test for three byte utf8 char */
324             if(pos + 2 < len) {
325                 val2 = octstr_get_char(ostr, ++pos);
326                 val1 = (((val1 & ~0xE0) << 6) | (val2 & 0x3F));
327                 val2 = octstr_get_char(ostr, ++pos);
328                 val1 = (val1 << 6) | (val2 & 0x3F);
329             } else {
330                 /* incomplete, ignore it */
331                 warning(0, "Incomplete UTF-8 char discovered, skipped. 2");
332                 pos += 2;
333                 continue;
334             }
335         }
336 
337         /* test Latin code page 1 char */
338         if(val1 <= 255) {
339             val1 = latin1_to_gsm[val1];
340             /* needs to be escaped ? */
341             if(val1 < 0) {
342                 octstr_append_char(newostr, 27);
343                 val1 *= -1;
344             }
345         } else {
346             /* Its not a Latin1 char, test for allowed GSM chars */
347             switch(val1) {
348             case 0x394:
349                 val1 = 0x10; /* GREEK CAPITAL LETTER DELTA */
350                 break;
351             case 0x3A6:
352                 val1 = 0x12; /* GREEK CAPITAL LETTER PHI */
353                 break;
354             case 0x393:
355                 val1 = 0x13; /* GREEK CAPITAL LETTER GAMMA */
356                 break;
357             case 0x39B:
358                 val1 = 0x14; /* GREEK CAPITAL LETTER LAMBDA */
359                 break;
360             case 0x3A9:
361                 val1 = 0x15; /* GREEK CAPITAL LETTER OMEGA */
362                 break;
363             case 0x3A0:
364                 val1 = 0x16; /* GREEK CAPITAL LETTER PI */
365                 break;
366             case 0x3A8:
367                 val1 = 0x17; /* GREEK CAPITAL LETTER PSI */
368                 break;
369             case 0x3A3:
370                 val1 = 0x18; /* GREEK CAPITAL LETTER SIGMA */
371                 break;
372             case 0x398:
373                 val1 = 0x19; /* GREEK CAPITAL LETTER THETA */
374                 break;
375             case 0x39E:
376                 val1 = 0x1A; /* GREEK CAPITAL LETTER XI */
377                 break;
378             case 0x20AC:
379                 val1 = 'e'; /* EURO SIGN */
380                 octstr_append_char(newostr, 27);
381                 break;
382             default: val1 = NRP; /* character cannot be represented in GSM 03.38 */
383             }
384         }
385         octstr_append_char(newostr, val1);
386     }
387 
388     octstr_truncate(ostr, 0);
389     octstr_append(ostr, newostr);
390     octstr_destroy(newostr);
391 }
392 
393 
/*
 * Convert an octet string from the GSM default alphabet to ISO-Latin-1,
 * in place.  An escape code (27) that is followed by another octet is
 * removed and the following octet is translated through the extension
 * table, falling back to the ordinary table for unlisted codes.
 * Octets of 128 and above are left untouched.
 */
void charset_gsm_to_latin1(Octstr *ostr)
{
    long idx, remaining;

    remaining = octstr_len(ostr);
    for (idx = 0; idx < remaining; idx++) {
        int gsm, latin1, k;
        int escaped = 0;

        gsm = octstr_get_char(ostr, idx);
        if (gsm == 27 && idx + 1 < remaining) {
            /* Drop the escape; the octet behind it moves to this
             * position and is the one that gets translated. */
            octstr_delete(ostr, idx, 1);
            remaining--;
            gsm = octstr_get_char(ostr, idx);
            escaped = 1;
        }

        /* not a GSM code (the extension table has no such entry either) */
        if (gsm >= 128)
            continue;

        latin1 = gsm_to_latin1[gsm];
        if (escaped) {
            /* an extension table entry overrides the ordinary mapping */
            for (k = 0; gsm_esctolatin1[k].gsmesc >= 0; k++) {
                if (gsm_esctolatin1[k].gsmesc == gsm) {
                    latin1 = gsm_esctolatin1[k].latin1;
                    break;
                }
            }
        }

        if (latin1 != gsm)
            octstr_set_char(ostr, idx, latin1);
    }
}
428 
429 
/*
 * Convert an octet string from ISO-Latin-1 to the GSM default alphabet,
 * in place.  latin1_to_gsm holds the GSM code for each Latin-1 octet; a
 * negative entry means the (negated) code must be preceded by the GSM
 * escape code 27, which is inserted into the string.
 */
void charset_latin1_to_gsm(Octstr *ostr)
{
    long pos, len;
    int c, new;
    unsigned char esc = 27;

    len = octstr_len(ostr);
    for (pos = 0; pos < len; pos++) {
        c = octstr_get_char(ostr, pos);
        /* c is used as a table index and must be a Latin-1 octet,
         * i.e. 0..255 (256 itself is already out of range) */
        gw_assert(c >= 0);
        gw_assert(c <= 255);
        new = latin1_to_gsm[c];
        if (new < 0) {
            /* Escaped GSM code: put the escape in front and step over
             * it, so that pos is again at the character itself */
            octstr_insert_data(ostr, pos, (char*) &esc, 1);
            pos++;
            len++;
            new = -new;
        }
        if (new != c)
            octstr_set_char(ostr, pos, new);
    }
}
453 
454 
455 /*
456  * This function is a wrapper arround charset_latin1_to_gsm()
457  * which implements the mapping of a NRCs (national reprentation codes)
458  * ISO 21 German.
459  */
charset_gsm_to_nrc_iso_21_german(Octstr * ostr)460 void charset_gsm_to_nrc_iso_21_german(Octstr *ostr)
461 {
462     long pos, len;
463     int c, new;
464 
465     len = octstr_len(ostr);
466 
467     for (pos = 0; pos < len; pos++) {
468         c = octstr_get_char(ostr, pos);
469         switch (c) {
470             /* GSM value; NRC value */
471             case 0x5b: new = 0x5b; break; /* � */
472             case 0x5c: new = 0x5c; break; /* � */
473             case 0x5e: new = 0x5d; break; /* � */
474             case 0x7b: new = 0x7b; break; /* � */
475             case 0x7c: new = 0x7c; break; /* � */
476             case 0x7e: new = 0x7d; break; /* � */
477             case 0x1e: new = 0x7e; break; /* � */
478             case 0x5f: new = 0x5e; break; /* � */
479             default: new = c;
480         }
481         if (new != c)
482             octstr_set_char(ostr, pos, new);
483     }
484 }
485 
/*
 * Remap, in place, the NRC ISO 21 German codes of the German special
 * characters to their GSM codes; the counterpart of
 * charset_gsm_to_nrc_iso_21_german().  The codes 0x5b, 0x5c, 0x7b and
 * 0x7c are the same on both sides and are therefore not touched.
 */
void charset_nrc_iso_21_german_to_gsm(Octstr *ostr)
{
    long i, count;

    count = octstr_len(ostr);
    i = 0;
    while (i < count) {
        int nrc = octstr_get_char(ostr, i);

        /* NRC value -> GSM value */
        if (nrc == 0x5d)
            octstr_set_char(ostr, i, 0x5e);     /* U-umlaut */
        else if (nrc == 0x7d)
            octstr_set_char(ostr, i, 0x7e);     /* u-umlaut */
        else if (nrc == 0x7e)
            octstr_set_char(ostr, i, 0x1e);     /* sharp s */
        else if (nrc == 0x5e)
            octstr_set_char(ostr, i, 0x5f);     /* section sign */

        i++;
    }
}
511 
/*
 * Shorten a GSM encoded string to at most max octets.
 * If the cut would leave an escape code (27) as the last octet, the
 * escape is removed as well, so that no escape sequence is split.
 * Returns 1 if the string was shortened, 0 if it already fitted.
 */
int charset_gsm_truncate(Octstr *gsm, long max)
{
    long keep;

    if (octstr_len(gsm) <= max)
        return 0;

    /* a trailing escape without its character is of no use */
    keep = (octstr_get_char(gsm, max - 1) == 27) ? max - 1 : max;
    octstr_truncate(gsm, keep);
    return 1;
}
525 
/*
 * Convert `from', encoded in the character set named by `charset_from',
 * to UTF-8 with the help of libxml.  The result is stored in a newly
 * created octet string in *to.
 *
 * Returns 0 if charset_from is "UTF-8" (the input is merely duplicated),
 * -2 if libxml has no handler for charset_from (*to is not set in that
 * case), and otherwise the value returned by xmlCharEncInFunc().
 */
int charset_to_utf8(Octstr *from, Octstr **to, Octstr *charset_from)
{
    int ret;
    xmlCharEncodingHandlerPtr handler = NULL;
    xmlBufferPtr frombuffer = NULL;
    xmlBufferPtr tobuffer = NULL;

    if (octstr_compare(charset_from, octstr_imm("UTF-8")) == 0) {
        *to = octstr_duplicate(from);
        return 0;
    }

    handler = xmlFindCharEncodingHandler(octstr_get_cstr(charset_from));
    if (handler == NULL)
        return -2;

    /* Build the libxml buffers for the transcoding. */
    tobuffer = xmlBufferCreate();
    frombuffer = xmlBufferCreate();
    xmlBufferAdd(frombuffer, (unsigned char*)octstr_get_cstr(from), octstr_len(from));

    ret = xmlCharEncInFunc(handler, tobuffer, frombuffer);

    *to = octstr_create_from_data((char*)tobuffer->content, tobuffer->use);

    /* Memory cleanup.  The handler has to be closed as well: libxml may
     * have allocated it for this lookup, and it would leak otherwise. */
    xmlBufferFree(tobuffer);
    xmlBufferFree(frombuffer);
    xmlCharEncCloseFunc(handler);

    return ret;
}
557 
/*
 * Convert the UTF-8 string `utf8' to the character set named by
 * `charset_to' with the help of libxml.  The result is stored in a newly
 * created octet string in *to.
 *
 * Returns -2 if libxml has no handler for charset_to (*to is not set in
 * that case).  Otherwise the value returned by xmlCharEncOutFunc() is
 * passed on, except that any value below -2 is turned into -1.
 */
int charset_from_utf8(Octstr *utf8, Octstr **to, Octstr *charset_to)
{
    int ret;
    xmlCharEncodingHandlerPtr handler = NULL;
    xmlBufferPtr frombuffer = NULL;
    xmlBufferPtr tobuffer = NULL;

    handler = xmlFindCharEncodingHandler(octstr_get_cstr(charset_to));
    if (handler == NULL)
        return -2;

    /* Build the libxml buffers for the transcoding. */
    tobuffer = xmlBufferCreate();
    frombuffer = xmlBufferCreate();
    xmlBufferAdd(frombuffer, (unsigned char*)octstr_get_cstr(utf8), octstr_len(utf8));

    ret = xmlCharEncOutFunc(handler, tobuffer, frombuffer);
    if (ret < -2)
        /* Libxml seems to be here a little uncertain what would be the
         * return code -3, so let's make it -1. Ugly thing, indeed. --tuo */
        ret = -1;

    *to = octstr_create_from_data((char*)tobuffer->content, tobuffer->use);

    /* Memory cleanup.  The handler has to be closed as well: libxml may
     * have allocated it for this lookup, and it would leak otherwise. */
    xmlBufferFree(tobuffer);
    xmlBufferFree(frombuffer);
    xmlCharEncCloseFunc(handler);

    return ret;
}
588 
/*
 * Convert `string' in place from the character set `charset_from' to
 * `charset_to' using iconv.  Octets that iconv rejects as an invalid or
 * incomplete multibyte sequence are copied to the output unchanged and
 * the conversion goes on behind them.
 *
 * Returns 0 on success, also when there was nothing to convert (empty
 * string or identical character set names).  Returns -1 if an argument is
 * NULL, if iconv does not know the character sets, if the conversion
 * fails, or if the library was built without iconv; the string is left
 * unchanged in these cases.
 */
int charset_convert(Octstr* string, char* charset_from, char* charset_to)
{
#if HAVE_ICONV
    char *from_buf, *to_buf, *pointer;
    size_t inbytesleft, outbytesleft, ret;
    long tmp;
    long ilseq_pos = -1;    /* input offset of the last invalid sequence */
    int saved_errno = 0;    /* errno of the last failed iconv() call */
    int result;
    iconv_t cd;

    if (!charset_from || !charset_to || !string) /* sanity check */
        return -1;

    if (octstr_len(string) < 1 || strcasecmp(charset_from, charset_to) == 0)
        return 0; /* we are done, nothing to convert */

    cd = iconv_open(charset_to, charset_from);
    /* Did I succeed in getting a conversion descriptor ? */
    if (cd == (iconv_t)(-1)) {
        /* I guess not */
        error(0,"Failed to convert string from <%s> to <%s> - probably broken type names.",
              charset_from, charset_to);
        return -1;
    }

    from_buf = octstr_get_cstr(string);
    inbytesleft = octstr_len(string);
    /* allocate max sized buffer, assuming target encoding may be 4 byte unicode */
    outbytesleft = inbytesleft * 4;
    pointer = to_buf = gw_malloc(outbytesleft);

    do {
        ret = iconv(cd, (ICONV_CONST char**) &from_buf, &inbytesleft, &pointer, &outbytesleft);
        if (ret == (size_t) -1) {
            /* the conversion failed somewhere; keep errno before any
             * other call has a chance to change it */
            saved_errno = errno;
            switch (saved_errno) {
            case E2BIG: /* no space in output buffer */
                debug("charset", 0, "outbuf to small, realloc.");
                tmp = pointer - to_buf;
                /* Grow by inbytesleft * 4 on top of what is still free,
                 * so that outbytesleft matches the real free space. */
                to_buf = gw_realloc(to_buf, tmp + outbytesleft + inbytesleft * 4);
                outbytesleft += inbytesleft * 4;
                pointer = to_buf + tmp;
                ret = 0;
                break;
            case EILSEQ: /* invalid multibyte sequence */
            case EINVAL: /* incomplete multibyte sequence */
                tmp = from_buf - octstr_get_cstr(string);
                if (saved_errno == EILSEQ)
                    ilseq_pos = tmp;
                warning(0, "Invalid/Incomplete multibyte sequence at position %d, skeep it.",
                        (int) tmp);
                /* copy the offending octet as it is and try the next one */
                if (outbytesleft == 0) {
                    /* buffer too small */
                    tmp = pointer - to_buf;
                    to_buf = gw_realloc(to_buf, tmp + inbytesleft * 4);
                    outbytesleft = inbytesleft * 4;
                    pointer = to_buf + tmp;
                }
                pointer[0] = from_buf[0];
                pointer++;
                from_buf++;
                inbytesleft--;
                outbytesleft--;
                ret = 0;
                break;
            default:
                /* not recoverable; ret stays (size_t) -1 and ends the loop */
                break;
            }
        }
    } while(inbytesleft && ret == 0); /* stop if error occurs and not handled above */

    iconv_close(cd);

    if (ret != (size_t) -1) {
        /* conversion succeeded */
        octstr_truncate(string, 0);
        octstr_append_data(string, to_buf, pointer - to_buf);
        if (ret)
            debug("charset", 0, "charset_convert did %ld non-reversible conversions", (long) ret);
        result = 0;
    } else {
        error(saved_errno,"Failed to convert string from <%s> to <%s>.", charset_from, charset_to);
        result = -1;
    }

    /* The position was recorded while the input was still intact; it can
     * no longer be computed once the string has been rewritten. */
    if (ilseq_pos >= 0) {
        debug("charset_convert", 0, "Found an invalid multibyte sequence at position <%d>",
              (int) ilseq_pos);
    }
    gw_free(to_buf);
    return result;
#else
    /* no conversion done due to not having iconv */
    return -1;
#endif
}
676