1 /*
2  * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License along
15  * with this program; if not, write to the Free Software Foundation, Inc.,
16  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17  */
18 
19 #ifdef HAVE_CONFIG_H
20 #  include <config.h>
21 #endif
22 
23 #if !defined _WIN32 && defined HAVE_ICONV
24 
25 #include <assert.h>
26 #include <errno.h>
27 #include <iconv.h>
28 #include <stdlib.h>
29 #include <string.h>
30 
31 #include "iconvert.h"
32 #include "share/alloc.h"
33 #include "share/safe_str.h"
34 
/*
 * Convert data from one encoding to another. Return:
 *
 *  -2 : memory allocation failed (also used if the final conversion
 *       pass unexpectedly runs out of output space)
 *  -1 : unknown encoding
 *   0 : data was converted exactly
 *   1 : data was converted inexactly
 *   2 : data was invalid (but still converted)
 *
 * Parameters:
 *
 *  fromcode, tocode : iconv encoding names of the input and output
 *  from, fromlen    : input bytes and their count
 *  to               : if non-NULL, receives a newly allocated,
 *                     NUL-terminated result that the caller must free()
 *  tolen            : if non-NULL, receives the result length in bytes,
 *                     not counting the terminating NUL
 *
 * We convert in two steps, via UTF-8, as this is the only
 * reliable way of distinguishing between invalid input
 * and valid input which iconv refuses to transliterate.
 * We convert from UTF-8 twice, because we have no way of
 * knowing whether the conversion was exact if iconv returns
 * E2BIG (due to a bug in the specification of iconv).
 * An alternative approach is to assume that the output of
 * iconv is never more than 4 times as long as the input,
 * but I prefer to avoid that assumption if possible.
 *
 * Invalid input bytes are replaced by '#'; valid characters that
 * cannot be represented in the target encoding are replaced by '?'.
 */

/*
 * Return nonzero if `name` spells "UTF-8", ignoring ASCII case.
 * Written as explicit comparisons (not strcasecmp(), which is
 * locale-dependent) whose short-circuit evaluation never reads
 * past the terminating NUL of a short name.
 */
static int iconvert_is_utf8_(const char *name)
{
  return (name[0] == 'U' || name[0] == 'u') &&
         (name[1] == 'T' || name[1] == 't') &&
         (name[2] == 'F' || name[2] == 'f') &&
         name[3] == '-' &&
         name[4] == '8' &&
         name[5] == '\0';
}

/*
 * Feed a single '?' through `cd`, appending the result at *ob.
 * Returns whatever iconv() returned.
 */
static size_t iconvert_put_replacement_(iconv_t cd, char **ob, size_t *obl)
{
  char mark[] = "?";
  char *tb = mark;
  size_t tbl = 1;
  size_t k = iconv(cd, &tb, &tbl, ob, obl);

  assert((!k && !tbl) ||
         (k == (size_t)(-1) && errno == EILSEQ && tbl));
  return k;
}

/*
 * Advance *ib past exactly one UTF-8 character: its first byte plus
 * any continuation bytes (10xxxxxx) that follow. Requires *ibl > 0.
 */
static void iconvert_skip_utf8_char_(char **ib, size_t *ibl)
{
  do {
    ++*ib;
    --*ibl;
  } while (*ibl && ((unsigned char)**ib & 0xC0) == 0x80);
}

int iconvert(const char *fromcode, const char *tocode,
	     const char *from, size_t fromlen,
	     char **to, size_t *tolen)
{
  int ret = 0;
  int err;
  iconv_t cd1, cd2;
  char *ib;
  char *ob;
  char *utfbuf = NULL, *outbuf = NULL, *newbuf;
  size_t utflen, outlen, ibl, obl, k;
  char tbuf[2048];

  cd1 = iconv_open("UTF-8", fromcode);
  if (cd1 == (iconv_t)(-1))
    return -1;

  /* cd2 stays invalid when the target is UTF-8: no second step needed. */
  cd2 = (iconv_t)(-1);
  if (!iconvert_is_utf8_(tocode)) {
    char *tocode1;
    /* Room for tocode, the 10 characters of "//TRANSLIT" and the NUL. */
    size_t dest_len = strlen(tocode) + 11;

    /*
     * Try using this non-standard feature of glibc and libiconv.
     * This is deliberately not a config option as people often
     * change their iconv library without rebuilding applications.
     */
    tocode1 = safe_malloc_(dest_len);
    if (!tocode1)
      goto fail;

    safe_strncpy(tocode1, tocode, dest_len);
    safe_strncat(tocode1, "//TRANSLIT", dest_len);
    cd2 = iconv_open(tocode1, "UTF-8");
    free(tocode1);

    /*
     * Fall back to plain conversion. The source must be UTF-8 here
     * too, because cd2 is only ever fed the intermediate UTF-8 buffer.
     */
    if (cd2 == (iconv_t)(-1))
      cd2 = iconv_open(tocode, "UTF-8");

    if (cd2 == (iconv_t)(-1)) {
      iconv_close(cd1);
      return -1;
    }
  }

  utflen = 1; /*fromlen * 2 + 1; XXX */
  utfbuf = malloc(utflen);
  if (!utfbuf)
    goto fail;

  /* Convert to UTF-8 */
  ib = (char *)from; /* iconv() takes char **, but does not modify the input */
  ibl = fromlen;
  ob = utfbuf;
  obl = utflen;
  for (;;) {
    k = iconv(cd1, &ib, &ibl, &ob, &obl);
    assert((!k && !ibl) ||
	   (k == (size_t)(-1) && errno == E2BIG && ibl && obl < 6) ||
	   (k == (size_t)(-1) &&
	    (errno == EILSEQ || errno == EINVAL) && ibl));
    if (!ibl)
      break;
    if (obl < 6) {
      /* Enlarge the buffer */
      size_t used = (size_t)(ob - utfbuf); /* taken before realloc() may move it */

      if (utflen * 2 < utflen) /* overflow check */
	goto fail;
      utflen *= 2;
      newbuf = realloc(utfbuf, utflen);
      if (!newbuf)
	goto fail;
      utfbuf = newbuf;
      ob = utfbuf + used;
      obl = utflen - used;
    }
    else {
      /* Invalid input: drop one byte, emit '#', and reset the converter */
      ib++;
      ibl--;
      *ob++ = '#';
      obl--;
      ret = 2;
      iconv(cd1, NULL, NULL, NULL, NULL);
    }
  }

  if (cd2 == (iconv_t)(-1)) {
    /* The target encoding was UTF-8 */
    size_t used = (size_t)(ob - utfbuf);

    if (tolen)
      *tolen = used;
    if (!to) {
      free(utfbuf);
      iconv_close(cd1);
      return ret;
    }
    /*
     * NOTE(review): whether safe_realloc_add_2op_() releases utfbuf when
     * it fails is not visible from this file; if it does, the free() at
     * `fail` would be a double free -- confirm against share/alloc.h.
     */
    newbuf = safe_realloc_add_2op_(utfbuf, used, /*+*/1);
    if (!newbuf)
      goto fail;
    newbuf[used] = '\0';
    *to = newbuf;
    iconv_close(cd1);
    return ret;
  }

  /*
   * Truncate the buffer to be tidy. This is best-effort: a zero-length
   * realloc() may free the buffer and return NULL, and a failed shrink
   * leaves the original (still large enough) buffer intact.
   */
  utflen = (size_t)(ob - utfbuf);
  if (utflen) {
    newbuf = realloc(utfbuf, utflen);
    if (newbuf)
      utfbuf = newbuf;
  }

  /* Convert from UTF-8 to discover how long the output is */
  outlen = 0;
  ib = utfbuf;
  ibl = utflen;
  while (ibl) {
    ob = tbuf;
    obl = sizeof(tbuf);
    k = iconv(cd2, &ib, &ibl, &ob, &obl);
    err = (k == (size_t)(-1)) ? errno : 0;
    assert((k != (size_t)(-1) && !ibl) ||
	   (k == (size_t)(-1) && (err == E2BIG || err == EILSEQ) && ibl));
    if (ibl && err != E2BIG) {
      /* Replace one character */
      outlen += ob - tbuf;
      ob = tbuf;
      obl = sizeof(tbuf);
      k = iconvert_put_replacement_(cd2, &ob, &obl);
      (void)k;
      iconvert_skip_utf8_char_(&ib, &ibl);
    }
    outlen += ob - tbuf;
  }
  /* Account for any bytes needed to return to the initial shift state */
  ob = tbuf;
  obl = sizeof(tbuf);
  k = iconv(cd2, NULL, NULL, &ob, &obl);
  assert(!k);
  outlen += ob - tbuf;

  /* Convert from UTF-8 for real */
  outbuf = safe_malloc_add_2op_(outlen, /*+*/1);
  if (!outbuf)
    goto fail;
  ib = utfbuf;
  ibl = utflen;
  ob = outbuf;
  obl = outlen;
  while (ibl) {
    k = iconv(cd2, &ib, &ibl, &ob, &obl);
    err = (k == (size_t)(-1)) ? errno : 0;
    assert((k != (size_t)(-1) && !ibl) ||
	   (k == (size_t)(-1) && err == EILSEQ && ibl));
    /* Nonzero k means irreversible conversions or an error: inexact */
    if (k && !ret)
      ret = 1;
    /*
     * The buffer was sized by the pass above, so E2BIG should never
     * happen; if it does, give up instead of looping without progress.
     */
    if (err == E2BIG)
      goto fail;
    if (ibl) {
      /* Replace one character */
      k = iconvert_put_replacement_(cd2, &ob, &obl);
      (void)k;
      iconvert_skip_utf8_char_(&ib, &ibl);
    }
  }
  k = iconv(cd2, NULL, NULL, &ob, &obl);
  assert(!k);
  assert(!obl);
  *ob = '\0';

  free(utfbuf);
  iconv_close(cd1);
  iconv_close(cd2);
  if (tolen)
    *tolen = outlen;
  if (!to) {
    free(outbuf);
    return ret;
  }
  *to = outbuf;
  return ret;

 fail:
  free(utfbuf);
  free(outbuf);
  iconv_close(cd1);
  if (cd2 != (iconv_t)(-1))
    iconv_close(cd2);
  return -2;
}
253 
254 #endif /* HAVE_ICONV */
255