xref: /dragonfly/contrib/grep/lib/striconv.c (revision 25a2db75)
1 /* Charset conversion.
2    Copyright (C) 2001-2007, 2010-2012 Free Software Foundation, Inc.
3    Written by Bruno Haible and Simon Josefsson.
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3, or (at your option)
8    any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program; if not, see <http://www.gnu.org/licenses/>.  */
17 
18 #include <config.h>
19 
20 /* Specification.  */
21 #include "striconv.h"
22 
23 #include <errno.h>
24 #include <stdlib.h>
25 #include <string.h>
26 
27 #if HAVE_ICONV
28 # include <iconv.h>
29 /* Get MB_LEN_MAX, CHAR_BIT.  */
30 # include <limits.h>
31 #endif
32 
33 #include "c-strcase.h"
34 
35 #ifndef SIZE_MAX
36 # define SIZE_MAX ((size_t) -1)
37 #endif
38 
39 
40 #if HAVE_ICONV
41 
/* ICONV_CONST is the qualifier (possibly empty) used when casting the input
   pointer for iconv(); it is expected to come from the build configuration
   (presumably <config.h>).  Fall back to the POSIX prototype, which takes a
   plain 'char **', when it has not been defined.  */
# ifndef ICONV_CONST
#  define ICONV_CONST
# endif

/* Convert the SRCLEN bytes starting at SRC through the conversion
   descriptor CD.

   The input is converted twice.  A first pass converts into a scratch
   buffer and only adds up the number of output bytes; a second pass
   converts into the final buffer of exactly that size.

   On entry, *RESULTP and *LENGTHP may describe a buffer supplied by the
   caller: it is used when *RESULTP is non-NULL and *LENGTHP is at least the
   required size.  Otherwise the result buffer is obtained from malloc().

   An EINVAL result from iconv() (input ending in an incomplete sequence)
   ends the conversion without being reported as an error; the output
   produced up to that point is kept.

   Return 0 on success, with the buffer stored in *RESULTP and its size in
   *LENGTHP.  If the output is empty, only *LENGTHP is set (to 0) and
   *RESULTP is left unchanged.
   Return -1 with errno set on failure; a buffer allocated here is freed,
   and *RESULTP and *LENGTHP are left unchanged.  */
int
mem_cd_iconv (const char *src, size_t srclen, iconv_t cd,
              char **resultp, size_t *lengthp)
{
# define tmpbufsize 4096
  size_t length;
  char *result;

  /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
# if defined _LIBICONV_VERSION \
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
          || defined __sun)
  /* Set to the initial state.  */
  iconv (cd, NULL, NULL, NULL, NULL);
# endif

  /* Determine the length we need.  */
  {
    size_t count = 0;
    /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
       libiconv's UCS-4-INTERNAL encoding.  */
    union { unsigned int align; char buf[tmpbufsize]; } tmp;
# define tmpbuf tmp.buf
    const char *inptr = src;
    size_t insize = srclen;

    while (insize > 0)
      {
        /* Every iteration starts again at the beginning of the scratch
           buffer; only the number of bytes written matters here.  */
        char *outptr = tmpbuf;
        size_t outsize = tmpbufsize;
        size_t res = iconv (cd,
                            (ICONV_CONST char **) &inptr, &insize,
                            &outptr, &outsize);

        if (res == (size_t)(-1))
          {
            if (errno == E2BIG)
              /* Scratch buffer full: count it below and go on.  */
              ;
            else if (errno == EINVAL)
              {
                /* iconv() may already have converted the characters that
                   precede the incomplete sequence.  Count them before
                   leaving the loop, so that the second pass gets a buffer
                   large enough to hold them.  */
                count += outptr - tmpbuf;
                break;
              }
            else
              return -1;
          }
# if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
        /* Irix iconv() inserts a NUL byte if it cannot convert.
           NetBSD iconv() inserts a question mark if it cannot convert.
           Only GNU libiconv and GNU libc are known to prefer to fail rather
           than doing a lossy conversion.  */
        else if (res > 0)
          {
            errno = EILSEQ;
            return -1;
          }
# endif
        count += outptr - tmpbuf;
      }
    /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
# if defined _LIBICONV_VERSION \
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
          || defined __sun)
    {
      /* Flush the descriptor, counting whatever it still emits.  */
      char *outptr = tmpbuf;
      size_t outsize = tmpbufsize;
      size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);

      if (res == (size_t)(-1))
        return -1;
      count += outptr - tmpbuf;
    }
# endif
    length = count;
# undef tmpbuf
  }

  if (length == 0)
    {
      /* Nothing to store: report the size, leave *resultp alone.  */
      *lengthp = 0;
      return 0;
    }
  if (*resultp != NULL && *lengthp >= length)
    /* The caller's buffer is large enough.  */
    result = *resultp;
  else
    {
      result = (char *) malloc (length);
      if (result == NULL)
        {
          errno = ENOMEM;
          return -1;
        }
    }

  /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
# if defined _LIBICONV_VERSION \
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
          || defined __sun)
  /* Return to the initial state.  */
  iconv (cd, NULL, NULL, NULL, NULL);
# endif

  /* Do the conversion for real.  */
  {
    const char *inptr = src;
    size_t insize = srclen;
    char *outptr = result;
    size_t outsize = length;

    while (insize > 0)
      {
        size_t res = iconv (cd,
                            (ICONV_CONST char **) &inptr, &insize,
                            &outptr, &outsize);

        if (res == (size_t)(-1))
          {
            if (errno == EINVAL)
              break;
            else
              /* This includes E2BIG: the buffer was sized by the first
                 pass, so running out of room is a failure.  */
              goto fail;
          }
# if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
        /* Irix iconv() inserts a NUL byte if it cannot convert.
           NetBSD iconv() inserts a question mark if it cannot convert.
           Only GNU libiconv and GNU libc are known to prefer to fail rather
           than doing a lossy conversion.  */
        else if (res > 0)
          {
            errno = EILSEQ;
            goto fail;
          }
# endif
      }
    /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
# if defined _LIBICONV_VERSION \
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
          || defined __sun)
    {
      size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);

      if (res == (size_t)(-1))
        goto fail;
    }
# endif
    /* Both passes must have produced the same number of bytes.  */
    if (outsize != 0)
      abort ();
  }

  *resultp = result;
  *lengthp = length;

  return 0;

 fail:
  {
    /* Free only a buffer allocated here, never the caller's, and keep the
       errno that describes the failure.  */
    if (result != *resultp)
      {
        int saved_errno = errno;
        free (result);
        errno = saved_errno;
      }
    return -1;
  }
# undef tmpbufsize
}
205 
206 char *
207 str_cd_iconv (const char *src, iconv_t cd)
208 {
209   /* For most encodings, a trailing NUL byte in the input will be converted
210      to a trailing NUL byte in the output.  But not for UTF-7.  So that this
211      function is usable for UTF-7, we have to exclude the NUL byte from the
212      conversion and add it by hand afterwards.  */
213 # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
214   /* Irix iconv() inserts a NUL byte if it cannot convert.
215      NetBSD iconv() inserts a question mark if it cannot convert.
216      Only GNU libiconv and GNU libc are known to prefer to fail rather
217      than doing a lossy conversion.  For other iconv() implementations,
218      we have to look at the number of irreversible conversions returned;
219      but this information is lost when iconv() returns for an E2BIG reason.
220      Therefore we cannot use the second, faster algorithm.  */
221 
222   char *result = NULL;
223   size_t length = 0;
224   int retval = mem_cd_iconv (src, strlen (src), cd, &result, &length);
225   char *final_result;
226 
227   if (retval < 0)
228     {
229       if (result != NULL)
230         abort ();
231       return NULL;
232     }
233 
234   /* Add the terminating NUL byte.  */
235   final_result =
236     (result != NULL ? realloc (result, length + 1) : malloc (length + 1));
237   if (final_result == NULL)
238     {
239       free (result);
240       errno = ENOMEM;
241       return NULL;
242     }
243   final_result[length] = '\0';
244 
245   return final_result;
246 
247 # else
248   /* This algorithm is likely faster than the one above.  But it may produce
249      iconv() returns for an E2BIG reason, when the output size guess is too
250      small.  Therefore it can only be used when we don't need the number of
251      irreversible conversions performed.  */
252   char *result;
253   size_t result_size;
254   size_t length;
255   const char *inptr = src;
256   size_t inbytes_remaining = strlen (src);
257 
258   /* Make a guess for the worst-case output size, in order to avoid a
259      realloc.  It's OK if the guess is wrong as long as it is not zero and
260      doesn't lead to an integer overflow.  */
261   result_size = inbytes_remaining;
262   {
263     size_t approx_sqrt_SIZE_MAX = SIZE_MAX >> (sizeof (size_t) * CHAR_BIT / 2);
264     if (result_size <= approx_sqrt_SIZE_MAX / MB_LEN_MAX)
265       result_size *= MB_LEN_MAX;
266   }
267   result_size += 1; /* for the terminating NUL */
268 
269   result = (char *) malloc (result_size);
270   if (result == NULL)
271     {
272       errno = ENOMEM;
273       return NULL;
274     }
275 
276   /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
277 # if defined _LIBICONV_VERSION \
278      || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
279           || defined __sun)
280   /* Set to the initial state.  */
281   iconv (cd, NULL, NULL, NULL, NULL);
282 # endif
283 
284   /* Do the conversion.  */
285   {
286     char *outptr = result;
287     size_t outbytes_remaining = result_size - 1;
288 
289     for (;;)
290       {
291         /* Here inptr + inbytes_remaining = src + strlen (src),
292                 outptr + outbytes_remaining = result + result_size - 1.  */
293         size_t res = iconv (cd,
294                             (ICONV_CONST char **) &inptr, &inbytes_remaining,
295                             &outptr, &outbytes_remaining);
296 
297         if (res == (size_t)(-1))
298           {
299             if (errno == EINVAL)
300               break;
301             else if (errno == E2BIG)
302               {
303                 size_t used = outptr - result;
304                 size_t newsize = result_size * 2;
305                 char *newresult;
306 
307                 if (!(newsize > result_size))
308                   {
309                     errno = ENOMEM;
310                     goto failed;
311                   }
312                 newresult = (char *) realloc (result, newsize);
313                 if (newresult == NULL)
314                   {
315                     errno = ENOMEM;
316                     goto failed;
317                   }
318                 result = newresult;
319                 result_size = newsize;
320                 outptr = result + used;
321                 outbytes_remaining = result_size - 1 - used;
322               }
323             else
324               goto failed;
325           }
326         else
327           break;
328       }
329     /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
330 # if defined _LIBICONV_VERSION \
331      || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
332           || defined __sun)
333     for (;;)
334       {
335         /* Here outptr + outbytes_remaining = result + result_size - 1.  */
336         size_t res = iconv (cd, NULL, NULL, &outptr, &outbytes_remaining);
337 
338         if (res == (size_t)(-1))
339           {
340             if (errno == E2BIG)
341               {
342                 size_t used = outptr - result;
343                 size_t newsize = result_size * 2;
344                 char *newresult;
345 
346                 if (!(newsize > result_size))
347                   {
348                     errno = ENOMEM;
349                     goto failed;
350                   }
351                 newresult = (char *) realloc (result, newsize);
352                 if (newresult == NULL)
353                   {
354                     errno = ENOMEM;
355                     goto failed;
356                   }
357                 result = newresult;
358                 result_size = newsize;
359                 outptr = result + used;
360                 outbytes_remaining = result_size - 1 - used;
361               }
362             else
363               goto failed;
364           }
365         else
366           break;
367       }
368 # endif
369 
370     /* Add the terminating NUL byte.  */
371     *outptr++ = '\0';
372 
373     length = outptr - result;
374   }
375 
376   /* Give away unused memory.  */
377   if (length < result_size)
378     {
379       char *smaller_result = (char *) realloc (result, length);
380 
381       if (smaller_result != NULL)
382         result = smaller_result;
383     }
384 
385   return result;
386 
387  failed:
388   {
389     int saved_errno = errno;
390     free (result);
391     errno = saved_errno;
392     return NULL;
393   }
394 
395 # endif
396 }
397 
398 #endif
399 
/* Convert the NUL-terminated string SRC from the encoding FROM_CODESET to
   the encoding TO_CODESET.

   Return the converted string in freshly allocated memory, or NULL with
   errno set on failure.  An empty SRC, or two codeset names that compare
   equal ignoring case, yields a plain copy of SRC.  When built without
   iconv support, any real conversion fails with errno set to ENOSYS.  */
char *
str_iconv (const char *src, const char *from_codeset, const char *to_codeset)
{
  /* Nothing to convert: duplicate the input.  */
  if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
    {
      char *copy = strdup (src);

      if (copy == NULL)
        errno = ENOMEM;
      return copy;
    }

#if HAVE_ICONV
  {
    iconv_t cd;
    char *converted;

    /* Avoid glibc-2.1 bug with EUC-KR.  */
# if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
     && !defined _LIBICONV_VERSION
    if (c_strcasecmp (from_codeset, "EUC-KR") == 0
        || c_strcasecmp (to_codeset, "EUC-KR") == 0)
      {
        errno = EINVAL;
        return NULL;
      }
# endif
    cd = iconv_open (to_codeset, from_codeset);
    if (cd == (iconv_t) -1)
      return NULL;

    converted = str_cd_iconv (src, cd);

    if (converted == NULL)
      {
        /* The conversion failed: close the descriptor, but report the
           errno set by str_cd_iconv, not one from iconv_close.  */
        int saved_errno = errno;
        iconv_close (cd);
        errno = saved_errno;
        return NULL;
      }

    if (iconv_close (cd) < 0)
      {
        /* Closing failed: discard the converted string and report the
           errno set by iconv_close, which free must not overwrite.  */
        int saved_errno = errno;
        free (converted);
        errno = saved_errno;
        return NULL;
      }

    return converted;
  }
#else
  /* Deliberately a different error code than the one seen when
     iconv_open exists but does not support from_codeset and to_codeset,
     so that the caller can emit an error message such as
       "iconv() is not supported. Installing GNU libiconv and
        then reinstalling this package would fix this."  */
  errno = ENOSYS;
  return NULL;
#endif
}
464