1 /**********************************************************************
2  *
3  * Name:     cpl_recode_iconv.cpp
4  * Project:  CPL - Common Portability Library
5  * Purpose:  Character set recoding and char/wchar_t conversions implemented
6  *           using the iconv() functionality.
7  * Author:   Andrey Kiselev, dron@ak4719.spb.edu
8  *
9  **********************************************************************
10  * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
11  * Copyright (c) 2011-2012, Even Rouault <even dot rouault at spatialys.com>
12  *
13  * Permission to use, copy, modify, and distribute this software for any
14  * purpose with or without fee is hereby granted, provided that the above
15  * copyright notice and this permission notice appear in all copies.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
18  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
19  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
20  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
22  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
23  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24  **********************************************************************/
25 
26 #include "cpl_port.h"
27 
28 #include <algorithm>
29 
30 CPL_CVSID("$Id: cpl_recode_iconv.cpp b1c9c12ad373e40b955162b45d704070d4ebf7b0 2019-06-19 16:50:15 +0200 Even Rouault $")
31 
32 #ifdef CPL_RECODE_ICONV
33 
34 #include <iconv.h>
35 #include "cpl_string.h"
36 
37 #ifndef ICONV_CPP_CONST
38 #define ICONV_CPP_CONST ICONV_CONST
39 #endif
40 
41 constexpr size_t CPL_RECODE_DSTBUF_SIZE = 32768;
42 
43  /* used by cpl_recode.cpp */
44 extern void CPLClearRecodeIconvWarningFlags();
45 extern char *CPLRecodeIconv( const char *, const char *, const char * ) CPL_RETURNS_NONNULL;
46 extern char *CPLRecodeFromWCharIconv( const wchar_t *,
47                                       const char *, const char * );
48 extern wchar_t *CPLRecodeToWCharIconv( const char *,
49                                        const char *, const char * );
50 
51 /************************************************************************/
52 /*                 CPLClearRecodeIconvWarningFlags()                    */
53 /************************************************************************/
54 
// One-shot guards: each is raised the first time the corresponding recode
// routine reports an unconvertible character, so the warning is emitted once.
static bool bHaveWarned1 = false;
static bool bHaveWarned2 = false;

/**
 * Re-arm the "characters couldn't be converted" warnings.
 *
 * Lowers both one-shot guards so that the next invalid input sequence met
 * by CPLRecodeIconv() or CPLRecodeFromWCharIconv() is reported again.
 */
void CPLClearRecodeIconvWarningFlags()
{
    bHaveWarned1 = bHaveWarned2 = false;
}
63 
64 /************************************************************************/
65 /*                          CPLRecodeIconv()                            */
66 /************************************************************************/
67 
68 /**
69  * Convert a string from a source encoding to a destination encoding
70  * using the iconv() function.
71  *
72  * If an error occurs an error may, or may not be posted with CPLError().
73  *
74  * @param pszSource a NULL terminated string.
75  * @param pszSrcEncoding the source encoding.
76  * @param pszDstEncoding the destination encoding.
77  *
78  * @return a NULL terminated string which should be freed with CPLFree().
79  */
80 
CPLRecodeIconv(const char * pszSource,const char * pszSrcEncoding,const char * pszDstEncoding)81 char *CPLRecodeIconv( const char *pszSource,
82                       const char *pszSrcEncoding,
83                       const char *pszDstEncoding )
84 
85 {
86     iconv_t sConv;
87 
88     sConv = iconv_open( pszDstEncoding, pszSrcEncoding );
89 
90     if( sConv == reinterpret_cast<iconv_t>(-1) )
91     {
92         CPLError( CE_Warning, CPLE_AppDefined,
93                   "Recode from %s to %s failed with the error: \"%s\".",
94                   pszSrcEncoding, pszDstEncoding, strerror(errno) );
95 
96         return CPLStrdup(pszSource);
97     }
98 
99 /* -------------------------------------------------------------------- */
100 /*      XXX: There is a portability issue: iconv() function could be    */
101 /*      declared differently on different platforms. The second         */
102 /*      argument could be declared as char** (as POSIX defines) or      */
103 /*      as a const char**. Handle it with the ICONV_CPP_CONST macro here.   */
104 /* -------------------------------------------------------------------- */
105     ICONV_CPP_CONST char *pszSrcBuf = const_cast<ICONV_CPP_CONST char *>(pszSource);
106     size_t nSrcLen = strlen( pszSource );
107     size_t nDstCurLen = std::max(CPL_RECODE_DSTBUF_SIZE, nSrcLen);
108     size_t nDstLen = nDstCurLen;
109     char *pszDestination =
110         static_cast<char *>(CPLCalloc(nDstCurLen + 1, sizeof(char)));
111     char *pszDstBuf = pszDestination;
112 
113     while( nSrcLen > 0 )
114     {
115         size_t nConverted =
116             iconv( sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen );
117 
118         if( nConverted == static_cast<size_t>(-1) )
119         {
120             if( errno == EILSEQ )
121             {
122                 // Skip the invalid sequence in the input string.
123                 if( !bHaveWarned1 )
124                 {
125                     bHaveWarned1 = true;
126                     CPLError(CE_Warning, CPLE_AppDefined,
127                              "One or several characters couldn't be converted "
128                              "correctly from %s to %s.  "
129                              "This warning will not be emitted anymore",
130                              pszSrcEncoding, pszDstEncoding);
131                 }
132                 nSrcLen--;
133                 pszSrcBuf++;
134                 continue;
135             }
136 
137             else if( errno == E2BIG )
138             {
139                 // We are running out of the output buffer.
140                 // Dynamically increase the buffer size.
141                 size_t nTmp = nDstCurLen;
142                 nDstCurLen *= 2;
143                 pszDestination =
144                     static_cast<char *>(CPLRealloc(pszDestination, nDstCurLen + 1));
145                 pszDstBuf = pszDestination + nTmp - nDstLen;
146                 nDstLen += nTmp;
147                 continue;
148             }
149 
150             else
151                 break;
152         }
153     }
154 
155     pszDestination[nDstCurLen - nDstLen] = '\0';
156 
157     iconv_close( sConv );
158 
159     return pszDestination;
160 }
161 
162 /************************************************************************/
163 /*                      CPLRecodeFromWCharIconv()                       */
164 /************************************************************************/
165 
166 /**
167  * Convert wchar_t string to UTF-8.
168  *
169  * Convert a wchar_t string into a multibyte utf-8 string
170  * using the iconv() function.
171  *
172  * Note that the wchar_t type varies in size on different systems. On
173  * win32 it is normally 2 bytes, and on unix 4 bytes.
174  *
175  * If an error occurs an error may, or may not be posted with CPLError().
176  *
177  * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
178  * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
179  * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
180  *
181  * @return a zero terminated multi-byte string which should be freed with
182  * CPLFree(), or NULL if an error occurs.
183  */
184 
CPLRecodeFromWCharIconv(const wchar_t * pwszSource,const char * pszSrcEncoding,const char * pszDstEncoding)185 char *CPLRecodeFromWCharIconv( const wchar_t *pwszSource,
186                                const char *pszSrcEncoding,
187                                const char *pszDstEncoding )
188 
189 {
190 /* -------------------------------------------------------------------- */
191 /*      What is the source length.                                      */
192 /* -------------------------------------------------------------------- */
193     size_t nSrcLen = 0;
194 
195     while( pwszSource[nSrcLen] != 0 )
196         nSrcLen++;
197 
198 /* -------------------------------------------------------------------- */
199 /*      iconv() does not support wchar_t so we need to repack the       */
200 /*      characters according to the width of a character in the         */
201 /*      source encoding.  For instance if wchar_t is 4 bytes but our    */
202 /*      source is UTF16 then we need to pack down into 2 byte           */
203 /*      characters before passing to iconv().                           */
204 /* -------------------------------------------------------------------- */
205     const int nTargetCharWidth = CPLEncodingCharSize( pszSrcEncoding );
206 
207     if( nTargetCharWidth < 1 )
208     {
209         CPLError( CE_Warning, CPLE_AppDefined,
210                   "Recode from %s with CPLRecodeFromWChar() failed because"
211                   " the width of characters in the encoding are not known.",
212                   pszSrcEncoding );
213         return CPLStrdup("");
214     }
215 
216     GByte *pszIconvSrcBuf =
217         static_cast<GByte *>(CPLCalloc((nSrcLen + 1), nTargetCharWidth));
218 
219     for( unsigned int iSrc = 0; iSrc <= nSrcLen; iSrc++ )
220     {
221         if( nTargetCharWidth == 1 )
222             pszIconvSrcBuf[iSrc] = static_cast<GByte>(pwszSource[iSrc]);
223         else if( nTargetCharWidth == 2 )
224             (reinterpret_cast<short *>(pszIconvSrcBuf))[iSrc] =
225                 static_cast<short>(pwszSource[iSrc]);
226         else if( nTargetCharWidth == 4 )
227             (reinterpret_cast<GInt32 *>(pszIconvSrcBuf))[iSrc] = pwszSource[iSrc];
228     }
229 
230 /* -------------------------------------------------------------------- */
231 /*      Create the iconv() translation object.                          */
232 /* -------------------------------------------------------------------- */
233     iconv_t sConv;
234 
235     sConv = iconv_open( pszDstEncoding, pszSrcEncoding );
236 
237     if( sConv == reinterpret_cast<iconv_t>(-1) )
238     {
239         CPLFree( pszIconvSrcBuf );
240         CPLError( CE_Warning, CPLE_AppDefined,
241                   "Recode from %s to %s failed with the error: \"%s\".",
242                   pszSrcEncoding, pszDstEncoding, strerror(errno) );
243 
244         return CPLStrdup( "" );
245     }
246 
247 /* -------------------------------------------------------------------- */
248 /*      XXX: There is a portability issue: iconv() function could be    */
249 /*      declared differently on different platforms. The second         */
250 /*      argument could be declared as char** (as POSIX defines) or      */
251 /*      as a const char**. Handle it with the ICONV_CPP_CONST macro here.   */
252 /* -------------------------------------------------------------------- */
253     ICONV_CPP_CONST char *pszSrcBuf = const_cast<ICONV_CPP_CONST char *>(
254         reinterpret_cast<char*>(pszIconvSrcBuf));
255 
256     /* iconv expects a number of bytes, not characters */
257     nSrcLen *= sizeof(wchar_t);
258 
259 /* -------------------------------------------------------------------- */
260 /*      Allocate destination buffer.                                    */
261 /* -------------------------------------------------------------------- */
262     size_t nDstCurLen = std::max(CPL_RECODE_DSTBUF_SIZE, nSrcLen + 1);
263     size_t nDstLen = nDstCurLen;
264     char *pszDestination =
265         static_cast<char *>(CPLCalloc(nDstCurLen, sizeof(char)));
266     char *pszDstBuf = pszDestination;
267 
268     while( nSrcLen > 0 )
269     {
270         const size_t nConverted =
271             iconv( sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen );
272 
273         if( nConverted == static_cast<size_t>(-1) )
274         {
275             if( errno == EILSEQ )
276             {
277                 // Skip the invalid sequence in the input string.
278                 nSrcLen--;
279                 pszSrcBuf += sizeof(wchar_t);
280                 if( !bHaveWarned2 )
281                 {
282                     bHaveWarned2 = true;
283                     CPLError(CE_Warning, CPLE_AppDefined,
284                              "One or several characters couldn't be converted "
285                              "correctly from %s to %s.  "
286                              "This warning will not be emitted anymore",
287                              pszSrcEncoding, pszDstEncoding);
288                 }
289                 continue;
290             }
291 
292             else if( errno == E2BIG )
293             {
294                 // We are running out of the output buffer.
295                 // Dynamically increase the buffer size.
296                 size_t nTmp = nDstCurLen;
297                 nDstCurLen *= 2;
298                 pszDestination =
299                     static_cast<char *>(CPLRealloc(pszDestination, nDstCurLen));
300                 pszDstBuf = pszDestination + nTmp - nDstLen;
301                 nDstLen += nDstCurLen - nTmp;
302                 continue;
303             }
304 
305             else
306                 break;
307         }
308     }
309 
310     pszDestination[nDstCurLen - nDstLen] = '\0';
311 
312     iconv_close( sConv );
313 
314     CPLFree( pszIconvSrcBuf );
315 
316     return pszDestination;
317 }
318 
319 /************************************************************************/
320 /*                        CPLRecodeToWCharIconv()                       */
321 /************************************************************************/
322 
323 /**
324  * Convert UTF-8 string to a wchar_t string.
325  *
326  * Convert a 8bit, multi-byte per character input string into a wide
327  * character (wchar_t) string using the iconv() function.
328  *
329  * Note that the wchar_t type varies in size on different systems. On
330  * win32 it is normally 2 bytes, and on unix 4 bytes.
331  *
332  * If an error occurs an error may, or may not be posted with CPLError().
333  *
334  * @param pszSource input multi-byte character string.
335  * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
336  * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
337  *
338  * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
339  * NULL on error.
340  */
341 
CPLRecodeToWCharIconv(const char * pszSource,const char * pszSrcEncoding,const char * pszDstEncoding)342 wchar_t *CPLRecodeToWCharIconv( const char *pszSource,
343                                 const char *pszSrcEncoding,
344                                 const char *pszDstEncoding )
345 
346 {
347     return reinterpret_cast<wchar_t *>(CPLRecodeIconv( pszSource,
348                                             pszSrcEncoding, pszDstEncoding));
349 }
350 
351 #endif /* CPL_RECODE_ICONV */
352