1 /**********************************************************************
2 *
3 * Name: cpl_recode_iconv.cpp
4 * Project: CPL - Common Portability Library
5 * Purpose: Character set recoding and char/wchar_t conversions implemented
6 * using the iconv() functionality.
7 * Author: Andrey Kiselev, dron@ak4719.spb.edu
8 *
9 **********************************************************************
10 * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
11 * Copyright (c) 2011-2012, Even Rouault <even dot rouault at spatialys.com>
12 *
13 * Permission to use, copy, modify, and distribute this software for any
14 * purpose with or without fee is hereby granted, provided that the above
15 * copyright notice and this permission notice appear in all copies.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
18 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
19 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
20 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
22 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
23 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24 **********************************************************************/
25
26 #include "cpl_port.h"
27
28 #include <algorithm>
29
30 CPL_CVSID("$Id: cpl_recode_iconv.cpp b1c9c12ad373e40b955162b45d704070d4ebf7b0 2019-06-19 16:50:15 +0200 Even Rouault $")
31
32 #ifdef CPL_RECODE_ICONV
33
34 #include <iconv.h>
35 #include "cpl_string.h"
36
37 #ifndef ICONV_CPP_CONST
38 #define ICONV_CPP_CONST ICONV_CONST
39 #endif
40
// Lower bound, in bytes, used when sizing the destination buffer handed to
// iconv(); the recoding routines below take the max of this and the input
// length.
constexpr size_t CPL_RECODE_DSTBUF_SIZE = 32 * 1024;
42
43 /* used by cpl_recode.cpp */
44 extern void CPLClearRecodeIconvWarningFlags();
45 extern char *CPLRecodeIconv( const char *, const char *, const char * ) CPL_RETURNS_NONNULL;
46 extern char *CPLRecodeFromWCharIconv( const wchar_t *,
47 const char *, const char * );
48 extern wchar_t *CPLRecodeToWCharIconv( const char *,
49 const char *, const char * );
50
51 /************************************************************************/
52 /* CPLClearRecodeIconvWarningFlags() */
53 /************************************************************************/
54
// One-shot latches for the "characters couldn't be converted" warning.
// bHaveWarned1 is set by CPLRecodeIconv() and bHaveWarned2 by
// CPLRecodeFromWCharIconv() the first time they emit that warning.
static bool bHaveWarned1 = false;
static bool bHaveWarned2 = false;

/**
 * Reset both warning latches so that the next unconvertible character met
 * by either recoding routine is reported again.
 */
void CPLClearRecodeIconvWarningFlags()
{
    bHaveWarned1 = bHaveWarned2 = false;
}
63
64 /************************************************************************/
65 /* CPLRecodeIconv() */
66 /************************************************************************/
67
68 /**
69 * Convert a string from a source encoding to a destination encoding
70 * using the iconv() function.
71 *
72 * If an error occurs an error may, or may not be posted with CPLError().
73 *
74 * @param pszSource a NULL terminated string.
75 * @param pszSrcEncoding the source encoding.
76 * @param pszDstEncoding the destination encoding.
77 *
78 * @return a NULL terminated string which should be freed with CPLFree().
79 */
80
CPLRecodeIconv(const char * pszSource,const char * pszSrcEncoding,const char * pszDstEncoding)81 char *CPLRecodeIconv( const char *pszSource,
82 const char *pszSrcEncoding,
83 const char *pszDstEncoding )
84
85 {
86 iconv_t sConv;
87
88 sConv = iconv_open( pszDstEncoding, pszSrcEncoding );
89
90 if( sConv == reinterpret_cast<iconv_t>(-1) )
91 {
92 CPLError( CE_Warning, CPLE_AppDefined,
93 "Recode from %s to %s failed with the error: \"%s\".",
94 pszSrcEncoding, pszDstEncoding, strerror(errno) );
95
96 return CPLStrdup(pszSource);
97 }
98
99 /* -------------------------------------------------------------------- */
100 /* XXX: There is a portability issue: iconv() function could be */
101 /* declared differently on different platforms. The second */
102 /* argument could be declared as char** (as POSIX defines) or */
103 /* as a const char**. Handle it with the ICONV_CPP_CONST macro here. */
104 /* -------------------------------------------------------------------- */
105 ICONV_CPP_CONST char *pszSrcBuf = const_cast<ICONV_CPP_CONST char *>(pszSource);
106 size_t nSrcLen = strlen( pszSource );
107 size_t nDstCurLen = std::max(CPL_RECODE_DSTBUF_SIZE, nSrcLen);
108 size_t nDstLen = nDstCurLen;
109 char *pszDestination =
110 static_cast<char *>(CPLCalloc(nDstCurLen + 1, sizeof(char)));
111 char *pszDstBuf = pszDestination;
112
113 while( nSrcLen > 0 )
114 {
115 size_t nConverted =
116 iconv( sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen );
117
118 if( nConverted == static_cast<size_t>(-1) )
119 {
120 if( errno == EILSEQ )
121 {
122 // Skip the invalid sequence in the input string.
123 if( !bHaveWarned1 )
124 {
125 bHaveWarned1 = true;
126 CPLError(CE_Warning, CPLE_AppDefined,
127 "One or several characters couldn't be converted "
128 "correctly from %s to %s. "
129 "This warning will not be emitted anymore",
130 pszSrcEncoding, pszDstEncoding);
131 }
132 nSrcLen--;
133 pszSrcBuf++;
134 continue;
135 }
136
137 else if( errno == E2BIG )
138 {
139 // We are running out of the output buffer.
140 // Dynamically increase the buffer size.
141 size_t nTmp = nDstCurLen;
142 nDstCurLen *= 2;
143 pszDestination =
144 static_cast<char *>(CPLRealloc(pszDestination, nDstCurLen + 1));
145 pszDstBuf = pszDestination + nTmp - nDstLen;
146 nDstLen += nTmp;
147 continue;
148 }
149
150 else
151 break;
152 }
153 }
154
155 pszDestination[nDstCurLen - nDstLen] = '\0';
156
157 iconv_close( sConv );
158
159 return pszDestination;
160 }
161
162 /************************************************************************/
163 /* CPLRecodeFromWCharIconv() */
164 /************************************************************************/
165
166 /**
167 * Convert wchar_t string to UTF-8.
168 *
169 * Convert a wchar_t string into a multibyte utf-8 string
170 * using the iconv() function.
171 *
172 * Note that the wchar_t type varies in size on different systems. On
173 * win32 it is normally 2 bytes, and on unix 4 bytes.
174 *
175 * If an error occurs an error may, or may not be posted with CPLError().
176 *
177 * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
178 * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
179 * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
180 *
181 * @return a zero terminated multi-byte string which should be freed with
182 * CPLFree(), or NULL if an error occurs.
183 */
184
CPLRecodeFromWCharIconv(const wchar_t * pwszSource,const char * pszSrcEncoding,const char * pszDstEncoding)185 char *CPLRecodeFromWCharIconv( const wchar_t *pwszSource,
186 const char *pszSrcEncoding,
187 const char *pszDstEncoding )
188
189 {
190 /* -------------------------------------------------------------------- */
191 /* What is the source length. */
192 /* -------------------------------------------------------------------- */
193 size_t nSrcLen = 0;
194
195 while( pwszSource[nSrcLen] != 0 )
196 nSrcLen++;
197
198 /* -------------------------------------------------------------------- */
199 /* iconv() does not support wchar_t so we need to repack the */
200 /* characters according to the width of a character in the */
201 /* source encoding. For instance if wchar_t is 4 bytes but our */
202 /* source is UTF16 then we need to pack down into 2 byte */
203 /* characters before passing to iconv(). */
204 /* -------------------------------------------------------------------- */
205 const int nTargetCharWidth = CPLEncodingCharSize( pszSrcEncoding );
206
207 if( nTargetCharWidth < 1 )
208 {
209 CPLError( CE_Warning, CPLE_AppDefined,
210 "Recode from %s with CPLRecodeFromWChar() failed because"
211 " the width of characters in the encoding are not known.",
212 pszSrcEncoding );
213 return CPLStrdup("");
214 }
215
216 GByte *pszIconvSrcBuf =
217 static_cast<GByte *>(CPLCalloc((nSrcLen + 1), nTargetCharWidth));
218
219 for( unsigned int iSrc = 0; iSrc <= nSrcLen; iSrc++ )
220 {
221 if( nTargetCharWidth == 1 )
222 pszIconvSrcBuf[iSrc] = static_cast<GByte>(pwszSource[iSrc]);
223 else if( nTargetCharWidth == 2 )
224 (reinterpret_cast<short *>(pszIconvSrcBuf))[iSrc] =
225 static_cast<short>(pwszSource[iSrc]);
226 else if( nTargetCharWidth == 4 )
227 (reinterpret_cast<GInt32 *>(pszIconvSrcBuf))[iSrc] = pwszSource[iSrc];
228 }
229
230 /* -------------------------------------------------------------------- */
231 /* Create the iconv() translation object. */
232 /* -------------------------------------------------------------------- */
233 iconv_t sConv;
234
235 sConv = iconv_open( pszDstEncoding, pszSrcEncoding );
236
237 if( sConv == reinterpret_cast<iconv_t>(-1) )
238 {
239 CPLFree( pszIconvSrcBuf );
240 CPLError( CE_Warning, CPLE_AppDefined,
241 "Recode from %s to %s failed with the error: \"%s\".",
242 pszSrcEncoding, pszDstEncoding, strerror(errno) );
243
244 return CPLStrdup( "" );
245 }
246
247 /* -------------------------------------------------------------------- */
248 /* XXX: There is a portability issue: iconv() function could be */
249 /* declared differently on different platforms. The second */
250 /* argument could be declared as char** (as POSIX defines) or */
251 /* as a const char**. Handle it with the ICONV_CPP_CONST macro here. */
252 /* -------------------------------------------------------------------- */
253 ICONV_CPP_CONST char *pszSrcBuf = const_cast<ICONV_CPP_CONST char *>(
254 reinterpret_cast<char*>(pszIconvSrcBuf));
255
256 /* iconv expects a number of bytes, not characters */
257 nSrcLen *= sizeof(wchar_t);
258
259 /* -------------------------------------------------------------------- */
260 /* Allocate destination buffer. */
261 /* -------------------------------------------------------------------- */
262 size_t nDstCurLen = std::max(CPL_RECODE_DSTBUF_SIZE, nSrcLen + 1);
263 size_t nDstLen = nDstCurLen;
264 char *pszDestination =
265 static_cast<char *>(CPLCalloc(nDstCurLen, sizeof(char)));
266 char *pszDstBuf = pszDestination;
267
268 while( nSrcLen > 0 )
269 {
270 const size_t nConverted =
271 iconv( sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen );
272
273 if( nConverted == static_cast<size_t>(-1) )
274 {
275 if( errno == EILSEQ )
276 {
277 // Skip the invalid sequence in the input string.
278 nSrcLen--;
279 pszSrcBuf += sizeof(wchar_t);
280 if( !bHaveWarned2 )
281 {
282 bHaveWarned2 = true;
283 CPLError(CE_Warning, CPLE_AppDefined,
284 "One or several characters couldn't be converted "
285 "correctly from %s to %s. "
286 "This warning will not be emitted anymore",
287 pszSrcEncoding, pszDstEncoding);
288 }
289 continue;
290 }
291
292 else if( errno == E2BIG )
293 {
294 // We are running out of the output buffer.
295 // Dynamically increase the buffer size.
296 size_t nTmp = nDstCurLen;
297 nDstCurLen *= 2;
298 pszDestination =
299 static_cast<char *>(CPLRealloc(pszDestination, nDstCurLen));
300 pszDstBuf = pszDestination + nTmp - nDstLen;
301 nDstLen += nDstCurLen - nTmp;
302 continue;
303 }
304
305 else
306 break;
307 }
308 }
309
310 pszDestination[nDstCurLen - nDstLen] = '\0';
311
312 iconv_close( sConv );
313
314 CPLFree( pszIconvSrcBuf );
315
316 return pszDestination;
317 }
318
319 /************************************************************************/
320 /* CPLRecodeToWCharIconv() */
321 /************************************************************************/
322
323 /**
324 * Convert UTF-8 string to a wchar_t string.
325 *
326 * Convert a 8bit, multi-byte per character input string into a wide
327 * character (wchar_t) string using the iconv() function.
328 *
329 * Note that the wchar_t type varies in size on different systems. On
330 * win32 it is normally 2 bytes, and on unix 4 bytes.
331 *
332 * If an error occurs an error may, or may not be posted with CPLError().
333 *
334 * @param pszSource input multi-byte character string.
335 * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
336 * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
337 *
338 * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
339 * NULL on error.
340 */
341
CPLRecodeToWCharIconv(const char * pszSource,const char * pszSrcEncoding,const char * pszDstEncoding)342 wchar_t *CPLRecodeToWCharIconv( const char *pszSource,
343 const char *pszSrcEncoding,
344 const char *pszDstEncoding )
345
346 {
347 return reinterpret_cast<wchar_t *>(CPLRecodeIconv( pszSource,
348 pszSrcEncoding, pszDstEncoding));
349 }
350
351 #endif /* CPL_RECODE_ICONV */
352