1 /**********************************************************************
2 * $Id: cpl_recode.cpp 27044 2014-03-16 23:41:27Z rouault $
3 *
4 * Name: cpl_recode.cpp
5 * Project: CPL - Common Portability Library
6 * Purpose: Character set recoding and char/wchar_t conversions.
7 * Author: Andrey Kiselev, dron@ak4719.spb.edu
8 *
9 **********************************************************************
10 * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
11 * Copyright (c) 2008, Frank Warmerdam
12 * Copyright (c) 2011-2014, Even Rouault <even dot rouault at mines-paris dot org>
13 *
14 * Permission to use, copy, modify, and distribute this software for any
15 * purpose with or without fee is hereby granted, provided that the above
16 * copyright notice and this permission notice appear in all copies.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
19 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
20 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
21 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
22 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
23 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
24 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
25 **********************************************************************/
26
27 #include "cpl_string.h"
28
29 CPL_CVSID("$Id: cpl_recode.cpp 27044 2014-03-16 23:41:27Z rouault $");
30
/* -------------------------------------------------------------------- */
/*      Backend entry points used by the dispatchers below. They are    */
/*      only declared here; none of them is defined in this file.       */
/* -------------------------------------------------------------------- */

#ifdef CPL_RECODE_ICONV
/* iconv() based backend, only declared when CPL_RECODE_ICONV is set. */
extern void CPLClearRecodeIconvWarningFlags();
extern char *CPLRecodeIconv( const char *pszSource,
                             const char *pszSrcEncoding,
                             const char *pszDstEncoding );
extern char *CPLRecodeFromWCharIconv( const wchar_t *pwszSource,
                                      const char *pszSrcEncoding,
                                      const char *pszDstEncoding );
extern wchar_t *CPLRecodeToWCharIconv( const char *pszSource,
                                       const char *pszSrcEncoding,
                                       const char *pszDstEncoding );
#endif /* CPL_RECODE_ICONV */

/* Stub backend, always declared. */
extern void CPLClearRecodeStubWarningFlags();
extern char *CPLRecodeStub( const char *pszSource,
                            const char *pszSrcEncoding,
                            const char *pszDstEncoding );
extern char *CPLRecodeFromWCharStub( const wchar_t *pwszSource,
                                     const char *pszSrcEncoding,
                                     const char *pszDstEncoding );
extern wchar_t *CPLRecodeToWCharStub( const char *pszSource,
                                      const char *pszSrcEncoding,
                                      const char *pszDstEncoding );
extern int CPLIsUTF8Stub( const char *pabyData, int nLen );
47
48 /************************************************************************/
49 /* CPLRecode() */
50 /************************************************************************/
51
52 /**
53 * Convert a string from a source encoding to a destination encoding.
54 *
55 * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
56 * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
57 * <ul>
58 * <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in fact)</li>
59 * <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
60 * <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
61 * </ul>
62 *
63 * If an error occurs an error may, or may not be posted with CPLError().
64 *
65 * @param pszSource a NULL terminated string.
66 * @param pszSrcEncoding the source encoding.
67 * @param pszDstEncoding the destination encoding.
68 *
69 * @return a NULL terminated string which should be freed with CPLFree().
70 *
71 * @since GDAL 1.6.0
72 */
73
CPLRecode(const char * pszSource,const char * pszSrcEncoding,const char * pszDstEncoding)74 char CPL_DLL *CPLRecode( const char *pszSource,
75 const char *pszSrcEncoding,
76 const char *pszDstEncoding )
77
78 {
79 /* -------------------------------------------------------------------- */
80 /* Handle a few common short cuts. */
81 /* -------------------------------------------------------------------- */
82 if ( EQUAL(pszSrcEncoding, pszDstEncoding) )
83 return CPLStrdup(pszSource);
84
85 if ( EQUAL(pszSrcEncoding, CPL_ENC_ASCII)
86 && ( EQUAL(pszDstEncoding, CPL_ENC_UTF8)
87 || EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) )
88 return CPLStrdup(pszSource);
89
90 #ifdef CPL_RECODE_ICONV
91 /* -------------------------------------------------------------------- */
92 /* CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8 */
93 /* and CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1 conversions are hadled */
94 /* very well by the stub implementation which is faster than the */
95 /* iconv() route. Use a stub for these two ones and iconv() */
96 /* everything else. */
97 /* -------------------------------------------------------------------- */
98 if ( ( EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1)
99 && EQUAL(pszDstEncoding, CPL_ENC_UTF8) )
100 || ( EQUAL(pszSrcEncoding, CPL_ENC_UTF8)
101 && EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) )
102 {
103 return CPLRecodeStub( pszSource, pszSrcEncoding, pszDstEncoding );
104 }
105 else
106 {
107 return CPLRecodeIconv( pszSource, pszSrcEncoding, pszDstEncoding );
108 }
109 #else /* CPL_RECODE_STUB */
110 return CPLRecodeStub( pszSource, pszSrcEncoding, pszDstEncoding );
111 #endif /* CPL_RECODE_ICONV */
112 }
113
114 /************************************************************************/
115 /* CPLRecodeFromWChar() */
116 /************************************************************************/
117
118 /**
119 * Convert wchar_t string to UTF-8.
120 *
121 * Convert a wchar_t string into a multibyte utf-8 string. The only
122 * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
123 * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
124 * and CPL_ENC_ISO8859_1. In some cases (ie. using iconv()) other encodings
125 * may also be supported.
126 *
127 * Note that the wchar_t type varies in size on different systems. On
128 * win32 it is normally 2 bytes, and on unix 4 bytes.
129 *
130 * If an error occurs an error may, or may not be posted with CPLError().
131 *
132 * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
133 * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
134 * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
135 *
136 * @return a zero terminated multi-byte string which should be freed with
137 * CPLFree(), or NULL if an error occurs.
138 *
139 * @since GDAL 1.6.0
140 */
141
CPLRecodeFromWChar(const wchar_t * pwszSource,const char * pszSrcEncoding,const char * pszDstEncoding)142 char CPL_DLL *CPLRecodeFromWChar( const wchar_t *pwszSource,
143 const char *pszSrcEncoding,
144 const char *pszDstEncoding )
145
146 {
147 #ifdef CPL_RECODE_ICONV
148 /* -------------------------------------------------------------------- */
149 /* Conversions from CPL_ENC_UCS2 */
150 /* to CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */
151 /* handled by the stub implementation. */
152 /* -------------------------------------------------------------------- */
153 if ( (EQUAL(pszSrcEncoding, CPL_ENC_UCS2) || EQUAL(pszSrcEncoding, "WCHAR_T"))
154 && ( EQUAL(pszDstEncoding, CPL_ENC_UTF8)
155 || EQUAL(pszDstEncoding, CPL_ENC_ASCII)
156 || EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) )
157 {
158 return CPLRecodeFromWCharStub( pwszSource,
159 pszSrcEncoding, pszDstEncoding );
160 }
161 else
162 {
163 return CPLRecodeFromWCharIconv( pwszSource,
164 pszSrcEncoding, pszDstEncoding );
165 }
166 #else /* CPL_RECODE_STUB */
167 return CPLRecodeFromWCharStub( pwszSource,
168 pszSrcEncoding, pszDstEncoding );
169 #endif /* CPL_RECODE_ICONV */
170 }
171
172 /************************************************************************/
173 /* CPLRecodeToWChar() */
174 /************************************************************************/
175
176 /**
177 * Convert UTF-8 string to a wchar_t string.
178 *
179 * Convert a 8bit, multi-byte per character input string into a wide
180 * character (wchar_t) string. The only guaranteed supported source encodings
181 * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8869_1 (LATIN1). The only
182 * guaranteed supported destination encoding is CPL_ENC_UCS2. Other source
183 * and destination encodings may be supported depending on the underlying
184 * implementation.
185 *
186 * Note that the wchar_t type varies in size on different systems. On
187 * win32 it is normally 2 bytes, and on unix 4 bytes.
188 *
189 * If an error occurs an error may, or may not be posted with CPLError().
190 *
191 * @param pszSource input multi-byte character string.
192 * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
193 * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
194 *
195 * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
196 * NULL on error.
197 *
198 * @since GDAL 1.6.0
199 */
200
CPLRecodeToWChar(const char * pszSource,const char * pszSrcEncoding,const char * pszDstEncoding)201 wchar_t CPL_DLL *CPLRecodeToWChar( const char *pszSource,
202 const char *pszSrcEncoding,
203 const char *pszDstEncoding )
204
205 {
206 #ifdef CPL_RECODE_ICONV
207 /* -------------------------------------------------------------------- */
208 /* Conversions to CPL_ENC_UCS2 */
209 /* from CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */
210 /* handled by the stub implementation. */
211 /* -------------------------------------------------------------------- */
212 if ( (EQUAL(pszDstEncoding, CPL_ENC_UCS2) || EQUAL(pszDstEncoding, "WCHAR_T"))
213 && ( EQUAL(pszSrcEncoding, CPL_ENC_UTF8)
214 || EQUAL(pszSrcEncoding, CPL_ENC_ASCII)
215 || EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1) ) )
216 {
217 return CPLRecodeToWCharStub( pszSource,
218 pszSrcEncoding, pszDstEncoding );
219 }
220 else
221 {
222 return CPLRecodeToWCharIconv( pszSource,
223 pszSrcEncoding, pszDstEncoding );
224 }
225 #else /* CPL_RECODE_STUB */
226 return CPLRecodeToWCharStub( pszSource, pszSrcEncoding, pszDstEncoding );
227 #endif /* CPL_RECODE_ICONV */
228 }
229
230 /************************************************************************/
231 /* CPLIsUTF8() */
232 /************************************************************************/
233
234 /**
235 * Test if a string is encoded as UTF-8.
236 *
237 * @param pabyData input string to test
238 * @param nLen length of the input string, or -1 if the function must compute
239 * the string length. In which case it must be null terminated.
240 * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
241 *
242 * @since GDAL 1.7.0
243 */
CPLIsUTF8(const char * pabyData,int nLen)244 int CPLIsUTF8(const char* pabyData, int nLen)
245 {
246 return CPLIsUTF8Stub( pabyData, nLen );
247 }
248
249 /************************************************************************/
250 /* CPLForceToASCII() */
251 /************************************************************************/
252
253 /**
254 * Return a new string that is made only of ASCII characters. If non-ASCII
255 * characters are found in the input string, they will be replaced by the
256 * provided replacement character.
257 *
258 * @param pabyData input string to test
259 * @param nLen length of the input string, or -1 if the function must compute
260 * the string length. In which case it must be null terminated.
261 * @param chReplacementChar character which will be used when the input stream
262 * contains a non ASCII character. Must be valid ASCII !
263 *
264 * @return a new string that must be freed with CPLFree().
265 *
266 * @since GDAL 1.7.0
267 */
CPLForceToASCII(const char * pabyData,int nLen,char chReplacementChar)268 char CPL_DLL *CPLForceToASCII(const char* pabyData, int nLen, char chReplacementChar)
269 {
270 if (nLen < 0)
271 nLen = strlen(pabyData);
272 char* pszOutputString = (char*)CPLMalloc(nLen + 1);
273 int i;
274 for(i=0;i<nLen;i++)
275 {
276 if (((unsigned char*)pabyData)[i] > 127)
277 pszOutputString[i] = chReplacementChar;
278 else
279 pszOutputString[i] = pabyData[i];
280 }
281 pszOutputString[i] = '\0';
282 return pszOutputString;
283 }
284
285 /************************************************************************/
286 /* CPLEncodingCharSize() */
287 /************************************************************************/
288
289 /**
290 * Return bytes per character for encoding.
291 *
292 * This function returns the size in bytes of the smallest character
293 * in this encoding. For fixed width encodings (ASCII, UCS-2, UCS-4) this
294 * is straight forward. For encodings like UTF8 and UTF16 which represent
295 * some characters as a sequence of atomic character sizes the function
296 * still returns the atomic character size (1 for UTF8, 2 for UTF16).
297 *
298 * This function will return the correct value for well known encodings
299 * with corresponding CPL_ENC_ values. It may not return the correct value
300 * for other encodings even if they are supported by the underlying iconv
301 * or windows transliteration services. Hopefully it will improve over time.
302 *
303 * @param pszEncoding the name of the encoding.
304 *
305 * @return the size of a minimal character in bytes or -1 if the size is
306 * unknown.
307 */
308
CPLEncodingCharSize(const char * pszEncoding)309 int CPLEncodingCharSize( const char *pszEncoding )
310
311 {
312 if( EQUAL(pszEncoding,CPL_ENC_UTF8) )
313 return 1;
314 else if( EQUAL(pszEncoding,CPL_ENC_UTF16) )
315 return 2;
316 else if( EQUAL(pszEncoding,CPL_ENC_UCS2) )
317 return 2;
318 else if( EQUAL(pszEncoding,CPL_ENC_UCS4) )
319 return 4;
320 else if( EQUAL(pszEncoding,CPL_ENC_ASCII) )
321 return 1;
322 else if( EQUALN(pszEncoding,"ISO-8859-",9) )
323 return 1;
324 else
325 return -1;
326 }
327
328 /************************************************************************/
329 /* CPLClearRecodeWarningFlags() */
330 /************************************************************************/
331
CPLClearRecodeWarningFlags()332 void CPLClearRecodeWarningFlags()
333 {
334 #ifdef CPL_RECODE_ICONV
335 CPLClearRecodeIconvWarningFlags();
336 #endif
337 CPLClearRecodeStubWarningFlags();
338 }
339
340
341 /************************************************************************/
342 /* CPLStrlenUTF8() */
343 /************************************************************************/
344
/**
 * Count the UTF-8 characters of a nul-terminated string.
 *
 * Unlike strlen(), which returns the number of bytes, this counts every
 * byte except those whose two top bits are "10" (the UTF-8 continuation
 * bytes), so each multi-byte sequence is counted once.
 *
 * @param pszUTF8Str a nul-terminated UTF-8 string
 *
 * @return the number of UTF-8 characters.
 */

int CPLStrlenUTF8(const char *pszUTF8Str)
{
    int nCharCount = 0;
    const char *pszCursor;

    for( pszCursor = pszUTF8Str; *pszCursor != '\0'; ++pszCursor )
    {
        /* Skip continuation bytes (bit pattern 10xxxxxx). */
        if( (*pszCursor & 0xc0) != 0x80 )
            ++nCharCount;
    }

    return nCharCount;
}
363
364