1 /**********************************************************************
2  * $Id: cpl_recode.cpp 27044 2014-03-16 23:41:27Z rouault $
3  *
4  * Name:     cpl_recode.cpp
5  * Project:  CPL - Common Portability Library
6  * Purpose:  Character set recoding and char/wchar_t conversions.
7  * Author:   Andrey Kiselev, dron@ak4719.spb.edu
8  *
9  **********************************************************************
10  * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
11  * Copyright (c) 2008, Frank Warmerdam
12  * Copyright (c) 2011-2014, Even Rouault <even dot rouault at mines-paris dot org>
13  *
14  * Permission to use, copy, modify, and distribute this software for any
15  * purpose with or without fee is hereby granted, provided that the above
16  * copyright notice and this permission notice appear in all copies.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
19  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
20  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
21  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
22  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
23  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
24  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
25  **********************************************************************/
26 
27 #include "cpl_string.h"
28 
29 CPL_CVSID("$Id: cpl_recode.cpp 27044 2014-03-16 23:41:27Z rouault $");
30 
31 #ifdef CPL_RECODE_ICONV
32 extern void CPLClearRecodeIconvWarningFlags();
33 extern char *CPLRecodeIconv( const char *, const char *, const char * );
34 extern char *CPLRecodeFromWCharIconv( const wchar_t *,
35                                       const char *, const char * );
36 extern wchar_t *CPLRecodeToWCharIconv( const char *,
37                                        const char *, const char * );
38 #endif /* CPL_RECODE_ICONV */
39 
40 extern void CPLClearRecodeStubWarningFlags();
41 extern char *CPLRecodeStub( const char *, const char *, const char * );
42 extern char *CPLRecodeFromWCharStub( const wchar_t *,
43                                      const char *, const char * );
44 extern wchar_t *CPLRecodeToWCharStub( const char *,
45                                       const char *, const char * );
46 extern int CPLIsUTF8Stub( const char *, int );
47 
48 /************************************************************************/
49 /*                             CPLRecode()                              */
50 /************************************************************************/
51 
52 /**
53  * Convert a string from a source encoding to a destination encoding.
54  *
55  * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
56  * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
57  * <ul>
58  *  <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in fact)</li>
59  *  <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
60  *  <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
61  * </ul>
62  *
63  * If an error occurs an error may, or may not be posted with CPLError().
64  *
65  * @param pszSource a NULL terminated string.
66  * @param pszSrcEncoding the source encoding.
67  * @param pszDstEncoding the destination encoding.
68  *
69  * @return a NULL terminated string which should be freed with CPLFree().
70  *
71  * @since GDAL 1.6.0
72  */
73 
CPLRecode(const char * pszSource,const char * pszSrcEncoding,const char * pszDstEncoding)74 char CPL_DLL *CPLRecode( const char *pszSource,
75                          const char *pszSrcEncoding,
76                          const char *pszDstEncoding )
77 
78 {
79 /* -------------------------------------------------------------------- */
80 /*      Handle a few common short cuts.                                 */
81 /* -------------------------------------------------------------------- */
82     if ( EQUAL(pszSrcEncoding, pszDstEncoding) )
83         return CPLStrdup(pszSource);
84 
85     if ( EQUAL(pszSrcEncoding, CPL_ENC_ASCII)
86         && ( EQUAL(pszDstEncoding, CPL_ENC_UTF8)
87              || EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) )
88         return CPLStrdup(pszSource);
89 
90 #ifdef CPL_RECODE_ICONV
91 /* -------------------------------------------------------------------- */
92 /*      CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8                               */
93 /*      and CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1 conversions are hadled    */
94 /*      very well by the stub implementation which is faster than the   */
95 /*      iconv() route. Use a stub for these two ones and iconv()        */
96 /*      everything else.                                                */
97 /* -------------------------------------------------------------------- */
98     if ( ( EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1)
99            && EQUAL(pszDstEncoding, CPL_ENC_UTF8) )
100          || ( EQUAL(pszSrcEncoding, CPL_ENC_UTF8)
101               && EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) )
102     {
103         return CPLRecodeStub( pszSource, pszSrcEncoding, pszDstEncoding );
104     }
105     else
106     {
107         return CPLRecodeIconv( pszSource, pszSrcEncoding, pszDstEncoding );
108     }
109 #else /* CPL_RECODE_STUB */
110     return CPLRecodeStub( pszSource, pszSrcEncoding, pszDstEncoding );
111 #endif /* CPL_RECODE_ICONV */
112 }
113 
114 /************************************************************************/
115 /*                         CPLRecodeFromWChar()                         */
116 /************************************************************************/
117 
118 /**
119  * Convert wchar_t string to UTF-8.
120  *
121  * Convert a wchar_t string into a multibyte utf-8 string.  The only
122  * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
123  * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
124  * and CPL_ENC_ISO8859_1.  In some cases (ie. using iconv()) other encodings
125  * may also be supported.
126  *
127  * Note that the wchar_t type varies in size on different systems. On
128  * win32 it is normally 2 bytes, and on unix 4 bytes.
129  *
130  * If an error occurs an error may, or may not be posted with CPLError().
131  *
132  * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
133  * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
134  * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
135  *
136  * @return a zero terminated multi-byte string which should be freed with
137  * CPLFree(), or NULL if an error occurs.
138  *
139  * @since GDAL 1.6.0
140  */
141 
CPLRecodeFromWChar(const wchar_t * pwszSource,const char * pszSrcEncoding,const char * pszDstEncoding)142 char CPL_DLL *CPLRecodeFromWChar( const wchar_t *pwszSource,
143                                   const char *pszSrcEncoding,
144                                   const char *pszDstEncoding )
145 
146 {
147 #ifdef CPL_RECODE_ICONV
148 /* -------------------------------------------------------------------- */
149 /*      Conversions from CPL_ENC_UCS2                                   */
150 /*      to CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well   */
151 /*      handled by the stub implementation.                             */
152 /* -------------------------------------------------------------------- */
153     if ( (EQUAL(pszSrcEncoding, CPL_ENC_UCS2) || EQUAL(pszSrcEncoding, "WCHAR_T"))
154          && ( EQUAL(pszDstEncoding, CPL_ENC_UTF8)
155               || EQUAL(pszDstEncoding, CPL_ENC_ASCII)
156               || EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) )
157     {
158         return CPLRecodeFromWCharStub( pwszSource,
159                                        pszSrcEncoding, pszDstEncoding );
160     }
161     else
162     {
163         return CPLRecodeFromWCharIconv( pwszSource,
164                                         pszSrcEncoding, pszDstEncoding );
165     }
166 #else /* CPL_RECODE_STUB */
167     return CPLRecodeFromWCharStub( pwszSource,
168                                    pszSrcEncoding, pszDstEncoding );
169 #endif /* CPL_RECODE_ICONV */
170 }
171 
172 /************************************************************************/
173 /*                          CPLRecodeToWChar()                          */
174 /************************************************************************/
175 
176 /**
177  * Convert UTF-8 string to a wchar_t string.
178  *
179  * Convert a 8bit, multi-byte per character input string into a wide
180  * character (wchar_t) string.  The only guaranteed supported source encodings
181  * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8869_1 (LATIN1).  The only
182  * guaranteed supported destination encoding is CPL_ENC_UCS2.  Other source
183  * and destination encodings may be supported depending on the underlying
184  * implementation.
185  *
186  * Note that the wchar_t type varies in size on different systems. On
187  * win32 it is normally 2 bytes, and on unix 4 bytes.
188  *
189  * If an error occurs an error may, or may not be posted with CPLError().
190  *
191  * @param pszSource input multi-byte character string.
192  * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
193  * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
194  *
195  * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
196  * NULL on error.
197  *
198  * @since GDAL 1.6.0
199  */
200 
CPLRecodeToWChar(const char * pszSource,const char * pszSrcEncoding,const char * pszDstEncoding)201 wchar_t CPL_DLL *CPLRecodeToWChar( const char *pszSource,
202                                    const char *pszSrcEncoding,
203                                    const char *pszDstEncoding )
204 
205 {
206 #ifdef CPL_RECODE_ICONV
207 /* -------------------------------------------------------------------- */
208 /*      Conversions to CPL_ENC_UCS2                                     */
209 /*      from CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */
210 /*      handled by the stub implementation.                             */
211 /* -------------------------------------------------------------------- */
212     if ( (EQUAL(pszDstEncoding, CPL_ENC_UCS2) || EQUAL(pszDstEncoding, "WCHAR_T"))
213          && ( EQUAL(pszSrcEncoding, CPL_ENC_UTF8)
214               || EQUAL(pszSrcEncoding, CPL_ENC_ASCII)
215               || EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1) ) )
216     {
217         return CPLRecodeToWCharStub( pszSource,
218                                      pszSrcEncoding, pszDstEncoding );
219     }
220     else
221     {
222         return CPLRecodeToWCharIconv( pszSource,
223                                       pszSrcEncoding, pszDstEncoding );
224     }
225 #else /* CPL_RECODE_STUB */
226     return CPLRecodeToWCharStub( pszSource, pszSrcEncoding, pszDstEncoding );
227 #endif /* CPL_RECODE_ICONV */
228 }
229 
230 /************************************************************************/
231 /*                                 CPLIsUTF8()                          */
232 /************************************************************************/
233 
234 /**
235  * Test if a string is encoded as UTF-8.
236  *
237  * @param pabyData input string to test
238  * @param nLen length of the input string, or -1 if the function must compute
239  *             the string length. In which case it must be null terminated.
240  * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
241  *
242  * @since GDAL 1.7.0
243  */
CPLIsUTF8(const char * pabyData,int nLen)244 int CPLIsUTF8(const char* pabyData, int nLen)
245 {
246     return CPLIsUTF8Stub( pabyData, nLen );
247 }
248 
249 /************************************************************************/
250 /*                          CPLForceToASCII()                           */
251 /************************************************************************/
252 
253 /**
254  * Return a new string that is made only of ASCII characters. If non-ASCII
255  * characters are found in the input string, they will be replaced by the
256  * provided replacement character.
257  *
258  * @param pabyData input string to test
259  * @param nLen length of the input string, or -1 if the function must compute
260  *             the string length. In which case it must be null terminated.
261  * @param chReplacementChar character which will be used when the input stream
262  *                          contains a non ASCII character. Must be valid ASCII !
263  *
264  * @return a new string that must be freed with CPLFree().
265  *
266  * @since GDAL 1.7.0
267  */
CPLForceToASCII(const char * pabyData,int nLen,char chReplacementChar)268 char CPL_DLL *CPLForceToASCII(const char* pabyData, int nLen, char chReplacementChar)
269 {
270     if (nLen < 0)
271         nLen = strlen(pabyData);
272     char* pszOutputString = (char*)CPLMalloc(nLen + 1);
273     int i;
274     for(i=0;i<nLen;i++)
275     {
276         if (((unsigned char*)pabyData)[i] > 127)
277             pszOutputString[i] = chReplacementChar;
278         else
279             pszOutputString[i] = pabyData[i];
280     }
281     pszOutputString[i] = '\0';
282     return pszOutputString;
283 }
284 
285 /************************************************************************/
286 /*                        CPLEncodingCharSize()                         */
287 /************************************************************************/
288 
289 /**
290  * Return bytes per character for encoding.
291  *
292  * This function returns the size in bytes of the smallest character
293  * in this encoding.  For fixed width encodings (ASCII, UCS-2, UCS-4) this
294  * is straight forward.  For encodings like UTF8 and UTF16 which represent
295  * some characters as a sequence of atomic character sizes the function
296  * still returns the atomic character size (1 for UTF8, 2 for UTF16).
297  *
298  * This function will return the correct value for well known encodings
299  * with corresponding CPL_ENC_ values.  It may not return the correct value
300  * for other encodings even if they are supported by the underlying iconv
301  * or windows transliteration services.  Hopefully it will improve over time.
302  *
303  * @param pszEncoding the name of the encoding.
304  *
305  * @return the size of a minimal character in bytes or -1 if the size is
306  * unknown.
307  */
308 
CPLEncodingCharSize(const char * pszEncoding)309 int CPLEncodingCharSize( const char *pszEncoding )
310 
311 {
312     if( EQUAL(pszEncoding,CPL_ENC_UTF8) )
313         return 1;
314     else if( EQUAL(pszEncoding,CPL_ENC_UTF16) )
315         return 2;
316     else if( EQUAL(pszEncoding,CPL_ENC_UCS2) )
317         return 2;
318     else if( EQUAL(pszEncoding,CPL_ENC_UCS4) )
319         return 4;
320     else if( EQUAL(pszEncoding,CPL_ENC_ASCII) )
321         return 1;
322     else if( EQUALN(pszEncoding,"ISO-8859-",9) )
323         return 1;
324     else
325         return -1;
326 }
327 
328 /************************************************************************/
329 /*                    CPLClearRecodeWarningFlags()                      */
330 /************************************************************************/
331 
CPLClearRecodeWarningFlags()332 void CPLClearRecodeWarningFlags()
333 {
334 #ifdef CPL_RECODE_ICONV
335     CPLClearRecodeIconvWarningFlags();
336 #endif
337     CPLClearRecodeStubWarningFlags();
338 }
339 
340 
341 /************************************************************************/
342 /*                         CPLStrlenUTF8()                              */
343 /************************************************************************/
344 
/**
 * Return the number of UTF-8 characters of a nul-terminated string.
 *
 * Unlike strlen(), which returns the number of bytes, this counts only the
 * bytes that are not UTF-8 continuation bytes (bit pattern 10xxxxxx), i.e.
 * one per encoded character.
 *
 * @param pszUTF8Str a nul-terminated UTF-8 string
 *
 * @return the number of UTF-8 characters.
 */

int CPLStrlenUTF8(const char *pszUTF8Str) {
    int nCharacters = 0;
    const char *pszIter;

    for (pszIter = pszUTF8Str; *pszIter != '\0'; ++pszIter) {
        /* Continuation bytes have their two top bits set to "10". */
        if ((*pszIter & 0xc0) == 0x80)
            continue;
        nCharacters++;
    }
    return nCharacters;
}
363 
364