1 /**********************************************************************
2  *
3  * Name:     cpl_recode_stub.cpp
4  * Project:  CPL - Common Portability Library
5  * Purpose:  Character set recoding and char/wchar_t conversions, stub
6  *           implementation to be used if iconv() functionality is not
7  *           available.
8  * Author:   Frank Warmerdam, warmerdam@pobox.com
9  *
10  * The bulk of this code is derived from the utf.c module from FLTK. It
11  * was originally downloaded from:
12  *    http://svn.easysw.com/public/fltk/fltk/trunk/src/utf.c
13  *
14  **********************************************************************
15  * Copyright (c) 2008, Frank Warmerdam
16  * Copyright 2006 by Bill Spitzak and others.
17  * Copyright (c) 2009-2014, Even Rouault <even dot rouault at spatialys.com>
18  *
19  * Permission to use, copy, modify, and distribute this software for any
20  * purpose with or without fee is hereby granted, provided that the above
21  * copyright notice and this permission notice appear in all copies.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
24  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
25  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
26  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
27  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
28  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
29  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
30  **********************************************************************/
31 
32 #include "cpl_port.h"
33 #include "cpl_string.h"
34 
35 #include <cstring>
36 
37 #include "cpl_conv.h"
38 #include "cpl_error.h"
39 
40 CPL_CVSID("$Id: cpl_recode_stub.cpp b1c9c12ad373e40b955162b45d704070d4ebf7b0 2019-06-19 16:50:15 +0200 Even Rouault $")
41 
42 #ifdef CPL_RECODE_STUB
43 
44 static unsigned utf8decode(const char* p, const char* end, int* len);
45 static unsigned utf8towc(const char* src, unsigned srclen,
46                          wchar_t* dst, unsigned dstlen);
47 static unsigned utf8toa(const char* src, unsigned srclen,
48                         char* dst, unsigned dstlen);
49 static unsigned utf8fromwc(char* dst, unsigned dstlen,
50                            const wchar_t* src, unsigned srclen);
51 static unsigned utf8froma(char* dst, unsigned dstlen,
52                           const char* src, unsigned srclen);
53 static int utf8test(const char* src, unsigned srclen);
54 
55 #ifdef _WIN32
56 
57 #include <windows.h>
58 #include <winnls.h>
59 
60 static char* CPLWin32Recode( const char* src, unsigned src_code_page,
61                              unsigned dst_code_page )
62     CPL_RETURNS_NONNULL;
63 #endif
64 
65 /* used by cpl_recode.cpp */
66 extern void CPLClearRecodeStubWarningFlags();
67 extern char *CPLRecodeStub( const char *, const char *, const char * )
68     CPL_RETURNS_NONNULL;
69 extern char *CPLRecodeFromWCharStub( const wchar_t *,
70                                      const char *, const char * );
71 extern wchar_t *CPLRecodeToWCharStub( const char *,
72                                       const char *, const char * );
73 extern int CPLIsUTF8Stub( const char *, int );
74 
75 /************************************************************************/
76 /* ==================================================================== */
77 /*      Stub Implementation not depending on iconv() or WIN32 API.      */
78 /* ==================================================================== */
79 /************************************************************************/
80 
81 static bool bHaveWarned1 = false;
82 static bool bHaveWarned2 = false;
83 static bool bHaveWarned3 = false;
84 static bool bHaveWarned4 = false;
85 static bool bHaveWarned5 = false;
86 static bool bHaveWarned6 = false;
87 
88 /************************************************************************/
89 /*                 CPLClearRecodeStubWarningFlags()                     */
90 /************************************************************************/
91 
CPLClearRecodeStubWarningFlags()92 void CPLClearRecodeStubWarningFlags()
93 {
94     bHaveWarned1 = false;
95     bHaveWarned2 = false;
96     bHaveWarned3 = false;
97     bHaveWarned4 = false;
98     bHaveWarned5 = false;
99     bHaveWarned6 = false;
100 }
101 
102 /************************************************************************/
103 /*                           CPLRecodeStub()                            */
104 /************************************************************************/
105 
106 /**
107  * Convert a string from a source encoding to a destination encoding.
108  *
109  * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
110  * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
111  * <ul>
112  *  <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in
113  *  fact)</li>
114  *  <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
115  *  <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
116  * </ul>
117  *
118  * If an error occurs an error may, or may not be posted with CPLError().
119  *
120  * @param pszSource a NULL terminated string.
121  * @param pszSrcEncoding the source encoding.
122  * @param pszDstEncoding the destination encoding.
123  *
124  * @return a NULL terminated string which should be freed with CPLFree().
125  */
126 
CPLRecodeStub(const char * pszSource,const char * pszSrcEncoding,const char * pszDstEncoding)127 char *CPLRecodeStub( const char *pszSource,
128                      const char *pszSrcEncoding,
129                      const char *pszDstEncoding )
130 
131 {
132 /* -------------------------------------------------------------------- */
133 /*      If the source or destination is current locale(), we change     */
134 /*      it to ISO8859-1 since our stub implementation does not          */
135 /*      attempt to address locales properly.                            */
136 /* -------------------------------------------------------------------- */
137 
138     if( pszSrcEncoding[0] == '\0' )
139         pszSrcEncoding = CPL_ENC_ISO8859_1;
140 
141     if( pszDstEncoding[0] == '\0' )
142         pszDstEncoding = CPL_ENC_ISO8859_1;
143 
144 /* -------------------------------------------------------------------- */
145 /*      ISO8859 to UTF8                                                 */
146 /* -------------------------------------------------------------------- */
147     if( strcmp(pszSrcEncoding, CPL_ENC_ISO8859_1) == 0
148         && strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0 )
149     {
150         const int nCharCount = static_cast<int>(strlen(pszSource));
151         char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
152 
153         utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount);
154 
155         return pszResult;
156     }
157 
158 /* -------------------------------------------------------------------- */
159 /*      UTF8 to ISO8859                                                 */
160 /* -------------------------------------------------------------------- */
161     if( strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0
162         && strcmp(pszDstEncoding, CPL_ENC_ISO8859_1) == 0 )
163     {
164         int nCharCount = static_cast<int>(strlen(pszSource));
165         char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount + 1));
166 
167         utf8toa(pszSource, nCharCount, pszResult, nCharCount + 1);
168 
169         return pszResult;
170     }
171 
172 #ifdef _WIN32
173 /* ---------------------------------------------------------------------*/
174 /*      CPXXX to UTF8                                                   */
175 /* ---------------------------------------------------------------------*/
176     if( STARTS_WITH(pszSrcEncoding, "CP")
177         && strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0 )
178     {
179         int nCode = atoi( pszSrcEncoding + 2 );
180         if( nCode > 0 ) {
181            return CPLWin32Recode( pszSource, nCode, CP_UTF8 );
182         }
183         else if( EQUAL(pszSrcEncoding, "CP_OEMCP") )
184             return CPLWin32Recode( pszSource, CP_OEMCP, CP_UTF8 );
185         else if( EQUAL(pszSrcEncoding, "CP_ACP") )
186             return CPLWin32Recode( pszSource, CP_ACP, CP_UTF8 );
187     }
188 
189 /* ---------------------------------------------------------------------*/
190 /*      UTF8 to CPXXX                                                   */
191 /* ---------------------------------------------------------------------*/
192     if( strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0
193         && STARTS_WITH(pszDstEncoding, "CP") )
194     {
195          int nCode = atoi( pszDstEncoding + 2 );
196          if( nCode > 0 ) {
197              return CPLWin32Recode( pszSource, CP_UTF8, nCode );
198          }
199          else if( EQUAL(pszDstEncoding, "CP_OEMCP") )
200             return CPLWin32Recode( pszSource, CP_UTF8, CP_OEMCP );
201          else if( EQUAL(pszDstEncoding, "CP_ACP") )
202             return CPLWin32Recode( pszSource, CP_UTF8, CP_ACP );
203     }
204 #endif
205 
206 /* -------------------------------------------------------------------- */
207 /*      Anything else to UTF-8 is treated as ISO8859-1 to UTF-8 with    */
208 /*      a one-time warning.                                             */
209 /* -------------------------------------------------------------------- */
210     if( strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0 )
211     {
212         int nCharCount = static_cast<int>(strlen(pszSource));
213         char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
214 
215         if( EQUAL( pszSrcEncoding, "CP437") ) // For ZIP file handling.
216         {
217             bool bIsAllPrintableASCII = true;
218             for( int i = 0; i <nCharCount; i++ )
219             {
220                 if( pszSource[i] < 32 || pszSource[i] > 126 )
221                 {
222                     bIsAllPrintableASCII = false;
223                     break;
224                 }
225             }
226             if( bIsAllPrintableASCII )
227             {
228                 if( nCharCount )
229                     memcpy( pszResult, pszSource, nCharCount );
230                 return pszResult;
231             }
232         }
233 
234         if( !bHaveWarned1 )
235         {
236             bHaveWarned1 = true;
237             CPLError( CE_Warning, CPLE_AppDefined,
238                       "Recode from %s to UTF-8 not supported, "
239                       "treated as ISO-8859-1 to UTF-8.",
240                       pszSrcEncoding );
241         }
242 
243         utf8froma( pszResult, nCharCount*2+1, pszSource, nCharCount );
244 
245         return pszResult;
246     }
247 
248 /* -------------------------------------------------------------------- */
249 /*      UTF-8 to anything else is treated as UTF-8 to ISO-8859-1        */
250 /*      with a warning.                                                 */
251 /* -------------------------------------------------------------------- */
252     if( strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0
253         && strcmp(pszDstEncoding, CPL_ENC_ISO8859_1) == 0 )
254     {
255         int nCharCount = static_cast<int>(strlen(pszSource));
256         char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount + 1));
257 
258         if( !bHaveWarned2 )
259         {
260             bHaveWarned2 = true;
261             CPLError( CE_Warning, CPLE_AppDefined,
262                       "Recode from UTF-8 to %s not supported, "
263                       "treated as UTF-8 to ISO-8859-1.",
264                       pszDstEncoding );
265         }
266 
267         utf8toa(pszSource, nCharCount, pszResult, nCharCount + 1);
268 
269         return pszResult;
270     }
271 
272 /* -------------------------------------------------------------------- */
273 /*      Everything else is treated as a no-op with a warning.           */
274 /* -------------------------------------------------------------------- */
275     {
276         if( !bHaveWarned3 )
277         {
278             bHaveWarned3 = true;
279             CPLError( CE_Warning, CPLE_AppDefined,
280                       "Recode from %s to %s not supported, no change applied.",
281                       pszSrcEncoding, pszDstEncoding );
282         }
283 
284         return CPLStrdup(pszSource);
285     }
286 }
287 
288 /************************************************************************/
289 /*                       CPLRecodeFromWCharStub()                       */
290 /************************************************************************/
291 
292 /**
293  * Convert wchar_t string to UTF-8.
294  *
295  * Convert a wchar_t string into a multibyte utf-8 string.  The only
296  * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
297  * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
298  * and CPL_ENC_ISO8859_1.  In some cases (i.e. using iconv()) other encodings
299  * may also be supported.
300  *
301  * Note that the wchar_t type varies in size on different systems. On
302  * win32 it is normally 2 bytes, and on unix 4 bytes.
303  *
304  * If an error occurs an error may, or may not be posted with CPLError().
305  *
306  * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
307  * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
308  * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
309  *
310  * @return a zero terminated multi-byte string which should be freed with
311  * CPLFree(), or NULL if an error occurs.
312  */
313 
CPLRecodeFromWCharStub(const wchar_t * pwszSource,const char * pszSrcEncoding,const char * pszDstEncoding)314 char *CPLRecodeFromWCharStub( const wchar_t *pwszSource,
315                               const char *pszSrcEncoding,
316                               const char *pszDstEncoding )
317 
318 {
319 /* -------------------------------------------------------------------- */
320 /*      We try to avoid changes of character set.  We are just          */
321 /*      providing for unicode to unicode.                               */
322 /* -------------------------------------------------------------------- */
323     if( strcmp(pszSrcEncoding, "WCHAR_T") != 0 &&
324         strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0
325         && strcmp(pszSrcEncoding, CPL_ENC_UTF16) != 0
326         && strcmp(pszSrcEncoding, CPL_ENC_UCS2) != 0
327         && strcmp(pszSrcEncoding, CPL_ENC_UCS4) != 0 )
328     {
329         CPLError( CE_Failure, CPLE_AppDefined,
330                   "Stub recoding implementation does not support "
331                   "CPLRecodeFromWCharStub(...,%s,%s)",
332                   pszSrcEncoding, pszDstEncoding );
333         return nullptr;
334     }
335 
336 /* -------------------------------------------------------------------- */
337 /*      What is the source length.                                      */
338 /* -------------------------------------------------------------------- */
339     int nSrcLen = 0;
340 
341     while( pwszSource[nSrcLen] != 0 )
342         nSrcLen++;
343 
344 /* -------------------------------------------------------------------- */
345 /*      Allocate destination buffer plenty big.                         */
346 /* -------------------------------------------------------------------- */
347     const int nDstBufSize = nSrcLen * 4 + 1;
348     // Nearly worst case.
349     char *pszResult = static_cast<char *>(CPLMalloc(nDstBufSize));
350 
351     if( nSrcLen == 0 )
352     {
353         pszResult[0] = '\0';
354         return pszResult;
355     }
356 
357 /* -------------------------------------------------------------------- */
358 /*      Convert, and confirm we had enough space.                       */
359 /* -------------------------------------------------------------------- */
360     const int nDstLen =
361         utf8fromwc( pszResult, nDstBufSize, pwszSource, nSrcLen );
362     if( nDstLen >= nDstBufSize )
363     {
364         CPLAssert( false ); // too small!
365         return nullptr;
366     }
367 
368 /* -------------------------------------------------------------------- */
369 /*      If something other than UTF-8 was requested, recode now.        */
370 /* -------------------------------------------------------------------- */
371     if( strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0 )
372         return pszResult;
373 
374     char *pszFinalResult =
375         CPLRecodeStub( pszResult, CPL_ENC_UTF8, pszDstEncoding );
376 
377     CPLFree( pszResult );
378 
379     return pszFinalResult;
380 }
381 
382 /************************************************************************/
383 /*                        CPLRecodeToWCharStub()                        */
384 /************************************************************************/
385 
386 /**
387  * Convert UTF-8 string to a wchar_t string.
388  *
389  * Convert a 8bit, multi-byte per character input string into a wide
390  * character (wchar_t) string.  The only guaranteed supported source encodings
391  * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 (LATIN1).  The only
392  * guaranteed supported destination encoding is CPL_ENC_UCS2.  Other source
393  * and destination encodings may be supported depending on the underlying
394  * implementation.
395  *
396  * Note that the wchar_t type varies in size on different systems. On
397  * win32 it is normally 2 bytes, and on unix 4 bytes.
398  *
399  * If an error occurs an error may, or may not be posted with CPLError().
400  *
401  * @param pszSource input multi-byte character string.
402  * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
403  * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
404  *
405  * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
406  * NULL on error.
407  *
408  * @since GDAL 1.6.0
409  */
410 
CPLRecodeToWCharStub(const char * pszSource,const char * pszSrcEncoding,const char * pszDstEncoding)411 wchar_t *CPLRecodeToWCharStub( const char *pszSource,
412                                const char *pszSrcEncoding,
413                                const char *pszDstEncoding )
414 
415 {
416     char *pszUTF8Source = const_cast<char *>(pszSource);
417 
418     if( strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0
419         && strcmp(pszSrcEncoding, CPL_ENC_ASCII) != 0 )
420     {
421         pszUTF8Source =
422             CPLRecodeStub(pszSource, pszSrcEncoding, CPL_ENC_UTF8);
423         if( pszUTF8Source == nullptr )
424             return nullptr;
425     }
426 
427 /* -------------------------------------------------------------------- */
428 /*      We try to avoid changes of character set.  We are just          */
429 /*      providing for unicode to unicode.                               */
430 /* -------------------------------------------------------------------- */
431     if( strcmp(pszDstEncoding, "WCHAR_T") != 0
432         && strcmp(pszDstEncoding, CPL_ENC_UCS2) != 0
433         && strcmp(pszDstEncoding, CPL_ENC_UCS4) != 0
434         && strcmp(pszDstEncoding, CPL_ENC_UTF16) != 0 )
435     {
436         CPLError( CE_Failure, CPLE_AppDefined,
437                   "Stub recoding implementation does not support "
438                   "CPLRecodeToWCharStub(...,%s,%s)",
439                   pszSrcEncoding, pszDstEncoding );
440         if( pszUTF8Source != pszSource )
441             CPLFree( pszUTF8Source );
442         return nullptr;
443     }
444 
445 /* -------------------------------------------------------------------- */
446 /*      Do the UTF-8 to UCS-2 recoding.                                 */
447 /* -------------------------------------------------------------------- */
448     int nSrcLen = static_cast<int>(strlen(pszUTF8Source));
449     wchar_t *pwszResult =
450         static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), nSrcLen + 1));
451 
452     utf8towc( pszUTF8Source, nSrcLen, pwszResult, nSrcLen+1 );
453 
454     if( pszUTF8Source != pszSource )
455         CPLFree( pszUTF8Source );
456 
457     return pwszResult;
458 }
459 
460 /************************************************************************/
461 /*                           CPLIsUTF8Stub()                            */
462 /************************************************************************/
463 
464 /**
465  * Test if a string is encoded as UTF-8.
466  *
467  * @param pabyData input string to test
468  * @param nLen length of the input string, or -1 if the function must compute
469  *             the string length. In which case it must be null terminated.
470  * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
471  *
472  * @since GDAL 1.7.0
473  */
CPLIsUTF8Stub(const char * pabyData,int nLen)474 int CPLIsUTF8Stub(const char* pabyData, int nLen)
475 {
476     if( nLen < 0 )
477         nLen = static_cast<int>(strlen(pabyData));
478     return utf8test(pabyData, static_cast<unsigned>(nLen)) != 0;
479 }
480 
481 /************************************************************************/
482 /* ==================================================================== */
483 /*      UTF.C code from FLTK with some modifications.                   */
484 /* ==================================================================== */
485 /************************************************************************/
486 
487 /* Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
488    they are instead turned into the Unicode REPLACEMENT CHARACTER, of
489    value 0xfffd.
490    If this is on utf8decode will correctly map most (perhaps all)
491    human-readable text that is in ISO-8859-1. This may allow you
492    to completely ignore character sets in your code because virtually
493    everything is either ISO-8859-1 or UTF-8.
494 */
495 #define ERRORS_TO_ISO8859_1 1
496 
497 /* Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
498    Unicode index for Microsoft's CP1252 character set. You should
499    also set ERRORS_TO_ISO8859_1. With this a huge amount of more
500    available text (such as all web pages) are correctly converted
501    to Unicode.
502 */
503 #define ERRORS_TO_CP1252 1
504 
505 /* A number of Unicode code points are in fact illegal and should not
506    be produced by a UTF-8 converter. Turning this on will replace the
507    bytes in those encodings with errors. If you do this then converting
508    arbitrary 16-bit data to UTF-8 and then back is not an identity,
509    which will probably break a lot of software.
510 */
511 #define STRICT_RFC3629 0
512 
513 #if ERRORS_TO_CP1252
514 // Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
515 // to Unicode:
constexpr unsigned short cp1252[32] = {
    /* 0x80 */ 0x20ac, 0x0081, 0x201a, 0x0192,
    /* 0x84 */ 0x201e, 0x2026, 0x2020, 0x2021,
    /* 0x88 */ 0x02c6, 0x2030, 0x0160, 0x2039,
    /* 0x8c */ 0x0152, 0x008d, 0x017d, 0x008f,
    /* 0x90 */ 0x0090, 0x2018, 0x2019, 0x201c,
    /* 0x94 */ 0x201d, 0x2022, 0x2013, 0x2014,
    /* 0x98 */ 0x02dc, 0x2122, 0x0161, 0x203a,
    /* 0x9c */ 0x0153, 0x009d, 0x017e, 0x0178
};
522 #endif
523 
524 /************************************************************************/
525 /*                             utf8decode()                             */
526 /************************************************************************/
527 
528 /*
529     Decode a single UTF-8 encoded character starting at \e p. The
530     resulting Unicode value (in the range 0-0x10ffff) is returned,
531     and \e len is set to the number of bytes in the UTF-8 encoding
532     (adding \e len to \e p will point at the next character).
533 
534     If \a p points at an illegal UTF-8 encoding, including one that
535     would go past \e end, or where a code uses more bytes than
536     necessary, then *reinterpret_cast<const unsigned char*>(p) is translated as though it is
537     in the Microsoft CP1252 character set and \e len is set to 1.
538     Treating errors this way allows this to decode almost any
539     ISO-8859-1 or CP1252 text that has been mistakenly placed where
540     UTF-8 is expected, and has proven very useful.
541 
542     If you want errors to be converted to error characters (as the
543     standards recommend), adding a test to see if the length is
544     unexpectedly 1 will work:
545 
546 \code
547     if( *p & 0x80 )
548     {  // What should be a multibyte encoding.
549       code = utf8decode(p, end, &len);
550       if( len<2 ) code = 0xFFFD;  // Turn errors into REPLACEMENT CHARACTER.
551     }
552     else
553     {  // Handle the 1-byte utf8 encoding:
554       code = *p;
555       len = 1;
556     }
557 \endcode
558 
559     Direct testing for the 1-byte case (as shown above) will also
560     speed up the scanning of strings where the majority of characters
561     are ASCII.
562 */
static unsigned utf8decode(const char* p, const char* end, int* len)
{
  // Look at the input as unsigned bytes so that range tests and masks do
  // not depend on whether plain char is signed.
  const unsigned char* const bytes = reinterpret_cast<const unsigned char*>(p);
  const unsigned char lead = bytes[0];

  // Result for any malformed sequence (one byte is consumed in that case).
#if ERRORS_TO_ISO8859_1
  const unsigned invalid = lead;
#else
  const unsigned invalid = 0xfffd; // Unicode REPLACEMENT CHARACTER
#endif

  // Every early exit below consumes exactly one byte; a successful
  // multi-byte decode overwrites this right before returning.
  *len = 1;

  // Single byte (ASCII).
  if( lead < 0x80 )
    return lead;

#if ERRORS_TO_CP1252
  // 0x80..0x9f cannot start a sequence: map through the CP1252 table.
  if( lead < 0xa0 )
    return cp1252[lead - 0x80];
#endif

  // Lead bytes below 0xc2 or above 0xf4 never start an accepted sequence.
  if( lead < 0xc2 || lead > 0xf4 )
    return invalid;

  // All remaining forms need a first continuation byte (10xxxxxx).
  if( p+1 >= end || (bytes[1] & 0xc0) != 0x80 )
    return invalid;

  // Two byte form: 0xc2..0xdf.
  if( lead < 0xe0 )
  {
    *len = 2;
    return ((lead & 0x1f) << 6) + (bytes[1] & 0x3f);
  }

  // Three byte form: 0xe0..0xef.
  if( lead < 0xf0 )
  {
    // After 0xe0 a second byte below 0xa0 would be an over-long encoding.
    if( lead == 0xe0 && bytes[1] < 0xa0 )
      return invalid;
#if STRICT_RFC3629
    // RFC 3629 says surrogate chars are illegal.
    if( lead == 0xed && bytes[1] >= 0xa0 )
      return invalid;
#endif
    if( p+2 >= end || (bytes[2] & 0xc0) != 0x80 )
      return invalid;
#if STRICT_RFC3629
    // 0xfffe and 0xffff are also illegal characters.
    if( lead == 0xef && bytes[1] == 0xbf && bytes[2] >= 0xbe )
      return invalid;
#endif
    *len = 3;
    return
      ((lead & 0x0f) << 12) +
      ((bytes[1] & 0x3f) << 6) +
      (bytes[2] & 0x3f);
  }

  // Four byte form: 0xf0..0xf4.
  // After 0xf0 a second byte below 0x90 would be an over-long encoding.
  if( lead == 0xf0 && bytes[1] < 0x90 )
    return invalid;
  // After 0xf4 a second byte above 0x8f would be beyond 0x10ffff.
  if( lead == 0xf4 && bytes[1] > 0x8f )
    return invalid;
  if( p+3 >= end ||
      (bytes[2] & 0xc0) != 0x80 ||
      (bytes[3] & 0xc0) != 0x80 )
    return invalid;
#if STRICT_RFC3629
  // RFC 3629 says all codes ending in fffe or ffff are illegal:
  if( (bytes[1] & 0xf) == 0xf && bytes[2] == 0xbf && bytes[3] >= 0xbe )
    return invalid;
#endif
  *len = 4;
  return
    ((lead & 0x07) << 18) +
    ((bytes[1] & 0x3f) << 12) +
    ((bytes[2] & 0x3f) << 6) +
    (bytes[3] & 0x3f);
}
658 
659 /************************************************************************/
660 /*                              utf8towc()                              */
661 /************************************************************************/
662 
663 /*  Convert a UTF-8 sequence into an array of wchar_t. These
664     are used by some system calls, especially on Windows.
665 
666     \a src points at the UTF-8, and \a srclen is the number of bytes to
667     convert.
668 
669     \a dst points at an array to write, and \a dstlen is the number of
670     locations in this array. At most \a dstlen-1 words will be
671     written there, plus a 0 terminating word. Thus this function
672     will never overwrite the buffer and will always return a
673     zero-terminated string. If \a dstlen is zero then \a dst can be
674     null and no data is written, but the length is returned.
675 
676     The return value is the number of words that \e would be written
677     to \a dst if it were long enough, not counting the terminating
678     zero. If the return value is greater or equal to \a dstlen it
679     indicates truncation, you can then allocate a new array of size
680     return+1 and call this again.
681 
682     Errors in the UTF-8 are converted as though each byte in the
683     erroneous string is in the Microsoft CP1252 encoding. This allows
684     ISO-8859-1 text mistakenly identified as UTF-8 to be printed
685     correctly.
686 
687     Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
688     and most other systems. Where wchar_t is 16 bits, Unicode
689     characters in the range 0x10000 to 0x10ffff are converted to
690     "surrogate pairs" which take two words each (this is called UTF-16
691     encoding). If wchar_t is 32 bits this rather nasty problem is
692     avoided.
693 */
utf8towc(const char * src,unsigned srclen,wchar_t * dst,unsigned dstlen)694 static unsigned utf8towc(const char* src, unsigned srclen,
695                          wchar_t* dst, unsigned dstlen)
696 {
697   const char* p = src;
698   const char* e = src+srclen;
699   unsigned count = 0;
700   if( dstlen ) while( true )
701   {
702     if( p >= e )
703     {
704         dst[count] = 0;
705         return count;
706     }
707     if( !(*p & 0x80) )
708     {
709         // ASCII
710         dst[count] = *p++;
711     }
712     else
713     {
714       int len = 0;
715       unsigned ucs = utf8decode(p, e, &len);
716       p += len;
717 #ifdef _WIN32
718       if( ucs < 0x10000 )
719       {
720           dst[count] = static_cast<wchar_t>(ucs);
721       }
722       else
723       {
724         // Make a surrogate pair:
725         if( count+2 >= dstlen)
726         {
727             dst[count] = 0;
728             count += 2;
729             break;
730         }
731         dst[count] = static_cast<wchar_t>((((ucs-0x10000u)>>10)&0x3ff) | 0xd800);
732         dst[++count] = static_cast<wchar_t>((ucs&0x3ff) | 0xdc00);
733       }
734 #else
735       dst[count] = static_cast<wchar_t>(ucs);
736 #endif
737     }
738     if( ++count == dstlen )
739     {
740         dst[count-1] = 0;
741         break;
742     }
743   }
744   // We filled dst, measure the rest:
745   while( p < e )
746   {
747     if( !(*p & 0x80) )
748     {
749         p++;
750     }
751     else
752     {
753       int len = 0;
754 #ifdef _WIN32
755       const unsigned ucs = utf8decode(p, e, &len);
756       p += len;
757       if( ucs >= 0x10000 ) ++count;
758 #else
759       utf8decode(p, e, &len);
760       p += len;
761 #endif
762     }
763     ++count;
764   }
765 
766   return count;
767 }
768 
769 /************************************************************************/
770 /*                              utf8toa()                               */
771 /************************************************************************/
772 /* Convert a UTF-8 sequence into an array of 1-byte characters.
773 
774     If the UTF-8 decodes to a character greater than 0xff then it is
775     replaced with '?'.
776 
777     Errors in the UTF-8 are converted as individual bytes, same as
778     utf8decode() does. This allows ISO-8859-1 text mistakenly identified
779     as UTF-8 to be printed correctly (and possibly CP1252 on Windows).
780 
781     \a src points at the UTF-8, and \a srclen is the number of bytes to
782     convert.
783 
784     Up to \a dstlen bytes are written to \a dst, including a null
785     terminator. The return value is the number of bytes that would be
786     written, not counting the null terminator. If greater or equal to
787     \a dstlen then if you malloc a new array of size n+1 you will have
788     the space needed for the entire string. If \a dstlen is zero then
789     nothing is written and this call just measures the storage space
790     needed.
791 */
utf8toa(const char * src,unsigned srclen,char * dst,unsigned dstlen)792 static unsigned int utf8toa( const char* src, unsigned srclen,
793                              char* dst, unsigned dstlen )
794 {
795   const char* p = src;
796   const char* e = src+srclen;
797   unsigned int count = 0;
798   if( dstlen ) while( true )
799   {
800     if( p >= e )
801     {
802         dst[count] = 0;
803         return count;
804     }
805     unsigned char c = *reinterpret_cast<const unsigned char*>(p);
806     if( c < 0xC2 )
807     {
808         // ASCII or bad code.
809         dst[count] = c;
810         p++;
811     }
812     else
813     {
814         int len = 0;
815         const unsigned int ucs = utf8decode(p, e, &len);
816         p += len;
817         if( ucs < 0x100 )
818         {
819             dst[count] = static_cast<char>(ucs);
820         }
821         else
822         {
823             if( !bHaveWarned4 )
824             {
825                 bHaveWarned4 = true;
826                 CPLError(CE_Warning, CPLE_AppDefined,
827                          "One or several characters couldn't be converted "
828                          "correctly from UTF-8 to ISO-8859-1.  "
829                          "This warning will not be emitted anymore.");
830             }
831             dst[count] = '?';
832       }
833     }
834     if( ++count >= dstlen )
835     {
836         dst[count-1] = 0;
837         break;
838     }
839   }
840   // We filled dst, measure the rest:
841   while( p < e )
842   {
843     if( !(*p & 0x80) )
844     {
845         p++;
846     }
847     else
848     {
849         int len = 0;
850         utf8decode(p, e, &len);
851         p += len;
852     }
853     ++count;
854   }
855   return count;
856 }
857 
858 /************************************************************************/
859 /*                             utf8fromwc()                             */
860 /************************************************************************/
861 /* Turn "wide characters" as returned by some system calls
862     (especially on Windows) into UTF-8.
863 
864     Up to \a dstlen bytes are written to \a dst, including a null
865     terminator. The return value is the number of bytes that would be
866     written, not counting the null terminator. If greater or equal to
867     \a dstlen then if you malloc a new array of size n+1 you will have
868     the space needed for the entire string. If \a dstlen is zero then
869     nothing is written and this call just measures the storage space
870     needed.
871 
872     \a srclen is the number of words in \a src to convert. On Windows
873     this is not necessarily the number of characters, due to there
874     possibly being "surrogate pairs" in the UTF-16 encoding used.
875     On Unix wchar_t is 32 bits and each location is a character.
876 
877     On Unix if a src word is greater than 0x10ffff then this is an
878     illegal character according to RFC 3629. These are converted as
879     though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
880     range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
881     illegal according to RFC 3629. However I encode these as though
882     they are legal, so that utf8towc will return the original data.
883 
884     On Windows "surrogate pairs" are converted to a single character
885     and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
886     pairs are converted as though they are individual characters.
887 */
static unsigned int utf8fromwc( char* dst, unsigned dstlen,
                                const wchar_t* src, unsigned srclen )
{
    unsigned int i = 0;      // Index of the next unread word in src.
    unsigned int count = 0;  // UTF-8 bytes produced (or that would be).

    // Phase 1: encode into dst while there is room. A character is only
    // written when all of its bytes plus the terminator fit; otherwise dst
    // is terminated, the character is still counted, and we fall through to
    // the measuring phase.
    if( dstlen != 0 )
    {
        for( ;; )
        {
            if( i >= srclen )
            {
                dst[count] = 0;
                return count;
            }

            unsigned int ucs = src[i++];
#ifdef _WIN32
            // UTF-16: merge a high surrogate and the low surrogate that
            // immediately follows it into a single code point. Unpaired
            // halves are left alone and encoded as 3-byte sequences below.
            if( ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
                src[i] >= 0xdc00 && src[i] <= 0xdfff )
            {
                const unsigned int ucs2 = src[i++];
                ucs = 0x10000U + ((ucs & 0x3ff) << 10) + (ucs2 & 0x3ff);
            }
#else
            // Values above 0x10ffff are encoded as U+FFFD
            // (REPLACEMENT CHARACTER).
            if( ucs > 0x10ffffU )
            {
                ucs = 0xfffdU;
            }
#endif

            if( ucs < 0x80U )
            {
                // 1 byte.
                dst[count++] = static_cast<char>(ucs);
                if( count >= dstlen )
                {
                    // The slot just written is needed for the terminator.
                    dst[count-1] = 0;
                    break;
                }
            }
            else if( ucs < 0x800U )
            {
                // 2 bytes.
                if( count+2 >= dstlen )
                {
                    dst[count] = 0;
                    count += 2;
                    break;
                }
                dst[count++] = 0xc0 | static_cast<char>(ucs >> 6);
                dst[count++] = 0x80 | static_cast<char>(ucs & 0x3F);
            }
            else if( ucs < 0x10000U )
            {
                // 3 bytes.
                if( count+3 >= dstlen )
                {
                    dst[count] = 0;
                    count += 3;
                    break;
                }
                dst[count++] = 0xe0 | static_cast<char>(ucs >> 12);
                dst[count++] = 0x80 | static_cast<char>((ucs >> 6) & 0x3F);
                dst[count++] = 0x80 | static_cast<char>(ucs & 0x3F);
            }
            else
            {
                // 4 bytes.
                if( count+4 >= dstlen )
                {
                    dst[count] = 0;
                    count += 4;
                    break;
                }
                dst[count++] = 0xf0 | static_cast<char>(ucs >> 18);
                dst[count++] = 0x80 | static_cast<char>((ucs >> 12) & 0x3F);
                dst[count++] = 0x80 | static_cast<char>((ucs >> 6) & 0x3F);
                dst[count++] = 0x80 | static_cast<char>(ucs & 0x3F);
            }
        }
    }

    // Phase 2: dst is full (or dstlen was zero); measure the rest using
    // exactly the same classification as the encoding phase.
    while( i < srclen )
    {
        unsigned int ucs = src[i++];
#ifdef _WIN32
        // i already points at the word following ucs, so that is the word
        // to test for a low surrogate (same test as in phase 1).
        if( ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
            src[i] >= 0xdc00 && src[i] <= 0xdfff )
        {
            ++i;
            count += 4;
            continue;
        }
#else
        if( ucs > 0x10ffffU )
        {
            ucs = 0xfffdU;
        }
#endif

        if( ucs < 0x80U )
        {
            count++;
        }
        else if( ucs < 0x800U )
        {
            count += 2;
        }
        else if( ucs < 0x10000U )
        {
            count += 3;
        }
        else
        {
            count += 4;
        }
    }
    return count;
}
1002 
1003 /************************************************************************/
1004 /*                             utf8froma()                              */
1005 /************************************************************************/
1006 
1007 /* Convert an ISO-8859-1 (i.e. normal c-string) byte stream to UTF-8.
1008 
1009     It is possible this should convert Microsoft's CP1252 to UTF-8
1010     instead. This would translate the codes in the range 0x80-0x9f
1011     to different characters. Currently it does not do this.
1012 
1013     Up to \a dstlen bytes are written to \a dst, including a null
1014     terminator. The return value is the number of bytes that would be
1015     written, not counting the null terminator. If greater or equal to
1016     \a dstlen then if you malloc a new array of size n+1 you will have
1017     the space needed for the entire string. If \a dstlen is zero then
1018     nothing is written and this call just measures the storage space
1019     needed.
1020 
1021     \a srclen is the number of bytes in \a src to convert.
1022 
1023     If the return value equals \a srclen then this indicates that
1024     no conversion is necessary, as only ASCII characters are in the
1025     string.
1026 */
static unsigned utf8froma(char* dst, unsigned dstlen,
                          const char* src, unsigned srclen)
{
    // Walk the input as unsigned bytes so values >= 0x80 compare correctly.
    const unsigned char* pabyIter =
        reinterpret_cast<const unsigned char*>(src);
    const unsigned char* const pabyEnd = pabyIter + srclen;
    // UTF-8 bytes produced so far (terminator not counted).
    unsigned nOut = 0;

    // Phase 1: encode into dst until the input ends or dst is full.
    bool bRoomLeft = dstlen != 0;
    while( bRoomLeft )
    {
        if( pabyIter >= pabyEnd )
        {
            // Everything fitted: terminate and report the length.
            dst[nOut] = '\0';
            return nOut;
        }

        const unsigned char byVal = *pabyIter;
        ++pabyIter;

        if( byVal >= 0x80U )
        {
            // High half of ISO-8859-1 becomes a 2-byte sequence, written
            // only if both bytes and the terminator fit.
            if( nOut + 2 >= dstlen )
            {
                dst[nOut] = '\0';
                nOut += 2;
                bRoomLeft = false;
            }
            else
            {
                dst[nOut] = static_cast<char>(0xc0 | (byVal >> 6));
                dst[nOut + 1] = static_cast<char>(0x80 | (byVal & 0x3F));
                nOut += 2;
            }
        }
        else
        {
            // ASCII is copied through unchanged.
            dst[nOut] = static_cast<char>(byVal);
            ++nOut;
            if( nOut >= dstlen )
            {
                // The slot just written is needed for the terminator.
                dst[nOut - 1] = '\0';
                bRoomLeft = false;
            }
        }
    }

    // Phase 2: dst is full (or dstlen was zero); only count the remainder.
    for( ; pabyIter < pabyEnd; ++pabyIter )
    {
        nOut += (*pabyIter < 0x80U) ? 1U : 2U;
    }

    return nOut;
}
1081 
1082 #ifdef _WIN32
1083 
1084 /************************************************************************/
1085 /*                            CPLWin32Recode()                          */
1086 /************************************************************************/
1087 
/* Convert a CODEPAGE (i.e. normal c-string) byte stream
     to another CODEPAGE (i.e. normal c-string) byte stream.

    \a src is the source c-string byte stream (including a null terminator).
    \a src_code_page is the code page of the source c-string.
    \a dst_code_page is the code page of the destination c-string.
1094 
1095    UTF7          65000
1096    UTF8          65001
1097    OEM-US          437
   OEM-ARABIC      720
1099    OEM-GREEK       737
1100    OEM-BALTIC      775
1101    OEM-MLATIN1     850
1102    OEM-LATIN2      852
1103    OEM-CYRILLIC    855
1104    OEM-TURKISH     857
1105    OEM-MLATIN1P    858
1106    OEM-HEBREW      862
1107    OEM-RUSSIAN     866
1108 
1109    THAI            874
1110    SJIS            932
1111    GBK             936
1112    KOREA           949
1113    BIG5            950
1114 
1115    EUROPE         1250
1116    CYRILLIC       1251
1117    LATIN1         1252
1118    GREEK          1253
1119    TURKISH        1254
1120    HEBREW         1255
1121    ARABIC         1256
1122    BALTIC         1257
1123    VIETNAM        1258
1124 
1125    ISO-LATIN1    28591
1126    ISO-LATIN2    28592
1127    ISO-LATIN3    28593
1128    ISO-BALTIC    28594
1129    ISO-CYRILLIC  28595
1130    ISO-ARABIC    28596
1131    ISO-HEBREW    28598
1132    ISO-TURKISH   28599
1133    ISO-LATIN9    28605
1134 
1135    ISO-2022-JP   50220
1136 
1137 */
1138 
CPLWin32Recode(const char * src,unsigned src_code_page,unsigned dst_code_page)1139 char* CPLWin32Recode( const char* src, unsigned src_code_page,
1140                       unsigned dst_code_page )
1141 {
1142     // Convert from source code page to Unicode.
1143 
1144     // Compute the length in wide characters.
1145     int wlen = MultiByteToWideChar( src_code_page, MB_ERR_INVALID_CHARS, src,
1146                                     -1, nullptr, 0 );
1147     if( wlen == 0 && GetLastError() == ERROR_NO_UNICODE_TRANSLATION )
1148     {
1149         if( !bHaveWarned5 )
1150         {
1151             bHaveWarned5 = true;
1152             CPLError(
1153                 CE_Warning, CPLE_AppDefined,
1154                 "One or several characters could not be translated from CP%d. "
1155                 "This warning will not be emitted anymore.", src_code_page);
1156         }
1157 
1158         // Retry now without MB_ERR_INVALID_CHARS flag.
1159         wlen = MultiByteToWideChar( src_code_page, 0, src, -1, nullptr, 0 );
1160     }
1161 
1162     // Do the actual conversion.
1163     wchar_t* tbuf =
1164         static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), wlen + 1));
1165     tbuf[wlen] = 0;
1166     MultiByteToWideChar( src_code_page, 0, src, -1, tbuf, wlen+1 );
1167 
1168     // Convert from Unicode to destination code page.
1169 
1170     // Compute the length in chars.
1171     BOOL bUsedDefaultChar = FALSE;
1172     int len = 0;
1173     if( dst_code_page == CP_UTF7 || dst_code_page == CP_UTF8 )
1174         len = WideCharToMultiByte( dst_code_page, 0, tbuf, -1, nullptr, 0, nullptr, nullptr );
1175     else
1176         len = WideCharToMultiByte( dst_code_page, 0, tbuf, -1, nullptr, 0, nullptr,
1177                                    &bUsedDefaultChar );
1178     if( bUsedDefaultChar )
1179     {
1180         if( !bHaveWarned6 )
1181         {
1182             bHaveWarned6 = true;
1183             CPLError(
1184                 CE_Warning, CPLE_AppDefined,
1185                 "One or several characters could not be translated to CP%d. "
1186                 "This warning will not be emitted anymore.", dst_code_page);
1187         }
1188     }
1189 
1190     // Do the actual conversion.
1191     char* pszResult = static_cast<char *>(CPLCalloc(sizeof(char), len + 1));
1192     WideCharToMultiByte(dst_code_page, 0, tbuf, -1, pszResult, len+1, nullptr, nullptr);
1193     pszResult[len] = 0;
1194 
1195     CPLFree(tbuf);
1196 
1197     return pszResult;
1198 }
1199 
1200 #endif
1201 
1202 /*
1203 ** For now we disable the rest which is locale() related.  We may need
1204 ** parts of it later.
1205 */
1206 
1207 #ifdef notdef
1208 
1209 #ifdef _WIN32
1210 # include <windows.h>
1211 #endif
1212 
1213 /*! Return true if the "locale" seems to indicate that UTF-8 encoding
1214     is used. If true the utf8tomb and utf8frommb don't do anything
1215     useful.
1216 
1217     <i>It is highly recommended that you change your system so this
1218     does return true.</i> On Windows this is done by setting the
1219     "codepage" to CP_UTF8.  On Unix this is done by setting $LC_CTYPE
1220     to a string containing the letters "utf" or "UTF" in it, or by
1221     deleting all $LC* and $LANG environment variables. In the future
1222     it is likely that all non-Asian Unix systems will return true,
1223     due to the compatibility of UTF-8 with ISO-8859-1.
1224 */
int utf8locale( void )
{
    // Cached verdict: 2 means "not determined yet", otherwise 0 or 1.
    static int ret = 2;
    if( ret == 2 )
    {
#ifdef _WIN32
        ret = GetACP() == CP_UTF8;
#else
        // Use the first variable that is set and non-empty, in POSIX
        // precedence order: LC_ALL overrides LC_CTYPE, which overrides LANG.
        const char* s = getenv("LC_ALL");
        if( s == nullptr || *s == '\0' )
            s = getenv("LC_CTYPE");
        if( s == nullptr || *s == '\0' )
            s = getenv("LANG");

        ret = 1; // assume UTF-8 if no locale
        if( s != nullptr && *s != '\0' )
        {
            ret = strstr(s, "utf") || strstr(s, "UTF");
        }
#endif
    }

    return ret;
}
1245 
1246 /*! Convert the UTF-8 used by FLTK to the locale-specific encoding
1247     used for filenames (and sometimes used for data in files).
1248     Unfortunately due to stupid design you will have to do this as
1249     needed for filenames. This is a bug on both Unix and Windows.
1250 
1251     Up to \a dstlen bytes are written to \a dst, including a null
1252     terminator. The return value is the number of bytes that would be
1253     written, not counting the null terminator. If greater or equal to
1254     \a dstlen then if you malloc a new array of size n+1 you will have
1255     the space needed for the entire string. If \a dstlen is zero then
1256     nothing is written and this call just measures the storage space
1257     needed.
1258 
1259     If utf8locale() returns true then this does not change the data.
1260     It is copied and truncated as necessary to
1261     the destination buffer and \a srclen is always returned.  */
utf8tomb(const char * src,unsigned srclen,char * dst,unsigned dstlen)1262 unsigned utf8tomb( const char* src, unsigned srclen,
1263                    char* dst, unsigned dstlen )
1264 {
1265   if( !utf8locale() )
1266   {
1267 #ifdef _WIN32
1268     wchar_t lbuf[1024] = {};
1269     wchar_t* buf = lbuf;
1270     unsigned length = utf8towc(src, srclen, buf, 1024);
1271     unsigned ret;
1272     if( length >= 1024 )
1273     {
1274         buf = static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));
1275         utf8towc(src, srclen, buf, length + 1);
1276     }
1277     if( dstlen )
1278     {
1279       // apparently this does not null-terminate, even though msdn
1280       // documentation claims it does:
1281       ret =
1282         WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen, 0, 0);
1283       dst[ret] = 0;
1284     }
1285     // if it overflows or measuring length, get the actual length:
1286     if( dstlen==0 || ret >= dstlen-1 )
1287         ret = WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
1288     if( buf != lbuf ) free((void*)buf);
1289     return ret;
1290 #else
1291     wchar_t lbuf[1024] = {};
1292     wchar_t* buf = lbuf;
1293     unsigned length = utf8towc(src, srclen, buf, 1024);
1294     if( length >= 1024 )
1295     {
1296         buf = static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));
1297         utf8towc(src, srclen, buf, length+1);
1298     }
1299     int ret = 0;
1300     if( dstlen )
1301     {
1302       ret = wcstombs(dst, buf, dstlen);
1303       if( ret >= dstlen - 1 ) ret = wcstombs(0, buf, 0);
1304     } else {
1305       ret = wcstombs(0, buf, 0);
1306     }
1307     if( buf != lbuf ) free((void*)buf);
1308     if( ret >= 0 ) return (unsigned)ret;
1309     // On any errors we return the UTF-8 as raw text...
1310 #endif
1311   }
1312   // Identity transform:
1313   if( srclen < dstlen )
1314   {
1315     memcpy(dst, src, srclen);
1316     dst[srclen] = 0;
1317   } else {
1318     memcpy(dst, src, dstlen-1);
1319     dst[dstlen-1] = 0;
1320   }
1321   return srclen;
1322 }
1323 
1324 /*! Convert a filename from the locale-specific multibyte encoding
1325     used by Windows to UTF-8 as used by FLTK.
1326 
1327     Up to \a dstlen bytes are written to \a dst, including a null
1328     terminator. The return value is the number of bytes that would be
1329     written, not counting the null terminator. If greater or equal to
1330     \a dstlen then if you malloc a new array of size n+1 you will have
1331     the space needed for the entire string. If \a dstlen is zero then
1332     nothing is written and this call just measures the storage space
1333     needed.
1334 
1335     On Unix or on Windows when a UTF-8 locale is in effect, this
1336     does not change the data. It is copied and truncated as necessary to
1337     the destination buffer and \a srclen is always returned.
1338     You may also want to check if utf8test() returns non-zero, so that
1339     the filesystem can store filenames in UTF-8 encoding regardless of
1340     the locale.
1341 */
utf8frommb(char * dst,unsigned dstlen,const char * src,unsigned srclen)1342 unsigned utf8frommb(char* dst, unsigned dstlen,
1343                     const char* src, unsigned srclen)
1344 {
1345   if( !utf8locale() )
1346   {
1347 #ifdef _WIN32
1348     wchar_t lbuf[1024] = {};
1349     wchar_t* buf = lbuf;
1350     unsigned ret;
1351     const unsigned length =
1352       MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
1353     if( length >= 1024 )
1354     {
1355       length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
1356       buf = static_cast<wchar_t *>(malloc(length * sizeof(wchar_t)));
1357       MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
1358     }
1359     ret = utf8fromwc(dst, dstlen, buf, length);
1360     if( buf != lbuf ) free(buf);
1361     return ret;
1362 #else
1363     wchar_t lbuf[1024] = {};
1364     wchar_t* buf = lbuf;
1365     const int length = mbstowcs(buf, src, 1024);
1366     if( length >= 1024 )
1367     {
1368       length = mbstowcs(0, src, 0)+1;
1369       buf = static_cast<wchar_t *>(malloc(length*sizeof(unsigned short)));
1370       mbstowcs(buf, src, length);
1371     }
1372     if( length >= 0 )
1373     {
1374       const unsigned ret = utf8fromwc(dst, dstlen, buf, length);
1375       if( buf != lbuf ) free(buf);
1376       return ret;
1377     }
1378     // Errors in conversion return the UTF-8 unchanged.
1379 #endif
1380   }
1381   // Identity transform:
1382   if( srclen < dstlen )
1383   {
1384     memcpy(dst, src, srclen);
1385     dst[srclen] = 0;
1386   }
1387   else
1388   {
1389     memcpy(dst, src, dstlen-1);
1390     dst[dstlen-1] = 0;
1391   }
1392   return srclen;
1393 }
1394 
1395 #endif // def notdef - disabled locale specific stuff.
1396 
1397 /*! Examines the first \a srclen bytes in \a src and return a verdict
1398     on whether it is UTF-8 or not.
1399     - Returns 0 if there is any illegal UTF-8 sequences, using the
1400       same rules as utf8decode(). Note that some UCS values considered
1401       illegal by RFC 3629, such as 0xffff, are considered legal by this.
1402     - Returns 1 if there are only single-byte characters (i.e. no bytes
1403       have the high bit set). This is legal UTF-8, but also indicates
1404       plain ASCII. It also returns 1 if \a srclen is zero.
1405     - Returns 2 if there are only characters less than 0x800.
1406     - Returns 3 if there are only characters less than 0x10000.
1407     - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.
1408 
1409     Because there are many illegal sequences in UTF-8, it is almost
1410     impossible for a string in another encoding to be confused with
1411     UTF-8. This is very useful for transitioning Unix to UTF-8
1412     filenames, you can simply test each filename with this to decide
1413     if it is UTF-8 or in the locale encoding. My hope is that if
1414     this is done we will be able to cleanly transition to a locale-less
1415     encoding.
1416 */
1417 
utf8test(const char * src,unsigned srclen)1418 static int utf8test( const char* src, unsigned srclen )
1419 {
1420     int ret = 1;
1421     const char* p = src;
1422     const char* e = src + srclen;
1423     while( p < e )
1424     {
1425         if( *p == 0 )
1426             return 0;
1427         if( *p & 0x80 )
1428         {
1429             int len = 0;
1430             utf8decode(p, e, &len);
1431             if( len < 2 ) return 0;
1432             if( len > ret ) ret = len;
1433             p += len;
1434         } else {
1435             p++;
1436         }
1437     }
1438     return ret;
1439 }
1440 
1441 #endif /* defined(CPL_RECODE_STUB) */
1442