1 /**********************************************************************
2 *
3 * Name: cpl_recode_stub.cpp
4 * Project: CPL - Common Portability Library
5 * Purpose: Character set recoding and char/wchar_t conversions, stub
6 * implementation to be used if iconv() functionality is not
7 * available.
8 * Author: Frank Warmerdam, warmerdam@pobox.com
9 *
10 * The bulk of this code is derived from the utf.c module from FLTK. It
11 * was originally downloaded from:
12 * http://svn.easysw.com/public/fltk/fltk/trunk/src/utf.c
13 *
14 **********************************************************************
15 * Copyright (c) 2008, Frank Warmerdam
16 * Copyright 2006 by Bill Spitzak and others.
17 * Copyright (c) 2009-2014, Even Rouault <even dot rouault at spatialys.com>
18 *
19 * Permission to use, copy, modify, and distribute this software for any
20 * purpose with or without fee is hereby granted, provided that the above
21 * copyright notice and this permission notice appear in all copies.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
24 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
25 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
26 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
27 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
28 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
29 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
30 **********************************************************************/
31
32 #include "cpl_port.h"
33 #include "cpl_string.h"
34
35 #include <cstring>
36
37 #include "cpl_conv.h"
38 #include "cpl_error.h"
39
40 CPL_CVSID("$Id: cpl_recode_stub.cpp b1c9c12ad373e40b955162b45d704070d4ebf7b0 2019-06-19 16:50:15 +0200 Even Rouault $")
41
42 #ifdef CPL_RECODE_STUB
43
44 static unsigned utf8decode(const char* p, const char* end, int* len);
45 static unsigned utf8towc(const char* src, unsigned srclen,
46 wchar_t* dst, unsigned dstlen);
47 static unsigned utf8toa(const char* src, unsigned srclen,
48 char* dst, unsigned dstlen);
49 static unsigned utf8fromwc(char* dst, unsigned dstlen,
50 const wchar_t* src, unsigned srclen);
51 static unsigned utf8froma(char* dst, unsigned dstlen,
52 const char* src, unsigned srclen);
53 static int utf8test(const char* src, unsigned srclen);
54
55 #ifdef _WIN32
56
57 #include <windows.h>
58 #include <winnls.h>
59
60 static char* CPLWin32Recode( const char* src, unsigned src_code_page,
61 unsigned dst_code_page )
62 CPL_RETURNS_NONNULL;
63 #endif
64
65 /* used by cpl_recode.cpp */
66 extern void CPLClearRecodeStubWarningFlags();
67 extern char *CPLRecodeStub( const char *, const char *, const char * )
68 CPL_RETURNS_NONNULL;
69 extern char *CPLRecodeFromWCharStub( const wchar_t *,
70 const char *, const char * );
71 extern wchar_t *CPLRecodeToWCharStub( const char *,
72 const char *, const char * );
73 extern int CPLIsUTF8Stub( const char *, int );
74
75 /************************************************************************/
76 /* ==================================================================== */
77 /* Stub Implementation not depending on iconv() or WIN32 API. */
78 /* ==================================================================== */
79 /************************************************************************/
80
// One-shot "already warned" flags: each one is set to true the first time the
// corresponding warning is emitted, so that the warning is reported only once
// until CPLClearRecodeStubWarningFlags() resets the flags to false.
// bHaveWarned1: CPLRecodeStub(), "Recode from %s to UTF-8 not supported" warning.
81 static bool bHaveWarned1 = false;
// bHaveWarned2: CPLRecodeStub(), "Recode from UTF-8 to %s not supported" warning.
82 static bool bHaveWarned2 = false;
// bHaveWarned3: CPLRecodeStub(), "Recode from %s to %s not supported, no change
// applied" warning.
83 static bool bHaveWarned3 = false;
// bHaveWarned4: utf8toa(), warning about characters that cannot be represented
// in ISO-8859-1.
84 static bool bHaveWarned4 = false;
// bHaveWarned5 / bHaveWarned6: not referenced in the part of the file reviewed
// here, apart from being reset -- NOTE(review): presumably used by code
// further down; confirm.
85 static bool bHaveWarned5 = false;
86 static bool bHaveWarned6 = false;
87
88 /************************************************************************/
89 /* CPLClearRecodeStubWarningFlags() */
90 /************************************************************************/
91
CPLClearRecodeStubWarningFlags()92 void CPLClearRecodeStubWarningFlags()
93 {
94 bHaveWarned1 = false;
95 bHaveWarned2 = false;
96 bHaveWarned3 = false;
97 bHaveWarned4 = false;
98 bHaveWarned5 = false;
99 bHaveWarned6 = false;
100 }
101
102 /************************************************************************/
103 /* CPLRecodeStub() */
104 /************************************************************************/
105
106 /**
107 * Convert a string from a source encoding to a destination encoding.
108 *
109 * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
110 * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
111 * <ul>
112 * <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in
113 * fact)</li>
114 * <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
115 * <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
116 * </ul>
117 *
118 * If an error occurs an error may, or may not be posted with CPLError().
119 *
120 * @param pszSource a NULL terminated string.
121 * @param pszSrcEncoding the source encoding.
122 * @param pszDstEncoding the destination encoding.
123 *
124 * @return a NULL terminated string which should be freed with CPLFree().
125 */
126
CPLRecodeStub(const char * pszSource,const char * pszSrcEncoding,const char * pszDstEncoding)127 char *CPLRecodeStub( const char *pszSource,
128 const char *pszSrcEncoding,
129 const char *pszDstEncoding )
130
131 {
132 /* -------------------------------------------------------------------- */
133 /* If the source or destination is current locale(), we change */
134 /* it to ISO8859-1 since our stub implementation does not */
135 /* attempt to address locales properly. */
136 /* -------------------------------------------------------------------- */
137
138 if( pszSrcEncoding[0] == '\0' )
139 pszSrcEncoding = CPL_ENC_ISO8859_1;
140
141 if( pszDstEncoding[0] == '\0' )
142 pszDstEncoding = CPL_ENC_ISO8859_1;
143
144 /* -------------------------------------------------------------------- */
145 /* ISO8859 to UTF8 */
146 /* -------------------------------------------------------------------- */
147 if( strcmp(pszSrcEncoding, CPL_ENC_ISO8859_1) == 0
148 && strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0 )
149 {
150 const int nCharCount = static_cast<int>(strlen(pszSource));
151 char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
152
153 utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount);
154
155 return pszResult;
156 }
157
158 /* -------------------------------------------------------------------- */
159 /* UTF8 to ISO8859 */
160 /* -------------------------------------------------------------------- */
161 if( strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0
162 && strcmp(pszDstEncoding, CPL_ENC_ISO8859_1) == 0 )
163 {
164 int nCharCount = static_cast<int>(strlen(pszSource));
165 char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount + 1));
166
167 utf8toa(pszSource, nCharCount, pszResult, nCharCount + 1);
168
169 return pszResult;
170 }
171
172 #ifdef _WIN32
173 /* ---------------------------------------------------------------------*/
174 /* CPXXX to UTF8 */
175 /* ---------------------------------------------------------------------*/
176 if( STARTS_WITH(pszSrcEncoding, "CP")
177 && strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0 )
178 {
179 int nCode = atoi( pszSrcEncoding + 2 );
180 if( nCode > 0 ) {
181 return CPLWin32Recode( pszSource, nCode, CP_UTF8 );
182 }
183 else if( EQUAL(pszSrcEncoding, "CP_OEMCP") )
184 return CPLWin32Recode( pszSource, CP_OEMCP, CP_UTF8 );
185 else if( EQUAL(pszSrcEncoding, "CP_ACP") )
186 return CPLWin32Recode( pszSource, CP_ACP, CP_UTF8 );
187 }
188
189 /* ---------------------------------------------------------------------*/
190 /* UTF8 to CPXXX */
191 /* ---------------------------------------------------------------------*/
192 if( strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0
193 && STARTS_WITH(pszDstEncoding, "CP") )
194 {
195 int nCode = atoi( pszDstEncoding + 2 );
196 if( nCode > 0 ) {
197 return CPLWin32Recode( pszSource, CP_UTF8, nCode );
198 }
199 else if( EQUAL(pszDstEncoding, "CP_OEMCP") )
200 return CPLWin32Recode( pszSource, CP_UTF8, CP_OEMCP );
201 else if( EQUAL(pszDstEncoding, "CP_ACP") )
202 return CPLWin32Recode( pszSource, CP_UTF8, CP_ACP );
203 }
204 #endif
205
206 /* -------------------------------------------------------------------- */
207 /* Anything else to UTF-8 is treated as ISO8859-1 to UTF-8 with */
208 /* a one-time warning. */
209 /* -------------------------------------------------------------------- */
210 if( strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0 )
211 {
212 int nCharCount = static_cast<int>(strlen(pszSource));
213 char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
214
215 if( EQUAL( pszSrcEncoding, "CP437") ) // For ZIP file handling.
216 {
217 bool bIsAllPrintableASCII = true;
218 for( int i = 0; i <nCharCount; i++ )
219 {
220 if( pszSource[i] < 32 || pszSource[i] > 126 )
221 {
222 bIsAllPrintableASCII = false;
223 break;
224 }
225 }
226 if( bIsAllPrintableASCII )
227 {
228 if( nCharCount )
229 memcpy( pszResult, pszSource, nCharCount );
230 return pszResult;
231 }
232 }
233
234 if( !bHaveWarned1 )
235 {
236 bHaveWarned1 = true;
237 CPLError( CE_Warning, CPLE_AppDefined,
238 "Recode from %s to UTF-8 not supported, "
239 "treated as ISO-8859-1 to UTF-8.",
240 pszSrcEncoding );
241 }
242
243 utf8froma( pszResult, nCharCount*2+1, pszSource, nCharCount );
244
245 return pszResult;
246 }
247
248 /* -------------------------------------------------------------------- */
249 /* UTF-8 to anything else is treated as UTF-8 to ISO-8859-1 */
250 /* with a warning. */
251 /* -------------------------------------------------------------------- */
252 if( strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0
253 && strcmp(pszDstEncoding, CPL_ENC_ISO8859_1) == 0 )
254 {
255 int nCharCount = static_cast<int>(strlen(pszSource));
256 char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount + 1));
257
258 if( !bHaveWarned2 )
259 {
260 bHaveWarned2 = true;
261 CPLError( CE_Warning, CPLE_AppDefined,
262 "Recode from UTF-8 to %s not supported, "
263 "treated as UTF-8 to ISO-8859-1.",
264 pszDstEncoding );
265 }
266
267 utf8toa(pszSource, nCharCount, pszResult, nCharCount + 1);
268
269 return pszResult;
270 }
271
272 /* -------------------------------------------------------------------- */
273 /* Everything else is treated as a no-op with a warning. */
274 /* -------------------------------------------------------------------- */
275 {
276 if( !bHaveWarned3 )
277 {
278 bHaveWarned3 = true;
279 CPLError( CE_Warning, CPLE_AppDefined,
280 "Recode from %s to %s not supported, no change applied.",
281 pszSrcEncoding, pszDstEncoding );
282 }
283
284 return CPLStrdup(pszSource);
285 }
286 }
287
288 /************************************************************************/
289 /* CPLRecodeFromWCharStub() */
290 /************************************************************************/
291
292 /**
293 * Convert wchar_t string to UTF-8.
294 *
295 * Convert a wchar_t string into a multibyte utf-8 string. The only
296 * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
297 * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
298 * and CPL_ENC_ISO8859_1. In some cases (i.e. using iconv()) other encodings
299 * may also be supported.
300 *
301 * Note that the wchar_t type varies in size on different systems. On
302 * win32 it is normally 2 bytes, and on unix 4 bytes.
303 *
304 * If an error occurs an error may, or may not be posted with CPLError().
305 *
306 * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
307 * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
308 * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
309 *
310 * @return a zero terminated multi-byte string which should be freed with
311 * CPLFree(), or NULL if an error occurs.
312 */
313
CPLRecodeFromWCharStub(const wchar_t * pwszSource,const char * pszSrcEncoding,const char * pszDstEncoding)314 char *CPLRecodeFromWCharStub( const wchar_t *pwszSource,
315 const char *pszSrcEncoding,
316 const char *pszDstEncoding )
317
318 {
319 /* -------------------------------------------------------------------- */
320 /* We try to avoid changes of character set. We are just */
321 /* providing for unicode to unicode. */
322 /* -------------------------------------------------------------------- */
323 if( strcmp(pszSrcEncoding, "WCHAR_T") != 0 &&
324 strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0
325 && strcmp(pszSrcEncoding, CPL_ENC_UTF16) != 0
326 && strcmp(pszSrcEncoding, CPL_ENC_UCS2) != 0
327 && strcmp(pszSrcEncoding, CPL_ENC_UCS4) != 0 )
328 {
329 CPLError( CE_Failure, CPLE_AppDefined,
330 "Stub recoding implementation does not support "
331 "CPLRecodeFromWCharStub(...,%s,%s)",
332 pszSrcEncoding, pszDstEncoding );
333 return nullptr;
334 }
335
336 /* -------------------------------------------------------------------- */
337 /* What is the source length. */
338 /* -------------------------------------------------------------------- */
339 int nSrcLen = 0;
340
341 while( pwszSource[nSrcLen] != 0 )
342 nSrcLen++;
343
344 /* -------------------------------------------------------------------- */
345 /* Allocate destination buffer plenty big. */
346 /* -------------------------------------------------------------------- */
347 const int nDstBufSize = nSrcLen * 4 + 1;
348 // Nearly worst case.
349 char *pszResult = static_cast<char *>(CPLMalloc(nDstBufSize));
350
351 if( nSrcLen == 0 )
352 {
353 pszResult[0] = '\0';
354 return pszResult;
355 }
356
357 /* -------------------------------------------------------------------- */
358 /* Convert, and confirm we had enough space. */
359 /* -------------------------------------------------------------------- */
360 const int nDstLen =
361 utf8fromwc( pszResult, nDstBufSize, pwszSource, nSrcLen );
362 if( nDstLen >= nDstBufSize )
363 {
364 CPLAssert( false ); // too small!
365 return nullptr;
366 }
367
368 /* -------------------------------------------------------------------- */
369 /* If something other than UTF-8 was requested, recode now. */
370 /* -------------------------------------------------------------------- */
371 if( strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0 )
372 return pszResult;
373
374 char *pszFinalResult =
375 CPLRecodeStub( pszResult, CPL_ENC_UTF8, pszDstEncoding );
376
377 CPLFree( pszResult );
378
379 return pszFinalResult;
380 }
381
382 /************************************************************************/
383 /* CPLRecodeToWCharStub() */
384 /************************************************************************/
385
386 /**
387 * Convert UTF-8 string to a wchar_t string.
388 *
389 * Convert a 8bit, multi-byte per character input string into a wide
390 * character (wchar_t) string. The only guaranteed supported source encodings
391 * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8869_1 (LATIN1). The only
392 * guaranteed supported destination encoding is CPL_ENC_UCS2. Other source
393 * and destination encodings may be supported depending on the underlying
394 * implementation.
395 *
396 * Note that the wchar_t type varies in size on different systems. On
397 * win32 it is normally 2 bytes, and on unix 4 bytes.
398 *
399 * If an error occurs an error may, or may not be posted with CPLError().
400 *
401 * @param pszSource input multi-byte character string.
402 * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
403 * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
404 *
405 * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
406 * NULL on error.
407 *
408 * @since GDAL 1.6.0
409 */
410
CPLRecodeToWCharStub(const char * pszSource,const char * pszSrcEncoding,const char * pszDstEncoding)411 wchar_t *CPLRecodeToWCharStub( const char *pszSource,
412 const char *pszSrcEncoding,
413 const char *pszDstEncoding )
414
415 {
416 char *pszUTF8Source = const_cast<char *>(pszSource);
417
418 if( strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0
419 && strcmp(pszSrcEncoding, CPL_ENC_ASCII) != 0 )
420 {
421 pszUTF8Source =
422 CPLRecodeStub(pszSource, pszSrcEncoding, CPL_ENC_UTF8);
423 if( pszUTF8Source == nullptr )
424 return nullptr;
425 }
426
427 /* -------------------------------------------------------------------- */
428 /* We try to avoid changes of character set. We are just */
429 /* providing for unicode to unicode. */
430 /* -------------------------------------------------------------------- */
431 if( strcmp(pszDstEncoding, "WCHAR_T") != 0
432 && strcmp(pszDstEncoding, CPL_ENC_UCS2) != 0
433 && strcmp(pszDstEncoding, CPL_ENC_UCS4) != 0
434 && strcmp(pszDstEncoding, CPL_ENC_UTF16) != 0 )
435 {
436 CPLError( CE_Failure, CPLE_AppDefined,
437 "Stub recoding implementation does not support "
438 "CPLRecodeToWCharStub(...,%s,%s)",
439 pszSrcEncoding, pszDstEncoding );
440 if( pszUTF8Source != pszSource )
441 CPLFree( pszUTF8Source );
442 return nullptr;
443 }
444
445 /* -------------------------------------------------------------------- */
446 /* Do the UTF-8 to UCS-2 recoding. */
447 /* -------------------------------------------------------------------- */
448 int nSrcLen = static_cast<int>(strlen(pszUTF8Source));
449 wchar_t *pwszResult =
450 static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), nSrcLen + 1));
451
452 utf8towc( pszUTF8Source, nSrcLen, pwszResult, nSrcLen+1 );
453
454 if( pszUTF8Source != pszSource )
455 CPLFree( pszUTF8Source );
456
457 return pwszResult;
458 }
459
460 /************************************************************************/
461 /* CPLIsUTF8Stub() */
462 /************************************************************************/
463
464 /**
465 * Test if a string is encoded as UTF-8.
466 *
467 * @param pabyData input string to test
468 * @param nLen length of the input string, or -1 if the function must compute
469 * the string length. In which case it must be null terminated.
470 * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
471 *
472 * @since GDAL 1.7.0
473 */
CPLIsUTF8Stub(const char * pabyData,int nLen)474 int CPLIsUTF8Stub(const char* pabyData, int nLen)
475 {
476 if( nLen < 0 )
477 nLen = static_cast<int>(strlen(pabyData));
478 return utf8test(pabyData, static_cast<unsigned>(nLen)) != 0;
479 }
480
481 /************************************************************************/
482 /* ==================================================================== */
483 /* UTF.C code from FLTK with some modifications. */
484 /* ==================================================================== */
485 /************************************************************************/
486
487 /* Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
488 they are instead turned into the Unicode REPLACEMENT CHARACTER, of
489 value 0xfffd.
490 If this is on utf8decode will correctly map most (perhaps all)
491 human-readable text that is in ISO-8859-1. This may allow you
492 to completely ignore character sets in your code because virtually
493 everything is either ISO-8859-1 or UTF-8.
494 */
495 #define ERRORS_TO_ISO8859_1 1
496
497 /* Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
498 Unicode index for Microsoft's CP1252 character set. You should
499 also set ERRORS_TO_ISO8859_1. With this a huge amount of more
500 available text (such as all web pages) are correctly converted
501 to Unicode.
502 */
503 #define ERRORS_TO_CP1252 1
504
505 /* A number of Unicode code points are in fact illegal and should not
506 be produced by a UTF-8 converter. Turning this on will replace the
507 bytes in those encodings with errors. If you do this then converting
508 arbitrary 16-bit data to UTF-8 and then back is not an identity,
509 which will probably break a lot of software.
510 */
511 #define STRICT_RFC3629 0
512
513 #if ERRORS_TO_CP1252
514 // Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
515 // to Unicode:
// Lookup table of 32 Unicode code points, one per byte value 0x80..0x9f.
// utf8decode() indexes it with (byte - 0x80). A few entries (0x81, 0x8d,
// 0x8f, 0x90, 0x9d) simply hold the byte value itself.
516 constexpr unsigned short cp1252[32] = {
517 0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
518 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
519 0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
520 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
521 };
522 #endif
523
524 /************************************************************************/
525 /* utf8decode() */
526 /************************************************************************/
527
/*
    Decode a single UTF-8 encoded character starting at \e p. The
    resulting Unicode value (in the range 0-0x10ffff) is returned,
    and \e len is set to the number of bytes in the UTF-8 encoding
    (adding \e len to \e p will point at the next character).

    \e p must point at a readable byte; \e end is the first byte after
    the buffer and is never read.

    If \a p points at an illegal UTF-8 encoding, including one that
    would go past \e end, or where a code uses more bytes than
    necessary, then \e len is set to 1 and the first byte is translated
    on its own:
     - if ERRORS_TO_CP1252 is set, bytes 0x80-0x9f are looked up in the
       cp1252 table,
     - otherwise, if ERRORS_TO_ISO8859_1 is set the byte value itself is
       returned, else 0xfffd (Unicode REPLACEMENT CHARACTER).
    Treating errors this way allows this to decode almost any
    ISO-8859-1 or CP1252 text that has been mistakenly placed where
    UTF-8 is expected, and has proven very useful.

    If STRICT_RFC3629 is set, encodings of the surrogate range and of
    code points ending in fffe or ffff are also treated as errors.

    If you want errors to be converted to error characters (as the
    standards recommend), adding a test to see if the length is
    unexpectedly 1 will work:

\code
    if( *p & 0x80 )
    {  // What should be a multibyte encoding.
      code = utf8decode(p, end, &len);
      if( len<2 ) code = 0xFFFD;  // Turn errors into REPLACEMENT CHARACTER.
    }
    else
    {  // Handle the 1-byte utf8 encoding:
      code = *p;
      len = 1;
    }
\endcode

    Direct testing for the 1-byte case (as shown above) will also
    speed up the scanning of strings where the majority of characters
    are ASCII.
*/
static unsigned utf8decode(const char* p, const char* end, int* len)
{
    const unsigned char* const bytes =
        reinterpret_cast<const unsigned char*>(p);
    const unsigned char lead = bytes[0];

    // Plain ASCII.
    if( lead < 0x80 )
    {
        *len = 1;
        return lead;
    }

#if ERRORS_TO_CP1252
    // 0x80..0x9f can never start a UTF-8 sequence: map them through CP1252.
    if( lead < 0xa0 )
    {
        *len = 1;
        return cp1252[lead - 0x80];
    }
#endif

    // Determine the expected sequence length from the lead byte and the
    // first continuation byte. 0 means "invalid". Buffer limits are tested
    // with (end - p) so that no pointer past the end is ever formed.
    int nSeqLen = 0;
    if( lead >= 0xc2 && lead <= 0xf4 &&
        (end - p) > 1 && (bytes[1] & 0xc0) == 0x80 )
    {
        if( lead < 0xe0 )
        {
            nSeqLen = 2;
        }
        else if( lead < 0xf0 )
        {
            // 0xe0 followed by a byte below 0xa0 is an overlong encoding.
            if( !(lead == 0xe0 && bytes[1] < 0xa0) )
                nSeqLen = 3;
#if STRICT_RFC3629
            // RFC 3629 says surrogate chars are illegal.
            if( lead == 0xed && bytes[1] >= 0xa0 )
                nSeqLen = 0;
#endif
        }
        else
        {
            // 0xf0 followed by a byte below 0x90 is overlong, and 0xf4
            // followed by a byte above 0x8f is after 0x10ffff.
            const bool bOverlong = lead == 0xf0 && bytes[1] < 0x90;
            const bool bTooLarge = lead == 0xf4 && bytes[1] > 0x8f;
            if( !bOverlong && !bTooLarge )
                nSeqLen = 4;
        }
    }

    // Validate the remaining continuation bytes, checking the buffer limit
    // before each read.
    if( nSeqLen >= 3 && ((end - p) <= 2 || (bytes[2] & 0xc0) != 0x80) )
        nSeqLen = 0;
    if( nSeqLen == 4 && ((end - p) <= 3 || (bytes[3] & 0xc0) != 0x80) )
        nSeqLen = 0;

#if STRICT_RFC3629
    // 0xfffe and 0xffff are illegal characters, and more generally RFC 3629
    // says all codes ending in fffe or ffff are illegal. These tests come
    // after the bounds checks so that bytes[2] / bytes[3] are safe to read.
    if( nSeqLen == 3 && lead == 0xef &&
        bytes[1] == 0xbf && bytes[2] >= 0xbe )
        nSeqLen = 0;
    if( nSeqLen == 4 && (bytes[1] & 0xf) == 0xf &&
        bytes[2] == 0xbf && bytes[3] >= 0xbe )
        nSeqLen = 0;
#endif

    switch( nSeqLen )
    {
        case 2:
            *len = 2;
            return ((lead & 0x1fU) << 6) |
                   (bytes[1] & 0x3fU);

        case 3:
            *len = 3;
            return ((lead & 0x0fU) << 12) |
                   ((bytes[1] & 0x3fU) << 6) |
                   (bytes[2] & 0x3fU);

        case 4:
            *len = 4;
            return ((lead & 0x07U) << 18) |
                   ((bytes[1] & 0x3fU) << 12) |
                   ((bytes[2] & 0x3fU) << 6) |
                   (bytes[3] & 0x3fU);

        default:
            break;
    }

    // Invalid encoding: consume a single byte.
    *len = 1;
#if ERRORS_TO_ISO8859_1
    return lead;
#else
    return 0xfffd;  // Unicode REPLACEMENT CHARACTER
#endif
}
658
659 /************************************************************************/
660 /* utf8towc() */
661 /************************************************************************/
662
663 /* Convert a UTF-8 sequence into an array of wchar_t. These
664 are used by some system calls, especially on Windows.
665
666 \a src points at the UTF-8, and \a srclen is the number of bytes to
667 convert.
668
669 \a dst points at an array to write, and \a dstlen is the number of
670 locations in this array. At most \a dstlen-1 words will be
671 written there, plus a 0 terminating word. Thus this function
672 will never overwrite the buffer and will always return a
673 zero-terminated string. If \a dstlen is zero then \a dst can be
674 null and no data is written, but the length is returned.
675
676 The return value is the number of words that \e would be written
677 to \a dst if it were long enough, not counting the terminating
678 zero. If the return value is greater or equal to \a dstlen it
679 indicates truncation, you can then allocate a new array of size
680 return+1 and call this again.
681
682 Errors in the UTF-8 are converted as though each byte in the
683 erroneous string is in the Microsoft CP1252 encoding. This allows
684 ISO-8859-1 text mistakenly identified as UTF-8 to be printed
685 correctly.
686
687 Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
688 and most other systems. Where wchar_t is 16 bits, Unicode
689 characters in the range 0x10000 to 0x10ffff are converted to
690 "surrogate pairs" which take two words each (this is called UTF-16
691 encoding). If wchar_t is 32 bits this rather nasty problem is
692 avoided.
693 */
utf8towc(const char * src,unsigned srclen,wchar_t * dst,unsigned dstlen)694 static unsigned utf8towc(const char* src, unsigned srclen,
695 wchar_t* dst, unsigned dstlen)
696 {
697 const char* p = src;
698 const char* e = src+srclen;
699 unsigned count = 0;
700 if( dstlen ) while( true )
701 {
702 if( p >= e )
703 {
704 dst[count] = 0;
705 return count;
706 }
707 if( !(*p & 0x80) )
708 {
709 // ASCII
710 dst[count] = *p++;
711 }
712 else
713 {
714 int len = 0;
715 unsigned ucs = utf8decode(p, e, &len);
716 p += len;
717 #ifdef _WIN32
718 if( ucs < 0x10000 )
719 {
720 dst[count] = static_cast<wchar_t>(ucs);
721 }
722 else
723 {
724 // Make a surrogate pair:
725 if( count+2 >= dstlen)
726 {
727 dst[count] = 0;
728 count += 2;
729 break;
730 }
731 dst[count] = static_cast<wchar_t>((((ucs-0x10000u)>>10)&0x3ff) | 0xd800);
732 dst[++count] = static_cast<wchar_t>((ucs&0x3ff) | 0xdc00);
733 }
734 #else
735 dst[count] = static_cast<wchar_t>(ucs);
736 #endif
737 }
738 if( ++count == dstlen )
739 {
740 dst[count-1] = 0;
741 break;
742 }
743 }
744 // We filled dst, measure the rest:
745 while( p < e )
746 {
747 if( !(*p & 0x80) )
748 {
749 p++;
750 }
751 else
752 {
753 int len = 0;
754 #ifdef _WIN32
755 const unsigned ucs = utf8decode(p, e, &len);
756 p += len;
757 if( ucs >= 0x10000 ) ++count;
758 #else
759 utf8decode(p, e, &len);
760 p += len;
761 #endif
762 }
763 ++count;
764 }
765
766 return count;
767 }
768
769 /************************************************************************/
770 /* utf8toa() */
771 /************************************************************************/
772 /* Convert a UTF-8 sequence into an array of 1-byte characters.
773
774 If the UTF-8 decodes to a character greater than 0xff then it is
775 replaced with '?'.
776
777 Errors in the UTF-8 are converted as individual bytes, same as
778 utf8decode() does. This allows ISO-8859-1 text mistakenly identified
779 as UTF-8 to be printed correctly (and possibly CP1512 on Windows).
780
781 \a src points at the UTF-8, and \a srclen is the number of bytes to
782 convert.
783
784 Up to \a dstlen bytes are written to \a dst, including a null
785 terminator. The return value is the number of bytes that would be
786 written, not counting the null terminator. If greater or equal to
787 \a dstlen then if you malloc a new array of size n+1 you will have
788 the space needed for the entire string. If \a dstlen is zero then
789 nothing is written and this call just measures the storage space
790 needed.
791 */
utf8toa(const char * src,unsigned srclen,char * dst,unsigned dstlen)792 static unsigned int utf8toa( const char* src, unsigned srclen,
793 char* dst, unsigned dstlen )
794 {
795 const char* p = src;
796 const char* e = src+srclen;
797 unsigned int count = 0;
798 if( dstlen ) while( true )
799 {
800 if( p >= e )
801 {
802 dst[count] = 0;
803 return count;
804 }
805 unsigned char c = *reinterpret_cast<const unsigned char*>(p);
806 if( c < 0xC2 )
807 {
808 // ASCII or bad code.
809 dst[count] = c;
810 p++;
811 }
812 else
813 {
814 int len = 0;
815 const unsigned int ucs = utf8decode(p, e, &len);
816 p += len;
817 if( ucs < 0x100 )
818 {
819 dst[count] = static_cast<char>(ucs);
820 }
821 else
822 {
823 if( !bHaveWarned4 )
824 {
825 bHaveWarned4 = true;
826 CPLError(CE_Warning, CPLE_AppDefined,
827 "One or several characters couldn't be converted "
828 "correctly from UTF-8 to ISO-8859-1. "
829 "This warning will not be emitted anymore.");
830 }
831 dst[count] = '?';
832 }
833 }
834 if( ++count >= dstlen )
835 {
836 dst[count-1] = 0;
837 break;
838 }
839 }
840 // We filled dst, measure the rest:
841 while( p < e )
842 {
843 if( !(*p & 0x80) )
844 {
845 p++;
846 }
847 else
848 {
849 int len = 0;
850 utf8decode(p, e, &len);
851 p += len;
852 }
853 ++count;
854 }
855 return count;
856 }
857
858 /************************************************************************/
859 /* utf8fromwc() */
860 /************************************************************************/
/* Turn "wide characters" as returned by some system calls
    (especially on Windows) into UTF-8.

    Up to \a dstlen bytes are written to \a dst, including a null
    terminator. The return value is the number of bytes that would be
    written, not counting the null terminator. If greater or equal to
    \a dstlen then if you malloc a new array of size n+1 you will have
    the space needed for the entire string. If \a dstlen is zero then
    nothing is written and this call just measures the storage space
    needed.

    A character is never written partially: if its whole encoding plus
    the terminator does not fit, the output is terminated before it.

    \a srclen is the number of words in \a src to convert. On Windows
    this is not necessarily the number of characters, due to there
    possibly being "surrogate pairs" in the UTF-16 encoding used.

    When not built for Windows, a src word greater than 0x10ffff is an
    illegal character according to RFC 3629. These are converted as
    though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
    range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
    illegal according to RFC 3629. However they are encoded as though
    they are legal, so that utf8towc will return the original data.

    On Windows "surrogate pairs" are converted to a single character
    and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
    pairs are converted as though they are individual characters.
*/
static unsigned int utf8fromwc( char* dst, unsigned dstlen,
                                const wchar_t* src, unsigned srclen )
{
    unsigned int i = 0;
    unsigned int count = 0;

    // Read the next code point from src and advance i past it. The same
    // reader is used for converting and for measuring, so that both passes
    // always agree on how surrogate pairs and illegal values are handled.
    const auto ReadCodePoint = [&]() -> unsigned int
    {
        unsigned int ucs = src[i++];
#ifdef _WIN32
        // A high surrogate immediately followed by a low surrogate forms
        // one code point of 0x10000 or above.
        if( ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
            src[i] >= 0xdc00 && src[i] <= 0xdfff )
        {
            const unsigned int ucs2 = src[i++];
            ucs = 0x10000U + ((ucs & 0x3ff) << 10) + (ucs2 & 0x3ff);
        }
#else
        // Values after 0x10ffff are illegal: use REPLACEMENT CHARACTER.
        if( ucs > 0x10ffff )
            ucs = 0xfffd;
#endif
        return ucs;
    };

    // Number of UTF-8 bytes needed to encode a code point.
    const auto EncodedLength = [](unsigned int ucs) -> unsigned int
    {
        if( ucs < 0x80U )
            return 1;
        if( ucs < 0x800U )
            return 2;
        if( ucs < 0x10000U )
            return 3;
        return 4;
    };

    if( dstlen )
    {
        while( true )
        {
            // Input exhausted: terminate and report the exact length.
            if( i >= srclen )
            {
                dst[count] = 0;
                return count;
            }

            const unsigned int ucs = ReadCodePoint();
            const unsigned int nBytes = EncodedLength(ucs);

            // Not enough room for this character plus the terminator:
            // terminate here, count the character and go measuring.
            if( count + nBytes >= dstlen )
            {
                dst[count] = 0;
                count += nBytes;
                break;
            }

            switch( nBytes )
            {
                case 1:
                    dst[count++] = static_cast<char>(ucs);
                    break;

                case 2:
                    dst[count++] = static_cast<char>(0xc0 | (ucs >> 6));
                    dst[count++] = static_cast<char>(0x80 | (ucs & 0x3F));
                    break;

                case 3:
                    dst[count++] = static_cast<char>(0xe0 | (ucs >> 12));
                    dst[count++] =
                        static_cast<char>(0x80 | ((ucs >> 6) & 0x3F));
                    dst[count++] = static_cast<char>(0x80 | (ucs & 0x3F));
                    break;

                default:
                    dst[count++] = static_cast<char>(0xf0 | (ucs >> 18));
                    dst[count++] =
                        static_cast<char>(0x80 | ((ucs >> 12) & 0x3F));
                    dst[count++] =
                        static_cast<char>(0x80 | ((ucs >> 6) & 0x3F));
                    dst[count++] = static_cast<char>(0x80 | (ucs & 0x3F));
                    break;
            }
        }
    }

    // We filled dst (or there is no dst), measure the rest:
    while( i < srclen )
    {
        count += EncodedLength(ReadCodePoint());
    }
    return count;
}
1002
1003 /************************************************************************/
1004 /* utf8froma() */
1005 /************************************************************************/
1006
1007 /* Convert an ISO-8859-1 (i.e. normal c-string) byte stream to UTF-8.
1008
1009 It is possible this should convert Microsoft's CP1252 to UTF-8
1010 instead. This would translate the codes in the range 0x80-0x9f
1011 to different characters. Currently it does not do this.
1012
1013 Up to \a dstlen bytes are written to \a dst, including a null
1014 terminator. The return value is the number of bytes that would be
1015 written, not counting the null terminator. If greater or equal to
1016 \a dstlen then if you malloc a new array of size n+1 you will have
1017 the space needed for the entire string. If \a dstlen is zero then
1018 nothing is written and this call just measures the storage space
1019 needed.
1020
1021 \a srclen is the number of bytes in \a src to convert.
1022
1023 If the return value equals \a srclen then this indicates that
1024 no conversion is necessary, as only ASCII characters are in the
1025 string.
1026 */
static unsigned utf8froma(char* dst, unsigned dstlen,
                          const char* src, unsigned srclen) {
    // View the input as unsigned bytes so values 0x80-0xFF compare and
    // shift as expected.
    const unsigned char* const pabyIn =
        reinterpret_cast<const unsigned char*>(src);
    unsigned iIn = 0;    // Next input byte to read.
    unsigned count = 0;  // UTF-8 bytes written (or required) so far.

    // Conversion pass, only when a destination buffer was supplied.
    if( dstlen != 0 )
    {
        for( ;; )
        {
            if( iIn >= srclen )
            {
                // Whole input converted: terminate and return exact size.
                dst[count] = 0;
                return count;
            }

            const unsigned char ch = pabyIn[iIn++];
            // ASCII stays one byte; everything else becomes two bytes
            // (note that a CP1252 translation could need three).
            const unsigned nNeeded = (ch < 0x80U) ? 1U : 2U;

            // No room for this character plus the terminator: close the
            // string, still count the character, and go measure the rest.
            if( count + nNeeded >= dstlen )
            {
                dst[count] = 0;
                count += nNeeded;
                break;
            }

            if( nNeeded == 1U )
            {
                dst[count++] = static_cast<char>(ch);
            }
            else
            {
                dst[count++] = static_cast<char>(0xc0 | (ch >> 6));
                dst[count++] = static_cast<char>(0x80 | (ch & 0x3F));
            }
        }
    }

    // Measuring pass for whatever was not converted.
    for( ; iIn < srclen; ++iIn )
    {
        count += (pabyIn[iIn] < 0x80U) ? 1U : 2U;
    }

    return count;
}
1081
1082 #ifdef _WIN32
1083
1084 /************************************************************************/
1085 /* CPLWin32Recode() */
1086 /************************************************************************/
1087
/* Convert a byte stream (i.e. a normal c-string) encoded in one code page
   into a byte stream (i.e. a normal c-string) encoded in another code page.

   \a src is the source c-string byte stream (including a null terminator).
   \a src_code_page is the code page of the source c-string.
   \a dst_code_page is the code page of the destination c-string.
1094
1095 UTF7 65000
1096 UTF8 65001
1097 OEM-US 437
    OEM-ARABIC  720
1099 OEM-GREEK 737
1100 OEM-BALTIC 775
1101 OEM-MLATIN1 850
1102 OEM-LATIN2 852
1103 OEM-CYRILLIC 855
1104 OEM-TURKISH 857
1105 OEM-MLATIN1P 858
1106 OEM-HEBREW 862
1107 OEM-RUSSIAN 866
1108
1109 THAI 874
1110 SJIS 932
1111 GBK 936
1112 KOREA 949
1113 BIG5 950
1114
1115 EUROPE 1250
1116 CYRILLIC 1251
1117 LATIN1 1252
1118 GREEK 1253
1119 TURKISH 1254
1120 HEBREW 1255
1121 ARABIC 1256
1122 BALTIC 1257
1123 VIETNAM 1258
1124
1125 ISO-LATIN1 28591
1126 ISO-LATIN2 28592
1127 ISO-LATIN3 28593
1128 ISO-BALTIC 28594
1129 ISO-CYRILLIC 28595
1130 ISO-ARABIC 28596
1131 ISO-HEBREW 28598
1132 ISO-TURKISH 28599
1133 ISO-LATIN9 28605
1134
1135 ISO-2022-JP 50220
1136
1137 */
1138
CPLWin32Recode(const char * src,unsigned src_code_page,unsigned dst_code_page)1139 char* CPLWin32Recode( const char* src, unsigned src_code_page,
1140 unsigned dst_code_page )
1141 {
1142 // Convert from source code page to Unicode.
1143
1144 // Compute the length in wide characters.
1145 int wlen = MultiByteToWideChar( src_code_page, MB_ERR_INVALID_CHARS, src,
1146 -1, nullptr, 0 );
1147 if( wlen == 0 && GetLastError() == ERROR_NO_UNICODE_TRANSLATION )
1148 {
1149 if( !bHaveWarned5 )
1150 {
1151 bHaveWarned5 = true;
1152 CPLError(
1153 CE_Warning, CPLE_AppDefined,
1154 "One or several characters could not be translated from CP%d. "
1155 "This warning will not be emitted anymore.", src_code_page);
1156 }
1157
1158 // Retry now without MB_ERR_INVALID_CHARS flag.
1159 wlen = MultiByteToWideChar( src_code_page, 0, src, -1, nullptr, 0 );
1160 }
1161
1162 // Do the actual conversion.
1163 wchar_t* tbuf =
1164 static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), wlen + 1));
1165 tbuf[wlen] = 0;
1166 MultiByteToWideChar( src_code_page, 0, src, -1, tbuf, wlen+1 );
1167
1168 // Convert from Unicode to destination code page.
1169
1170 // Compute the length in chars.
1171 BOOL bUsedDefaultChar = FALSE;
1172 int len = 0;
1173 if( dst_code_page == CP_UTF7 || dst_code_page == CP_UTF8 )
1174 len = WideCharToMultiByte( dst_code_page, 0, tbuf, -1, nullptr, 0, nullptr, nullptr );
1175 else
1176 len = WideCharToMultiByte( dst_code_page, 0, tbuf, -1, nullptr, 0, nullptr,
1177 &bUsedDefaultChar );
1178 if( bUsedDefaultChar )
1179 {
1180 if( !bHaveWarned6 )
1181 {
1182 bHaveWarned6 = true;
1183 CPLError(
1184 CE_Warning, CPLE_AppDefined,
1185 "One or several characters could not be translated to CP%d. "
1186 "This warning will not be emitted anymore.", dst_code_page);
1187 }
1188 }
1189
1190 // Do the actual conversion.
1191 char* pszResult = static_cast<char *>(CPLCalloc(sizeof(char), len + 1));
1192 WideCharToMultiByte(dst_code_page, 0, tbuf, -1, pszResult, len+1, nullptr, nullptr);
1193 pszResult[len] = 0;
1194
1195 CPLFree(tbuf);
1196
1197 return pszResult;
1198 }
1199
1200 #endif
1201
1202 /*
1203 ** For now we disable the rest which is locale() related. We may need
1204 ** parts of it later.
1205 */
1206
1207 #ifdef notdef
1208
1209 #ifdef _WIN32
1210 # include <windows.h>
1211 #endif
1212
1213 /*! Return true if the "locale" seems to indicate that UTF-8 encoding
1214 is used. If true the utf8tomb and utf8frommb don't do anything
1215 useful.
1216
1217 <i>It is highly recommended that you change your system so this
1218 does return true.</i> On Windows this is done by setting the
1219 "codepage" to CP_UTF8. On Unix this is done by setting $LC_CTYPE
1220 to a string containing the letters "utf" or "UTF" in it, or by
1221 deleting all $LC* and $LANG environment variables. In the future
1222 it is likely that all non-Asian Unix systems will return true,
1223 due to the compatibility of UTF-8 with ISO-8859-1.
1224 */
int utf8locale( void )
{
    // 2 means "not determined yet"; afterwards the cached answer is 0 or 1.
    static int nCached = 2;
    if( nCached != 2 )
        return nCached;

#ifdef _WIN32
    nCached = GetACP() == CP_UTF8;
#else
    // Variables are consulted in this order; the first one that is set to a
    // non-empty value decides.
    static const char* const apszEnvVars[] = { "LC_CTYPE", "LC_ALL", "LANG" };

    nCached = 1;  // assume UTF-8 if no locale
    for( unsigned k = 0; k < sizeof(apszEnvVars) / sizeof(apszEnvVars[0]);
         ++k )
    {
        const char* pszValue = getenv(apszEnvVars[k]);
        if( pszValue && *pszValue )
        {
            nCached = strstr(pszValue, "utf") || strstr(pszValue, "UTF");
            break;
        }
    }
#endif

    return nCached;
}
1245
1246 /*! Convert the UTF-8 used by FLTK to the locale-specific encoding
1247 used for filenames (and sometimes used for data in files).
1248 Unfortunately due to stupid design you will have to do this as
1249 needed for filenames. This is a bug on both Unix and Windows.
1250
1251 Up to \a dstlen bytes are written to \a dst, including a null
1252 terminator. The return value is the number of bytes that would be
1253 written, not counting the null terminator. If greater or equal to
1254 \a dstlen then if you malloc a new array of size n+1 you will have
1255 the space needed for the entire string. If \a dstlen is zero then
1256 nothing is written and this call just measures the storage space
1257 needed.
1258
1259 If utf8locale() returns true then this does not change the data.
1260 It is copied and truncated as necessary to
1261 the destination buffer and \a srclen is always returned. */
utf8tomb(const char * src,unsigned srclen,char * dst,unsigned dstlen)1262 unsigned utf8tomb( const char* src, unsigned srclen,
1263 char* dst, unsigned dstlen )
1264 {
1265 if( !utf8locale() )
1266 {
1267 #ifdef _WIN32
1268 wchar_t lbuf[1024] = {};
1269 wchar_t* buf = lbuf;
1270 unsigned length = utf8towc(src, srclen, buf, 1024);
1271 unsigned ret;
1272 if( length >= 1024 )
1273 {
1274 buf = static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));
1275 utf8towc(src, srclen, buf, length + 1);
1276 }
1277 if( dstlen )
1278 {
1279 // apparently this does not null-terminate, even though msdn
1280 // documentation claims it does:
1281 ret =
1282 WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen, 0, 0);
1283 dst[ret] = 0;
1284 }
1285 // if it overflows or measuring length, get the actual length:
1286 if( dstlen==0 || ret >= dstlen-1 )
1287 ret = WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
1288 if( buf != lbuf ) free((void*)buf);
1289 return ret;
1290 #else
1291 wchar_t lbuf[1024] = {};
1292 wchar_t* buf = lbuf;
1293 unsigned length = utf8towc(src, srclen, buf, 1024);
1294 if( length >= 1024 )
1295 {
1296 buf = static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));
1297 utf8towc(src, srclen, buf, length+1);
1298 }
1299 int ret = 0;
1300 if( dstlen )
1301 {
1302 ret = wcstombs(dst, buf, dstlen);
1303 if( ret >= dstlen - 1 ) ret = wcstombs(0, buf, 0);
1304 } else {
1305 ret = wcstombs(0, buf, 0);
1306 }
1307 if( buf != lbuf ) free((void*)buf);
1308 if( ret >= 0 ) return (unsigned)ret;
1309 // On any errors we return the UTF-8 as raw text...
1310 #endif
1311 }
1312 // Identity transform:
1313 if( srclen < dstlen )
1314 {
1315 memcpy(dst, src, srclen);
1316 dst[srclen] = 0;
1317 } else {
1318 memcpy(dst, src, dstlen-1);
1319 dst[dstlen-1] = 0;
1320 }
1321 return srclen;
1322 }
1323
1324 /*! Convert a filename from the locale-specific multibyte encoding
1325 used by Windows to UTF-8 as used by FLTK.
1326
1327 Up to \a dstlen bytes are written to \a dst, including a null
1328 terminator. The return value is the number of bytes that would be
1329 written, not counting the null terminator. If greater or equal to
1330 \a dstlen then if you malloc a new array of size n+1 you will have
1331 the space needed for the entire string. If \a dstlen is zero then
1332 nothing is written and this call just measures the storage space
1333 needed.
1334
1335 On Unix or on Windows when a UTF-8 locale is in effect, this
1336 does not change the data. It is copied and truncated as necessary to
1337 the destination buffer and \a srclen is always returned.
1338 You may also want to check if utf8test() returns non-zero, so that
1339 the filesystem can store filenames in UTF-8 encoding regardless of
1340 the locale.
1341 */
utf8frommb(char * dst,unsigned dstlen,const char * src,unsigned srclen)1342 unsigned utf8frommb(char* dst, unsigned dstlen,
1343 const char* src, unsigned srclen)
1344 {
1345 if( !utf8locale() )
1346 {
1347 #ifdef _WIN32
1348 wchar_t lbuf[1024] = {};
1349 wchar_t* buf = lbuf;
1350 unsigned ret;
1351 const unsigned length =
1352 MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
1353 if( length >= 1024 )
1354 {
1355 length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
1356 buf = static_cast<wchar_t *>(malloc(length * sizeof(wchar_t)));
1357 MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
1358 }
1359 ret = utf8fromwc(dst, dstlen, buf, length);
1360 if( buf != lbuf ) free(buf);
1361 return ret;
1362 #else
1363 wchar_t lbuf[1024] = {};
1364 wchar_t* buf = lbuf;
1365 const int length = mbstowcs(buf, src, 1024);
1366 if( length >= 1024 )
1367 {
1368 length = mbstowcs(0, src, 0)+1;
1369 buf = static_cast<wchar_t *>(malloc(length*sizeof(unsigned short)));
1370 mbstowcs(buf, src, length);
1371 }
1372 if( length >= 0 )
1373 {
1374 const unsigned ret = utf8fromwc(dst, dstlen, buf, length);
1375 if( buf != lbuf ) free(buf);
1376 return ret;
1377 }
1378 // Errors in conversion return the UTF-8 unchanged.
1379 #endif
1380 }
1381 // Identity transform:
1382 if( srclen < dstlen )
1383 {
1384 memcpy(dst, src, srclen);
1385 dst[srclen] = 0;
1386 }
1387 else
1388 {
1389 memcpy(dst, src, dstlen-1);
1390 dst[dstlen-1] = 0;
1391 }
1392 return srclen;
1393 }
1394
1395 #endif // def notdef - disabled locale specific stuff.
1396
1397 /*! Examines the first \a srclen bytes in \a src and return a verdict
1398 on whether it is UTF-8 or not.
1399 - Returns 0 if there is any illegal UTF-8 sequences, using the
1400 same rules as utf8decode(). Note that some UCS values considered
1401 illegal by RFC 3629, such as 0xffff, are considered legal by this.
1402 - Returns 1 if there are only single-byte characters (i.e. no bytes
1403 have the high bit set). This is legal UTF-8, but also indicates
1404 plain ASCII. It also returns 1 if \a srclen is zero.
1405 - Returns 2 if there are only characters less than 0x800.
1406 - Returns 3 if there are only characters less than 0x10000.
1407 - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.
1408
1409 Because there are many illegal sequences in UTF-8, it is almost
1410 impossible for a string in another encoding to be confused with
1411 UTF-8. This is very useful for transitioning Unix to UTF-8
1412 filenames, you can simply test each filename with this to decide
1413 if it is UTF-8 or in the locale encoding. My hope is that if
1414 this is done we will be able to cleanly transition to a locale-less
1415 encoding.
1416 */
1417
utf8test(const char * src,unsigned srclen)1418 static int utf8test( const char* src, unsigned srclen )
1419 {
1420 int ret = 1;
1421 const char* p = src;
1422 const char* e = src + srclen;
1423 while( p < e )
1424 {
1425 if( *p == 0 )
1426 return 0;
1427 if( *p & 0x80 )
1428 {
1429 int len = 0;
1430 utf8decode(p, e, &len);
1431 if( len < 2 ) return 0;
1432 if( len > ret ) ret = len;
1433 p += len;
1434 } else {
1435 p++;
1436 }
1437 }
1438 return ret;
1439 }
1440
1441 #endif /* defined(CPL_RECODE_STUB) */
1442