1 /**********************************************************************
2 * $Id: cpl_recode_stub.cpp 14368 2008-04-30 02:22:31Z warmerdam $
3 *
4 * Name: cpl_recode.cpp
5 * Project: CPL - Common Portability Library
6 * Purpose: Character set recoding and char/wchar_t conversions.
7 * Author: Frank Warmerdam, warmerdam@pobox.com
8 *
9 * The bulk of this code is derived from the utf.c module from FLTK. It
10 * was originally downloaded from:
11 * http://svn.easysw.com/public/fltk/fltk/trunk/src/utf.c
12 *
13 **********************************************************************
14 * Copyright (c) 2008, Frank Warmerdam
15 * Copyright 2006 by Bill Spitzak and others.
16 *
17 * Permission to use, copy, modify, and distribute this software for any
18 * purpose with or without fee is hereby granted, provided that the above
19 * copyright notice and this permission notice appear in all copies.
20 *
21 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
22 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
23 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
24 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
25 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
26 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
27 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
28 **********************************************************************/
29
30 #include "cpl_string.h"
31
32 CPL_CVSID("$Id: cpl_recode_stub.cpp 14368 2008-04-30 02:22:31Z warmerdam $");
33
34 #define CPL_RECODE_STUB
35
36 #ifdef CPL_RECODE_STUB
37
38 static unsigned utf8decode(const char* p, const char* end, int* len);
39 static unsigned utf8towc(const char* src, unsigned srclen,
40 wchar_t* dst, unsigned dstlen);
41 static unsigned utf8toa(const char* src, unsigned srclen,
42 char* dst, unsigned dstlen);
43 static unsigned utf8fromwc(char* dst, unsigned dstlen,
44 const wchar_t* src, unsigned srclen);
45 static unsigned utf8froma(char* dst, unsigned dstlen,
46 const char* src, unsigned srclen);
47
48 #ifdef FUTURE_NEEDS
49 static const char* utf8fwd(const char* p, const char* start, const char* end);
50 static const char* utf8back(const char* p, const char* start, const char*end);
51 static int utf8encode(unsigned ucs, char* buf);
52 static int utf8bytes(unsigned ucs);
53 #endif /* def FUTURE_NEEDS */
54
55 /************************************************************************/
56 /* ==================================================================== */
57 /* Stub Implementation not depending on iconv() or WIN32 API. */
58 /* ==================================================================== */
59 /************************************************************************/
60
61 /************************************************************************/
62 /* CPLRecode() */
63 /************************************************************************/
64
CPLRecode(const char * pszSource,const char * pszSrcEncoding,const char * pszDstEncoding)65 char CPL_DLL *CPLRecode( const char *pszSource,
66 const char *pszSrcEncoding,
67 const char *pszDstEncoding )
68
69 {
70 /* -------------------------------------------------------------------- */
71 /* Handle a few common short cuts. */
72 /* -------------------------------------------------------------------- */
73 if( strcmp(pszSrcEncoding,pszDstEncoding) == 0 )
74 return CPLStrdup(pszSource);
75
76 if( strcmp(pszSrcEncoding,CPL_ENC_ASCII) == 0
77 && (strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0
78 || strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0) )
79 return CPLStrdup(pszSource);
80
81 /* -------------------------------------------------------------------- */
82 /* If the source or destination is current locale(), we change */
83 /* it to ISO8859-1 since our stub implementation does not */
84 /* attempt to address locales properly. */
85 /* -------------------------------------------------------------------- */
86
87 if( pszSrcEncoding[0] == '\0' )
88 pszSrcEncoding = CPL_ENC_ISO8859_1;
89
90 if( pszDstEncoding[0] == '\0' )
91 pszDstEncoding = CPL_ENC_ISO8859_1;
92
93 /* -------------------------------------------------------------------- */
94 /* ISO8859 to UTF8 */
95 /* -------------------------------------------------------------------- */
96 if( strcmp(pszSrcEncoding,CPL_ENC_ISO8859_1) == 0
97 && strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
98 {
99 int nCharCount = strlen(pszSource);
100 char *pszResult = (char *) CPLCalloc(1,nCharCount*2+1);
101
102 utf8froma( pszResult, nCharCount*2+1, pszSource, nCharCount );
103
104 return pszResult;
105 }
106
107 /* -------------------------------------------------------------------- */
108 /* UTF8 to ISO8859 */
109 /* -------------------------------------------------------------------- */
110 if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) == 0
111 && strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0 )
112 {
113 int nCharCount = strlen(pszSource);
114 char *pszResult = (char *) CPLCalloc(1,nCharCount+1);
115
116 utf8toa( pszSource, nCharCount, pszResult, nCharCount+1 );
117
118 return pszResult;
119 }
120
121 /* -------------------------------------------------------------------- */
122 /* Anything else to UTF-8 is treated as ISO8859-1 to UTF-8 with */
123 /* a one-time warning. */
124 /* -------------------------------------------------------------------- */
125 if( strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
126 {
127 int nCharCount = strlen(pszSource);
128 char *pszResult = (char *) CPLCalloc(1,nCharCount*2+1);
129 static int bHaveWarned = FALSE;
130
131 if( !bHaveWarned )
132 {
133 bHaveWarned = 1;
134 CPLError( CE_Warning, CPLE_AppDefined,
135 "Recode from %s to UTF-8 not supported, treated as ISO8859-1 to UTF-8.",
136 pszSrcEncoding );
137 }
138
139 utf8froma( pszResult, nCharCount*2+1, pszSource, nCharCount );
140
141 return pszResult;
142 }
143
144 /* -------------------------------------------------------------------- */
145 /* UTF-8 to anything else is treated as UTF-8 to ISO-8859-1 */
146 /* with a warning. */
147 /* -------------------------------------------------------------------- */
148 if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) == 0
149 && strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0 )
150 {
151 int nCharCount = strlen(pszSource);
152 char *pszResult = (char *) CPLCalloc(1,nCharCount+1);
153 static int bHaveWarned = FALSE;
154
155 if( !bHaveWarned )
156 {
157 bHaveWarned = 1;
158 CPLError( CE_Warning, CPLE_AppDefined,
159 "Recode from UTF-8 to %s not supported, treated as UTF-8 to ISO8859-1.",
160 pszDstEncoding );
161 }
162
163 utf8toa( pszSource, nCharCount, pszResult, nCharCount+1 );
164
165 return pszResult;
166 }
167
168 /* -------------------------------------------------------------------- */
169 /* Everything else is treated as a no-op with a warning. */
170 /* -------------------------------------------------------------------- */
171 {
172 static int bHaveWarned = FALSE;
173
174 if( !bHaveWarned )
175 {
176 bHaveWarned = 1;
177 CPLError( CE_Warning, CPLE_AppDefined,
178 "Recode from %s to %s not supported, no change applied.",
179 pszSrcEncoding, pszDstEncoding );
180 }
181
182 return CPLStrdup(pszSource);
183 }
184 }
185
186 /************************************************************************/
187 /* CPLRecodeFromWChar() */
188 /************************************************************************/
189
190 /**
191 * Convert wchar_t string to UTF-8.
192 *
193 * Convert a wchar_t string into a multibyte utf-8 string. The only
194 * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
195 * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
196 * and CPL_ENC_ISO8859_1. In some cases (ie. using iconv()) other encodings
197 * may also be supported.
198 *
199 * Note that the wchar_t type varies in size on different systems. On
200 * win32 it is normally 2 bytes, and on unix 4 bytes.
201 *
202 * If an error occurs an error may, or may not be posted with CPLError().
203 *
204 * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
205 * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
206 * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
207 *
208 * @return a zero terminated multi-byte string which should be freed with
209 * CPLFree(), or NULL if an error occurs.
210 */
211
CPLRecodeFromWChar(const wchar_t * pwszSource,const char * pszSrcEncoding,const char * pszDstEncoding)212 char CPL_DLL *CPLRecodeFromWChar( const wchar_t *pwszSource,
213 const char *pszSrcEncoding,
214 const char *pszDstEncoding )
215
216 {
217 /* -------------------------------------------------------------------- */
218 /* We try to avoid changes of character set. We are just */
219 /* providing for unicode to unicode. */
220 /* -------------------------------------------------------------------- */
221 if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) != 0
222 && strcmp(pszSrcEncoding,CPL_ENC_UTF16) != 0
223 && strcmp(pszSrcEncoding,CPL_ENC_UCS2) != 0
224 && strcmp(pszSrcEncoding,CPL_ENC_UCS4) != 0 )
225 {
226 CPLError( CE_Failure, CPLE_AppDefined,
227 "Stub recoding implementation does not support\n"
228 "CPLRecodeFromWChar(...,%s,%s)",
229 pszSrcEncoding, pszDstEncoding );
230 return NULL;
231 }
232
233 /* -------------------------------------------------------------------- */
234 /* What is the source length. */
235 /* -------------------------------------------------------------------- */
236 int nSrcLen = 0;
237
238 while( pwszSource[nSrcLen] != 0 )
239 nSrcLen++;
240
241 /* -------------------------------------------------------------------- */
242 /* Allocate destination buffer plenty big. */
243 /* -------------------------------------------------------------------- */
244 char *pszResult;
245 int nDstBufSize, nDstLen;
246
247 nDstBufSize = nSrcLen * 4 + 1;
248 pszResult = (char *) CPLMalloc(nDstBufSize); // nearly worst case.
249
250 /* -------------------------------------------------------------------- */
251 /* Convert, and confirm we had enough space. */
252 /* -------------------------------------------------------------------- */
253 nDstLen = utf8fromwc( pszResult, nDstBufSize, pwszSource, nSrcLen );
254 if( nDstLen >= nDstBufSize - 1 )
255 {
256 CPLAssert( FALSE ); // too small!
257 return NULL;
258 }
259
260 /* -------------------------------------------------------------------- */
261 /* If something other than UTF-8 was requested, recode now. */
262 /* -------------------------------------------------------------------- */
263 if( strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
264 return pszResult;
265
266 char *pszFinalResult =
267 CPLRecode( pszResult, CPL_ENC_UTF8, pszDstEncoding );
268
269 CPLFree( pszResult );
270
271 return pszFinalResult;
272 }
273
274 /************************************************************************/
275 /* CPLRecodeToWChar() */
276 /************************************************************************/
277
278 /**
279 * Convert UTF-8 string to a wchar_t string.
280 *
281 * Convert a 8bit, multi-byte per character input string into a wide
282 * character (wchar_t) string. The only guaranteed supported source encodings
283 * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8869_1 (LATIN1). The only
284 * guaranteed supported destination encoding is CPL_ENC_UCS2. Other source
285 * and destination encodings may be supported depending on the underlying
286 * implementation.
287 *
288 * Note that the wchar_t type varies in size on different systems. On
289 * win32 it is normally 2 bytes, and on unix 4 bytes.
290 *
291 * If an error occurs an error may, or may not be posted with CPLError().
292 *
293 * @param pszSource input multi-byte character string.
294 * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
295 * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
296 *
297 * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
298 * NULL on error.
299 */
300
CPLRecodeToWChar(const char * pszSource,const char * pszSrcEncoding,const char * pszDstEncoding)301 wchar_t CPL_DLL *CPLRecodeToWChar( const char *pszSource,
302 const char *pszSrcEncoding,
303 const char *pszDstEncoding )
304
305 {
306 char *pszUTF8Source = (char *) pszSource;
307
308 if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) != 0
309 && strcmp(pszSrcEncoding,CPL_ENC_ASCII) != 0 )
310 {
311 pszUTF8Source = CPLRecode( pszSource, pszSrcEncoding, CPL_ENC_UTF8 );
312 if( pszUTF8Source == NULL )
313 return NULL;
314 }
315
316 /* -------------------------------------------------------------------- */
317 /* We try to avoid changes of character set. We are just */
318 /* providing for unicode to unicode. */
319 /* -------------------------------------------------------------------- */
320 if( strcmp(pszDstEncoding,CPL_ENC_UCS2) != 0
321 && strcmp(pszDstEncoding,CPL_ENC_UCS4) != 0
322 && strcmp(pszDstEncoding,CPL_ENC_UTF16) != 0 )
323 {
324 CPLError( CE_Failure, CPLE_AppDefined,
325 "Stub recoding implementation does not support\n"
326 "CPLRecodeToWChar(...,%s,%s)",
327 pszSrcEncoding, pszDstEncoding );
328 return NULL;
329 }
330
331 /* -------------------------------------------------------------------- */
332 /* Do the UTF-8 to UCS-2 recoding. */
333 /* -------------------------------------------------------------------- */
334 int nSrcLen = strlen(pszUTF8Source);
335 wchar_t *pwszResult = (wchar_t *) CPLCalloc(sizeof(wchar_t),nSrcLen+1);
336
337 utf8towc( pszUTF8Source, nSrcLen, pwszResult, nSrcLen+1 );
338
339 if( pszUTF8Source != pszSource )
340 CPLFree( pszUTF8Source );
341
342 return pwszResult;
343 }
344
345 /************************************************************************/
346 /* ==================================================================== */
347 /* UTF.C code from FLTK with some modifications. */
348 /* ==================================================================== */
349 /************************************************************************/
350
/* Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
   they are instead turned into the Unicode REPLACEMENT CHARACTER, of
   value 0xfffd.
354 If this is on utf8decode will correctly map most (perhaps all)
355 human-readable text that is in ISO-8859-1. This may allow you
356 to completely ignore character sets in your code because virtually
357 everything is either ISO-8859-1 or UTF-8.
358 */
359 #define ERRORS_TO_ISO8859_1 1
360
361 /* Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
362 Unicode index for Microsoft's CP1252 character set. You should
363 also set ERRORS_TO_ISO8859_1. With this a huge amount of more
364 available text (such as all web pages) are correctly converted
365 to Unicode.
366 */
367 #define ERRORS_TO_CP1252 1
368
369 /* A number of Unicode code points are in fact illegal and should not
   be produced by a UTF-8 converter. Turning this on will replace the
371 bytes in those encodings with errors. If you do this then converting
372 arbitrary 16-bit data to UTF-8 and then back is not an identity,
373 which will probably break a lot of software.
374 */
375 #define STRICT_RFC3629 0
376
377 #if ERRORS_TO_CP1252
// Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
// to Unicode.  The table is indexed by (byte - 0x80).  Entries whose
// value equals the byte itself (0x81, 0x8d, 0x8f, 0x90, 0x9d) are passed
// through unchanged.  The table is read-only lookup data, hence const.
static const unsigned short cp1252[32] = {
    0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
    0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
    0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
    0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
};
386 #endif
387
388 /************************************************************************/
389 /* utf8decode() */
390 /************************************************************************/
391
/*
    Decode a single UTF-8 encoded character starting at \e p. The
    resulting Unicode value (in the range 0-0x10ffff) is returned,
    and \e len is set to the number of bytes in the UTF-8 encoding
    (adding \e len to \e p will point at the next character).

    If \a p points at an illegal UTF-8 encoding, including one that
    would go past \e end, or where a code uses more bytes than
    necessary, then \e len is set to 1 and a single byte is consumed.
    What is returned for that byte depends on the build-time switches:
    with ERRORS_TO_CP1252, bytes 0x80-0x9f are translated through the
    cp1252 table; with ERRORS_TO_ISO8859_1 any other bad byte is
    returned as-is; otherwise 0xfffd (REPLACEMENT CHARACTER) is returned.
    Treating errors this way allows this to decode almost any
    ISO-8859-1 or CP1252 text that has been mistakenly placed where
    UTF-8 is expected, and has proven very useful.

    If you want errors to be converted to error characters (as the
    standards recommend), adding a test to see if the length is
    unexpectedly 1 will work:

\code
    if (*p & 0x80) {              // what should be a multibyte encoding
      code = utf8decode(p,end,&len);
      if (len<2) code = 0xFFFD;   // Turn errors into REPLACEMENT CHARACTER
    } else {                      // handle the 1-byte utf8 encoding:
      code = *p;
      len = 1;
    }
\endcode

    Direct testing for the 1-byte case (as shown above) will also
    speed up the scanning of strings where the majority of characters
    are ASCII.

    The caller must guarantee that \e p itself is readable; every
    further byte is only read after checking that it lies before \e end.
*/
static unsigned utf8decode(const char* p, const char* end, int* len)
{
    const unsigned char* s = (const unsigned char*) p;
    const unsigned char c = s[0];

    // Plain ASCII.
    if (c < 0x80) {
        *len = 1;
        return c;
    }
#if ERRORS_TO_CP1252
    // 0x80-0x9f can never start a sequence: treat as CP1252.
    if (c < 0xa0) {
        *len = 1;
        return cp1252[c-0x80];
    }
#endif
    // Stray continuation byte, or 0xc0/0xc1 (always overlong).
    if (c < 0xc2) goto FAIL;

    // Every remaining form needs a valid second byte.
    if (p+1 >= end || (s[1]&0xc0) != 0x80) goto FAIL;

    if (c < 0xe0) {
        // Two byte sequence.
        *len = 2;
        return
            ((c & 0x1f) << 6) +
            ((s[1] & 0x3f));
    }

    if (c < 0xf0) {
        // Three byte sequence.
        if (c == 0xe0 && s[1] < 0xa0) goto FAIL;  // overlong encoding
#if STRICT_RFC3629
        // RFC 3629 says surrogate chars are illegal.
        if (c == 0xed && s[1] >= 0xa0) goto FAIL;
#endif
        // The third byte must be bounds-checked before anything reads it.
        if (p+2 >= end || (s[2]&0xc0) != 0x80) goto FAIL;
#if STRICT_RFC3629
        // 0xfffe and 0xffff are also illegal characters.
        if (c == 0xef && s[1] == 0xbf && s[2] >= 0xbe) goto FAIL;
#endif
        *len = 3;
        return
            ((c & 0x0f) << 12) +
            ((s[1] & 0x3f) << 6) +
            ((s[2] & 0x3f));
    }

    if (c <= 0xf4) {
        // Four byte sequence.
        if (c == 0xf0 && s[1] < 0x90) goto FAIL;  // overlong encoding
        if (c == 0xf4 && s[1] > 0x8f) goto FAIL;  // after 0x10ffff
        if (p+3 >= end || (s[2]&0xc0) != 0x80 || (s[3]&0xc0) != 0x80)
            goto FAIL;
#if STRICT_RFC3629
        // RFC 3629 says all codes ending in fffe or ffff are illegal:
        if ((s[1]&0xf) == 0xf && s[2] == 0xbf && s[3] >= 0xbe) goto FAIL;
#endif
        *len = 4;
        return
            ((c & 0x07) << 18) +
            ((s[1] & 0x3f) << 12) +
            ((s[2] & 0x3f) << 6) +
            ((s[3] & 0x3f));
    }

    // 0xf5-0xff never appear in valid UTF-8: fall through to the error path.
FAIL:
    *len = 1;
#if ERRORS_TO_ISO8859_1
    return c;
#else
    return 0xfffd; // Unicode REPLACEMENT CHARACTER
#endif
}
497
498 /************************************************************************/
499 /* utf8fwd() */
500 /************************************************************************/
501
502 /*
503 Move \a p forward until it points to the start of a UTF-8
504 character. If it already points at the start of one then it
505 is returned unchanged. Any UTF-8 errors are treated as though each
506 byte of the error is an individual character.
507
508 \e start is the start of the string and is used to limit the
509 backwards search for the start of a utf8 character.
510
    \e end is the end of the string and is assumed to be a break
    between characters. It is assumed to be greater than p.

    This function is for moving a pointer that was jumped to the
    middle of a string, such as when doing a binary search for
    a position. You should use either this or utf8back() depending
    on which direction your algorithm can handle the pointer
    moving. Do not use this to scan strings, use utf8decode()
    instead.
520 */
521
522 #ifdef FUTURE_NEEDS
utf8fwd(const char * p,const char * start,const char * end)523 static const char* utf8fwd(const char* p, const char* start, const char* end)
524 {
525 const char* a;
526 int len;
527 // if we are not pointing at a continuation character, we are done:
528 if ((*p&0xc0) != 0x80) return p;
529 // search backwards for a 0xc0 starting the character:
530 for (a = p-1; ; --a) {
531 if (a < start) return p;
532 if (!(a[0]&0x80)) return p;
533 if ((a[0]&0x40)) break;
534 }
535 utf8decode(a,end,&len);
536 a += len;
537 if (a > p) return a;
538 return p;
539 }
540 #endif /* def FUTURE_NEEDS */
541
542 /************************************************************************/
543 /* utf8back() */
544 /************************************************************************/
545
546 /*
547 Move \a p backward until it points to the start of a UTF-8
548 character. If it already points at the start of one then it
549 is returned unchanged. Any UTF-8 errors are treated as though each
550 byte of the error is an individual character.
551
552 \e start is the start of the string and is used to limit the
553 backwards search for the start of a UTF-8 character.
554
    \e end is the end of the string and is assumed to be a break
    between characters. It is assumed to be greater than p.
557
558 If you wish to decrement a UTF-8 pointer, pass p-1 to this.
559 */
560
561 #ifdef FUTURE_NEEDS
utf8back(const char * p,const char * start,const char * end)562 static const char* utf8back(const char* p, const char* start, const char* end)
563 {
564 const char* a;
565 int len;
566 // if we are not pointing at a continuation character, we are done:
567 if ((*p&0xc0) != 0x80) return p;
568 // search backwards for a 0xc0 starting the character:
569 for (a = p-1; ; --a) {
570 if (a < start) return p;
571 if (!(a[0]&0x80)) return p;
572 if ((a[0]&0x40)) break;
573 }
574 utf8decode(a,end,&len);
575 if (a+len > p) return a;
576 return p;
577 }
578 #endif /* def FUTURE_NEEDS */
579
580 /************************************************************************/
581 /* utf8bytes() */
582 /************************************************************************/
583
584 /* Returns number of bytes that utf8encode() will use to encode the
585 character \a ucs. */
586 #ifdef FUTURE_NEEDS
/* Number of bytes utf8encode() uses for \a ucs: 1 to 4 for code points
   up to and including 0x10ffff, and 3 for anything larger, because those
   are encoded as 0xFFFD (REPLACEMENT CHARACTER). */
static int utf8bytes(unsigned ucs) {
    if (ucs < 0x000080U) {
        return 1;
    } else if (ucs < 0x000800U) {
        return 2;
    } else if (ucs < 0x010000U) {
        return 3;
    } else if (ucs <= 0x10ffffU) {
        // 0x10ffff itself is the last legal code point, so the bound
        // is inclusive.
        return 4;
    } else {
        return 3; // length of the illegal character encoding
    }
}
600 #endif /* def FUTURE_NEEDS */
601
602 /************************************************************************/
603 /* utf8encode() */
604 /************************************************************************/
605
606 /* Write the UTF-8 encoding of \e ucs into \e buf and return the
607 number of bytes written. Up to 4 bytes may be written. If you know
608 that \a ucs is less than 0x10000 then at most 3 bytes will be written.
609 If you wish to speed this up, remember that anything less than 0x80
610 is written as a single byte.
611
612 If ucs is greater than 0x10ffff this is an illegal character
613 according to RFC 3629. These are converted as though they are
614 0xFFFD (REPLACEMENT CHARACTER).
615
616 RFC 3629 also says many other values for \a ucs are illegal (in
617 the range 0xd800 to 0xdfff, or ending with 0xfffe or
618 0xffff). However I encode these as though they are legal, so that
619 utf8encode/utf8decode will be the identity for all codes between 0
620 and 0x10ffff.
621 */
622 #ifdef FUTURE_NEEDS
/* Write the UTF-8 encoding of \a ucs into \a buf (which must have room
   for 4 bytes) and return the number of bytes written.  Code points
   above 0x10ffff are written as 0xFFFD (REPLACEMENT CHARACTER), which
   takes 3 bytes.  No terminator is written. */
static int utf8encode(unsigned ucs, char* buf) {
    if (ucs < 0x000080U) {
        buf[0] = (char) ucs;
        return 1;
    } else if (ucs < 0x000800U) {
        buf[0] = (char) (0xc0 | (ucs >> 6));
        buf[1] = (char) (0x80 | (ucs & 0x3F));
        return 2;
    } else if (ucs < 0x010000U) {
        buf[0] = (char) (0xe0 | (ucs >> 12));
        buf[1] = (char) (0x80 | ((ucs >> 6) & 0x3F));
        buf[2] = (char) (0x80 | (ucs & 0x3F));
        return 3;
    } else if (ucs <= 0x0010ffffU) {
        // Inclusive bound: 0x10ffff is the last legal code point and
        // must be encoded as itself, not as the replacement character.
        buf[0] = (char) (0xf0 | (ucs >> 18));
        buf[1] = (char) (0x80 | ((ucs >> 12) & 0x3F));
        buf[2] = (char) (0x80 | ((ucs >> 6) & 0x3F));
        buf[3] = (char) (0x80 | (ucs & 0x3F));
        return 4;
    } else {
        // encode 0xfffd:
        buf[0] = (char) 0xefU;
        buf[1] = (char) 0xbfU;
        buf[2] = (char) 0xbdU;
        return 3;
    }
}
650 #endif /* def FUTURE_NEEDS */
651
652 /************************************************************************/
653 /* utf8towc() */
654 /************************************************************************/
655
656 /* Convert a UTF-8 sequence into an array of wchar_t. These
657 are used by some system calls, especially on Windows.
658
659 \a src points at the UTF-8, and \a srclen is the number of bytes to
660 convert.
661
662 \a dst points at an array to write, and \a dstlen is the number of
663 locations in this array. At most \a dstlen-1 words will be
664 written there, plus a 0 terminating word. Thus this function
665 will never overwrite the buffer and will always return a
666 zero-terminated string. If \a dstlen is zero then \a dst can be
667 null and no data is written, but the length is returned.
668
669 The return value is the number of words that \e would be written
670 to \a dst if it were long enough, not counting the terminating
671 zero. If the return value is greater or equal to \a dstlen it
672 indicates truncation, you can then allocate a new array of size
673 return+1 and call this again.
674
675 Errors in the UTF-8 are converted as though each byte in the
676 erroneous string is in the Microsoft CP1252 encoding. This allows
677 ISO-8859-1 text mistakenly identified as UTF-8 to be printed
678 correctly.
679
680 Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
681 and most other systems. Where wchar_t is 16 bits, Unicode
682 characters in the range 0x10000 to 0x10ffff are converted to
683 "surrogate pairs" which take two words each (this is called UTF-16
684 encoding). If wchar_t is 32 bits this rather nasty problem is
685 avoided.
686 */
utf8towc(const char * src,unsigned srclen,wchar_t * dst,unsigned dstlen)687 static unsigned utf8towc(const char* src, unsigned srclen,
688 wchar_t* dst, unsigned dstlen)
689 {
690 const char* p = src;
691 const char* e = src+srclen;
692 unsigned count = 0;
693 if (dstlen) for (;;) {
694 if (p >= e) {dst[count] = 0; return count;}
695 if (!(*p & 0x80)) { // ascii
696 dst[count] = *p++;
697 } else {
698 int len; unsigned ucs = utf8decode(p,e,&len);
699 p += len;
700 #ifdef _WIN32
701 if (ucs < 0x10000) {
702 dst[count] = ucs;
703 } else {
704 // make a surrogate pair:
705 if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;}
706 dst[count] = (((ucs-0x10000u)>>10)&0x3ff) | 0xd800;
707 dst[++count] = (ucs&0x3ff) | 0xdc00;
708 }
709 #else
710 dst[count] = (wchar_t)ucs;
711 #endif
712 }
713 if (++count == dstlen) {dst[count-1] = 0; break;}
714 }
715 // we filled dst, measure the rest:
716 while (p < e) {
717 if (!(*p & 0x80)) p++;
718 else {
719 #ifdef _WIN32
720 int len; unsigned ucs = utf8decode(p,e,&len);
721 p += len;
722 if (ucs >= 0x10000) ++count;
723 #else
724 int len; utf8decode(p,e,&len);
725 p += len;
726 #endif
727 }
728 ++count;
729 }
730 return count;
731 }
732
733 /************************************************************************/
734 /* utf8toa() */
735 /************************************************************************/
736 /* Convert a UTF-8 sequence into an array of 1-byte characters.
737
738 If the UTF-8 decodes to a character greater than 0xff then it is
739 replaced with '?'.
740
741 Errors in the UTF-8 are converted as individual bytes, same as
742 utf8decode() does. This allows ISO-8859-1 text mistakenly identified
743 as UTF-8 to be printed correctly (and possibly CP1512 on Windows).
744
745 \a src points at the UTF-8, and \a srclen is the number of bytes to
746 convert.
747
748 Up to \a dstlen bytes are written to \a dst, including a null
749 terminator. The return value is the number of bytes that would be
750 written, not counting the null terminator. If greater or equal to
751 \a dstlen then if you malloc a new array of size n+1 you will have
752 the space needed for the entire string. If \a dstlen is zero then
753 nothing is written and this call just measures the storage space
754 needed.
755 */
utf8toa(const char * src,unsigned srclen,char * dst,unsigned dstlen)756 static unsigned utf8toa(const char* src, unsigned srclen,
757 char* dst, unsigned dstlen)
758 {
759 const char* p = src;
760 const char* e = src+srclen;
761 unsigned count = 0;
762 if (dstlen) for (;;) {
763 unsigned char c;
764 if (p >= e) {dst[count] = 0; return count;}
765 c = *(unsigned char*)p;
766 if (c < 0xC2) { // ascii or bad code
767 dst[count] = c;
768 p++;
769 } else {
770 int len; unsigned ucs = utf8decode(p,e,&len);
771 p += len;
772 if (ucs < 0x100) dst[count] = ucs;
773 else dst[count] = '?';
774 }
775 if (++count >= dstlen) {dst[count-1] = 0; break;}
776 }
777 // we filled dst, measure the rest:
778 while (p < e) {
779 if (!(*p & 0x80)) p++;
780 else {
781 int len;
782 utf8decode(p,e,&len);
783 p += len;
784 }
785 ++count;
786 }
787 return count;
788 }
789
790 /************************************************************************/
791 /* utf8fromwc() */
792 /************************************************************************/
/* Turn "wide characters" as returned by some system calls
    (especially on Windows) into UTF-8.

    Up to \a dstlen bytes are written to \a dst, including a null
    terminator. The return value is the number of bytes that would be
    written, not counting the null terminator. If greater or equal to
    \a dstlen then if you malloc a new array of size n+1 you will have
    the space needed for the entire string. If \a dstlen is zero then
    nothing is written and this call just measures the storage space
    needed.

    \a srclen is the number of words in \a src to convert. On Windows
    this is not necessarily the number of characters, due to there
    possibly being "surrogate pairs" in the UTF-16 encoding used.
    On Unix wchar_t is 32 bits and each location is a character.

    On Unix if a src word is greater than 0x10ffff then this is an
    illegal character according to RFC 3629. These are converted as
    though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
    range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
    illegal according to RFC 3629. However I encode these as though
    they are legal, so that utf8towc will return the original data.

    On Windows "surrogate pairs" are converted to a single character
    and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
    pairs are converted as though they are individual characters.

    A multi-byte sequence is only written when the whole sequence and
    the terminator fit; otherwise the output is terminated where it is
    and the remaining input is only measured.
*/
static unsigned utf8fromwc(char* dst, unsigned dstlen,
                           const wchar_t* src, unsigned srclen)
{
    unsigned i = 0;
    unsigned count = 0;

    if (dstlen) for (;;) {
        unsigned ucs;

        // Input exhausted: terminate and report the exact length.
        if (i >= srclen) {
            dst[count] = 0;
            return count;
        }
        ucs = src[i++];

        if (ucs < 0x80U) {
            dst[count++] = (char) ucs;
            // Last slot reached: it has to hold the terminator instead.
            if (count >= dstlen) { dst[count-1] = 0; break; }
            continue;
        }

        if (ucs < 0x800U) { // 2 bytes
            if (count+2 >= dstlen) { dst[count] = 0; count += 2; break; }
            dst[count++] = (char) (0xc0 | (ucs >> 6));
            dst[count++] = (char) (0x80 | (ucs & 0x3F));
            continue;
        }

#ifdef _WIN32
        if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
            src[i] >= 0xdc00 && src[i] <= 0xdfff) {
            // surrogate pair: merge both halves into one code point;
            // all surrogate pairs turn into 4-byte utf8
            unsigned ucs2 = src[i++];
            ucs = 0x10000U + ((ucs&0x3ff)<<10) + (ucs2&0x3ff);
        }
#else
        // Beyond the Unicode range: emit REPLACEMENT CHARACTER instead.
        if (ucs > 0x10ffff)
            ucs = 0xfffd;
#endif

        if (ucs >= 0x10000) { // 4 bytes
            if (count+4 >= dstlen) { dst[count] = 0; count += 4; break; }
            dst[count++] = (char) (0xf0 | (ucs >> 18));
            dst[count++] = (char) (0x80 | ((ucs >> 12) & 0x3F));
            dst[count++] = (char) (0x80 | ((ucs >> 6) & 0x3F));
            dst[count++] = (char) (0x80 | (ucs & 0x3F));
        } else { // all others are 3 bytes
            if (count+3 >= dstlen) { dst[count] = 0; count += 3; break; }
            dst[count++] = (char) (0xe0 | (ucs >> 12));
            dst[count++] = (char) (0x80 | ((ucs >> 6) & 0x3F));
            dst[count++] = (char) (0x80 | (ucs & 0x3F));
        }
    }

    // we filled dst (or were only asked to measure): count the rest.
    while (i < srclen) {
        unsigned ucs = src[i++];
        if (ucs < 0x80U) {
            count++;
        } else if (ucs < 0x800U) { // 2 bytes
            count += 2;
#ifdef _WIN32
        } else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
                   src[i] >= 0xdc00 && src[i] <= 0xdfff) {
            // surrogate pair.  i already indexes the word after the high
            // half, so the low half is src[i]; this matches the test used
            // by the conversion loop above.
            ++i;
            count += 4;
#else
        } else if (ucs >= 0x10000 && ucs <= 0x10ffff) {
            count += 4;
#endif
        } else {
            count += 3;
        }
    }
    return count;
}
887
888
889 /************************************************************************/
890 /* utf8froma() */
891 /************************************************************************/
892
893 /* Convert an ISO-8859-1 (ie normal c-string) byte stream to UTF-8.
894
895 It is possible this should convert Microsoft's CP1252 to UTF-8
896 instead. This would translate the codes in the range 0x80-0x9f
897 to different characters. Currently it does not do this.
898
899 Up to \a dstlen bytes are written to \a dst, including a null
900 terminator. The return value is the number of bytes that would be
901 written, not counting the null terminator. If greater or equal to
902 \a dstlen then if you malloc a new array of size n+1 you will have
903 the space needed for the entire string. If \a dstlen is zero then
904 nothing is written and this call just measures the storage space
905 needed.
906
907 \a srclen is the number of bytes in \a src to convert.
908
909 If the return value equals \a srclen then this indicates that
910 no conversion is necessary, as only ASCII characters are in the
911 string.
912 */
static unsigned utf8froma(char* dst, unsigned dstlen,
                          const char* src, unsigned srclen) {
    // Walk the input as unsigned bytes so values 0x80..0xff compare sanely.
    const unsigned char* cur = (const unsigned char*)src;
    const unsigned char* const stop = cur + srclen;
    unsigned written = 0;  // bytes emitted, or that would have been emitted

    if (dstlen != 0) {
        // Writing pass: runs until the input is exhausted (return) or the
        // next sequence plus the NUL no longer fits (break).
        while (1) {
            unsigned char byte;
            if (cur >= stop) {
                dst[written] = 0;
                return written;
            }
            byte = *cur++;
            if (byte >= 0x80U) {
                // Latin-1 high half: always a two byte UTF-8 sequence
                // (a CP1252 translation could need three).
                if (written + 2 >= dstlen) {
                    dst[written] = 0;
                    written += 2;
                    break;
                }
                dst[written] = (char)(0xc0 | (byte >> 6));
                dst[written + 1] = (char)(0x80 | (byte & 0x3F));
                written += 2;
            } else {
                // ASCII is copied through; if it landed on the last slot it
                // is replaced by the terminator but still counted.
                dst[written] = (char)byte;
                written++;
                if (written >= dstlen) {
                    dst[written - 1] = 0;
                    break;
                }
            }
        }
    }

    // Measuring pass: count what the remaining input would need.
    for (; cur < stop; ++cur) {
        written += (*cur < 0x80U) ? 1 : 2;
    }
    return written;
}
942
943 /*
944 ** For now we disable the rest which is locale() related. We may need
945 ** parts of it later.
946 */
947
948 #ifdef notdef
949
950 #ifdef _WIN32
951 # include <windows.h>
952 #endif
953
954 /*! Return true if the "locale" seems to indicate that UTF-8 encoding
955 is used. If true the utf8tomb and utf8frommb don't do anything
956 useful.
957
958 <i>It is highly recommended that you change your system so this
959 does return true.</i> On Windows this is done by setting the
960 "codepage" to CP_UTF8. On Unix this is done by setting $LC_CTYPE
961 to a string containing the letters "utf" or "UTF" in it, or by
962 deleting all $LC* and $LANG environment variables. In the future
963 it is likely that all non-Asian Unix systems will return true,
    due to the compatibility of UTF-8 with ISO-8859-1.
965 */
int utf8locale(void) {
    // 2 means "not probed yet"; afterwards this caches the 0/1 answer so the
    // environment is only inspected on the first call.
    static int cached = 2;
    if (cached != 2) {
        return cached;
    }
#ifdef _WIN32
    cached = (GetACP() == CP_UTF8);
#else
    {
        // First variable that is set and non-empty decides, in this order.
        static const char* const vars[] = { "LC_CTYPE", "LC_ALL", "LANG" };
        const char* value = 0;
        unsigned k;
        for (k = 0; k < 3 && !value; k++) {
            const char* candidate = getenv(vars[k]);
            if (candidate && *candidate) {
                value = candidate;
            }
        }
        if (!value) {
            cached = 1;  // assume UTF-8 if no locale
        } else {
            cached = (strstr(value, "utf") != 0 || strstr(value, "UTF") != 0);
        }
    }
#endif
    return cached;
}
983
984 /*! Convert the UTF-8 used by FLTK to the locale-specific encoding
985 used for filenames (and sometimes used for data in files).
    Unfortunately due to stupid design you will have to do this as
987 needed for filenames. This is a bug on both Unix and Windows.
988
989 Up to \a dstlen bytes are written to \a dst, including a null
990 terminator. The return value is the number of bytes that would be
991 written, not counting the null terminator. If greater or equal to
992 \a dstlen then if you malloc a new array of size n+1 you will have
993 the space needed for the entire string. If \a dstlen is zero then
994 nothing is written and this call just measures the storage space
995 needed.
996
997 If utf8locale() returns true then this does not change the data.
998 It is copied and truncated as necessary to
999 the destination buffer and \a srclen is always returned. */
utf8tomb(const char * src,unsigned srclen,char * dst,unsigned dstlen)1000 unsigned utf8tomb(const char* src, unsigned srclen,
1001 char* dst, unsigned dstlen)
1002 {
1003 if (!utf8locale()) {
1004 #ifdef _WIN32
1005 wchar_t lbuf[1024];
1006 wchar_t* buf = lbuf;
1007 unsigned length = utf8towc(src, srclen, buf, 1024);
1008 unsigned ret;
1009 if (length >= 1024) {
1010 buf = (wchar_t*)(malloc((length+1)*sizeof(wchar_t)));
1011 utf8towc(src, srclen, buf, length+1);
1012 }
1013 if (dstlen) {
1014 // apparently this does not null-terminate, even though msdn
1015 // documentation claims it does:
1016 ret =
1017 WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen, 0, 0);
1018 dst[ret] = 0;
1019 }
1020 // if it overflows or measuring length, get the actual length:
1021 if (dstlen==0 || ret >= dstlen-1)
1022 ret =
1023 WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
1024 if (buf != lbuf) free((void*)buf);
1025 return ret;
1026 #else
1027 wchar_t lbuf[1024];
1028 wchar_t* buf = lbuf;
1029 unsigned length = utf8towc(src, srclen, buf, 1024);
1030 int ret;
1031 if (length >= 1024) {
1032 buf = (wchar_t*)(malloc((length+1)*sizeof(wchar_t)));
1033 utf8towc(src, srclen, buf, length+1);
1034 }
1035 if (dstlen) {
1036 ret = wcstombs(dst, buf, dstlen);
1037 if (ret >= dstlen-1) ret = wcstombs(0,buf,0);
1038 } else {
1039 ret = wcstombs(0,buf,0);
1040 }
1041 if (buf != lbuf) free((void*)buf);
1042 if (ret >= 0) return (unsigned)ret;
1043 // on any errors we return the UTF-8 as raw text...
1044 #endif
1045 }
1046 // identity transform:
1047 if (srclen < dstlen) {
1048 memcpy(dst, src, srclen);
1049 dst[srclen] = 0;
1050 } else {
1051 memcpy(dst, src, dstlen-1);
1052 dst[dstlen-1] = 0;
1053 }
1054 return srclen;
1055 }
1056
1057 /*! Convert a filename from the locale-specific multibyte encoding
1058 used by Windows to UTF-8 as used by FLTK.
1059
1060 Up to \a dstlen bytes are written to \a dst, including a null
1061 terminator. The return value is the number of bytes that would be
1062 written, not counting the null terminator. If greater or equal to
1063 \a dstlen then if you malloc a new array of size n+1 you will have
1064 the space needed for the entire string. If \a dstlen is zero then
1065 nothing is written and this call just measures the storage space
1066 needed.
1067
1068 On Unix or on Windows when a UTF-8 locale is in effect, this
1069 does not change the data. It is copied and truncated as necessary to
1070 the destination buffer and \a srclen is always returned.
1071 You may also want to check if utf8test() returns non-zero, so that
1072 the filesystem can store filenames in UTF-8 encoding regardless of
1073 the locale.
1074 */
utf8frommb(char * dst,unsigned dstlen,const char * src,unsigned srclen)1075 unsigned utf8frommb(char* dst, unsigned dstlen,
1076 const char* src, unsigned srclen)
1077 {
1078 if (!utf8locale()) {
1079 #ifdef _WIN32
1080 wchar_t lbuf[1024];
1081 wchar_t* buf = lbuf;
1082 unsigned length;
1083 unsigned ret;
1084 length =
1085 MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
1086 if (length >= 1024) {
1087 length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
1088 buf = (wchar_t*)(malloc(length*sizeof(wchar_t)));
1089 MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
1090 }
1091 ret = utf8fromwc(dst, dstlen, buf, length);
1092 if (buf != lbuf) free((void*)buf);
1093 return ret;
1094 #else
1095 wchar_t lbuf[1024];
1096 wchar_t* buf = lbuf;
1097 int length;
1098 unsigned ret;
1099 length = mbstowcs(buf, src, 1024);
1100 if (length >= 1024) {
1101 length = mbstowcs(0, src, 0)+1;
1102 buf = (wchar_t*)(malloc(length*sizeof(unsigned short)));
1103 mbstowcs(buf, src, length);
1104 }
1105 if (length >= 0) {
1106 ret = utf8fromwc(dst, dstlen, buf, length);
1107 if (buf != lbuf) free((void*)buf);
1108 return ret;
1109 }
1110 // errors in conversion return the UTF-8 unchanged
1111 #endif
1112 }
1113 // identity transform:
1114 if (srclen < dstlen) {
1115 memcpy(dst, src, srclen);
1116 dst[srclen] = 0;
1117 } else {
1118 memcpy(dst, src, dstlen-1);
1119 dst[dstlen-1] = 0;
1120 }
1121 return srclen;
1122 }
1123
1124 /*! Examines the first \a srclen bytes in \a src and return a verdict
1125 on whether it is UTF-8 or not.
    - Returns 0 if there are any illegal UTF-8 sequences, using the
1127 same rules as utf8decode(). Note that some UCS values considered
1128 illegal by RFC 3629, such as 0xffff, are considered legal by this.
1129 - Returns 1 if there are only single-byte characters (ie no bytes
1130 have the high bit set). This is legal UTF-8, but also indicates
1131 plain ASCII. It also returns 1 if \a srclen is zero.
1132 - Returns 2 if there are only characters less than 0x800.
1133 - Returns 3 if there are only characters less than 0x10000.
1134 - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.
1135
1136 Because there are many illegal sequences in UTF-8, it is almost
1137 impossible for a string in another encoding to be confused with
1138 UTF-8. This is very useful for transitioning Unix to UTF-8
1139 filenames, you can simply test each filename with this to decide
1140 if it is UTF-8 or in the locale encoding. My hope is that if
1141 this is done we will be able to cleanly transition to a locale-less
1142 encoding.
1143 */
utf8test(const char * src,unsigned srclen)1144 int utf8test(const char* src, unsigned srclen) {
1145 int ret = 1;
1146 const char* p = src;
1147 const char* e = src+srclen;
1148 while (p < e) {
1149 if (*p & 0x80) {
1150 int len; utf8decode(p,e,&len);
1151 if (len < 2) return 0;
1152 if (len > ret) ret = len;
1153 p += len;
1154 } else {
1155 p++;
1156 }
1157 }
1158 return ret;
1159 }
1160
1161 #endif /* def notdef - disabled locale specific stuff */
1162
1163 #endif /* defined(CPL_RECODE_STUB) */
1164