1 /**********************************************************************
2  * $Id: cpl_recode_stub.cpp 14368 2008-04-30 02:22:31Z warmerdam $
3  *
4  * Name:     cpl_recode.cpp
5  * Project:  CPL - Common Portability Library
6  * Purpose:  Character set recoding and char/wchar_t conversions.
7  * Author:   Frank Warmerdam, warmerdam@pobox.com
8  *
9  * The bulk of this code is derived from the utf.c module from FLTK. It
10  * was originally downloaded from:
11  *    http://svn.easysw.com/public/fltk/fltk/trunk/src/utf.c
12  *
13  **********************************************************************
14  * Copyright (c) 2008, Frank Warmerdam
15  * Copyright 2006 by Bill Spitzak and others.
16  *
17  * Permission to use, copy, modify, and distribute this software for any
18  * purpose with or without fee is hereby granted, provided that the above
19  * copyright notice and this permission notice appear in all copies.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
22  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
23  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
24  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
25  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
26  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
27  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
28  **********************************************************************/
29 
30 #include "cpl_string.h"
31 
32 CPL_CVSID("$Id: cpl_recode_stub.cpp 14368 2008-04-30 02:22:31Z warmerdam $");
33 
34 #define CPL_RECODE_STUB
35 
36 #ifdef CPL_RECODE_STUB
37 
/* Forward declarations of the UTF-8 helpers adapted from FLTK's utf.c.   */
/* They are declared here because the CPLRecode*() entry points below     */
/* call them before their definitions appear later in the file.           */

/* Decode one UTF-8 sequence starting at p; *len receives its byte count. */
static unsigned utf8decode(const char* p, const char* end, int* len);
/* UTF-8 -> wchar_t array; used by CPLRecodeToWChar().                    */
static unsigned utf8towc(const char* src, unsigned srclen,
                         wchar_t* dst, unsigned dstlen);
/* UTF-8 -> one byte per character; used by CPLRecode() for UTF-8 input.  */
static unsigned utf8toa(const char* src, unsigned srclen,
                        char* dst, unsigned dstlen);
/* wchar_t array -> UTF-8; used by CPLRecodeFromWChar().                  */
static unsigned utf8fromwc(char* dst, unsigned dstlen,
                           const wchar_t* src, unsigned srclen);
/* One byte per character -> UTF-8; used by CPLRecode() for UTF-8 output. */
static unsigned utf8froma(char* dst, unsigned dstlen,
                          const char* src, unsigned srclen);

/* Helpers that are only compiled when FUTURE_NEEDS is defined.           */
#ifdef FUTURE_NEEDS
static const char* utf8fwd(const char* p, const char* start, const char* end);
static const char* utf8back(const char* p, const char* start, const char*end);
static int utf8encode(unsigned ucs, char* buf);
static int utf8bytes(unsigned ucs);
#endif /* def FUTURE_NEEDS */
54 
55 /************************************************************************/
56 /* ==================================================================== */
57 /*	Stub Implementation not depending on iconv() or WIN32 API.	*/
58 /* ==================================================================== */
59 /************************************************************************/
60 
61 /************************************************************************/
62 /*                             CPLRecode()                              */
63 /************************************************************************/
64 
CPLRecode(const char * pszSource,const char * pszSrcEncoding,const char * pszDstEncoding)65 char CPL_DLL *CPLRecode( const char *pszSource,
66                          const char *pszSrcEncoding,
67                          const char *pszDstEncoding )
68 
69 {
70 /* -------------------------------------------------------------------- */
71 /*      Handle a few common short cuts.                                 */
72 /* -------------------------------------------------------------------- */
73     if( strcmp(pszSrcEncoding,pszDstEncoding) == 0 )
74         return CPLStrdup(pszSource);
75 
76     if( strcmp(pszSrcEncoding,CPL_ENC_ASCII) == 0
77         && (strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0
78             || strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0) )
79         return CPLStrdup(pszSource);
80 
81 /* -------------------------------------------------------------------- */
82 /*      If the source or destination is current locale(), we change     */
83 /*      it to ISO8859-1 since our stub implementation does not          */
84 /*      attempt to address locales properly.                            */
85 /* -------------------------------------------------------------------- */
86 
87     if( pszSrcEncoding[0] == '\0' )
88         pszSrcEncoding = CPL_ENC_ISO8859_1;
89 
90     if( pszDstEncoding[0] == '\0' )
91         pszDstEncoding = CPL_ENC_ISO8859_1;
92 
93 /* -------------------------------------------------------------------- */
94 /*      ISO8859 to UTF8                                                 */
95 /* -------------------------------------------------------------------- */
96     if( strcmp(pszSrcEncoding,CPL_ENC_ISO8859_1) == 0
97         && strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
98     {
99         int nCharCount = strlen(pszSource);
100         char *pszResult = (char *) CPLCalloc(1,nCharCount*2+1);
101 
102         utf8froma( pszResult, nCharCount*2+1, pszSource, nCharCount );
103 
104         return pszResult;
105     }
106 
107 /* -------------------------------------------------------------------- */
108 /*      UTF8 to ISO8859                                                 */
109 /* -------------------------------------------------------------------- */
110     if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) == 0
111         && strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0 )
112     {
113         int nCharCount = strlen(pszSource);
114         char *pszResult = (char *) CPLCalloc(1,nCharCount+1);
115 
116         utf8toa( pszSource, nCharCount, pszResult, nCharCount+1 );
117 
118         return pszResult;
119     }
120 
121 /* -------------------------------------------------------------------- */
122 /*      Anything else to UTF-8 is treated as ISO8859-1 to UTF-8 with    */
123 /*      a one-time warning.                                             */
124 /* -------------------------------------------------------------------- */
125     if( strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
126     {
127         int nCharCount = strlen(pszSource);
128         char *pszResult = (char *) CPLCalloc(1,nCharCount*2+1);
129         static int bHaveWarned = FALSE;
130 
131         if( !bHaveWarned )
132         {
133             bHaveWarned = 1;
134             CPLError( CE_Warning, CPLE_AppDefined,
135                       "Recode from %s to UTF-8 not supported, treated as ISO8859-1 to UTF-8.",
136                       pszSrcEncoding );
137         }
138 
139         utf8froma( pszResult, nCharCount*2+1, pszSource, nCharCount );
140 
141         return pszResult;
142     }
143 
144 /* -------------------------------------------------------------------- */
145 /*      UTF-8 to anything else is treated as UTF-8 to ISO-8859-1        */
146 /*      with a warning.                                                 */
147 /* -------------------------------------------------------------------- */
148     if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) == 0
149         && strcmp(pszDstEncoding,CPL_ENC_ISO8859_1) == 0 )
150     {
151         int nCharCount = strlen(pszSource);
152         char *pszResult = (char *) CPLCalloc(1,nCharCount+1);
153         static int bHaveWarned = FALSE;
154 
155         if( !bHaveWarned )
156         {
157             bHaveWarned = 1;
158             CPLError( CE_Warning, CPLE_AppDefined,
159                       "Recode from UTF-8 to %s not supported, treated as UTF-8 to ISO8859-1.",
160                       pszDstEncoding );
161         }
162 
163         utf8toa( pszSource, nCharCount, pszResult, nCharCount+1 );
164 
165         return pszResult;
166     }
167 
168 /* -------------------------------------------------------------------- */
169 /*      Everything else is treated as a no-op with a warning.           */
170 /* -------------------------------------------------------------------- */
171     {
172         static int bHaveWarned = FALSE;
173 
174         if( !bHaveWarned )
175         {
176             bHaveWarned = 1;
177             CPLError( CE_Warning, CPLE_AppDefined,
178                       "Recode from %s to %s not supported, no change applied.",
179                       pszSrcEncoding, pszDstEncoding );
180         }
181 
182         return CPLStrdup(pszSource);
183     }
184 }
185 
186 /************************************************************************/
187 /*                         CPLRecodeFromWChar()                         */
188 /************************************************************************/
189 
190 /**
191  * Convert wchar_t string to UTF-8.
192  *
193  * Convert a wchar_t string into a multibyte utf-8 string.  The only
194  * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
195  * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
196  * and CPL_ENC_ISO8859_1.  In some cases (ie. using iconv()) other encodings
197  * may also be supported.
198  *
199  * Note that the wchar_t type varies in size on different systems. On
200  * win32 it is normally 2 bytes, and on unix 4 bytes.
201  *
202  * If an error occurs an error may, or may not be posted with CPLError().
203  *
204  * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
205  * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
206  * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
207  *
208  * @return a zero terminated multi-byte string which should be freed with
209  * CPLFree(), or NULL if an error occurs.
210  */
211 
CPLRecodeFromWChar(const wchar_t * pwszSource,const char * pszSrcEncoding,const char * pszDstEncoding)212 char CPL_DLL *CPLRecodeFromWChar( const wchar_t *pwszSource,
213                                   const char *pszSrcEncoding,
214                                   const char *pszDstEncoding )
215 
216 {
217 /* -------------------------------------------------------------------- */
218 /*      We try to avoid changes of character set.  We are just          */
219 /*      providing for unicode to unicode.                               */
220 /* -------------------------------------------------------------------- */
221     if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) != 0
222         && strcmp(pszSrcEncoding,CPL_ENC_UTF16) != 0
223         && strcmp(pszSrcEncoding,CPL_ENC_UCS2) != 0
224         && strcmp(pszSrcEncoding,CPL_ENC_UCS4) != 0 )
225     {
226         CPLError( CE_Failure, CPLE_AppDefined,
227                   "Stub recoding implementation does not support\n"
228                   "CPLRecodeFromWChar(...,%s,%s)",
229                   pszSrcEncoding, pszDstEncoding );
230         return NULL;
231     }
232 
233 /* -------------------------------------------------------------------- */
234 /*      What is the source length.                                      */
235 /* -------------------------------------------------------------------- */
236     int nSrcLen = 0;
237 
238     while( pwszSource[nSrcLen] != 0 )
239         nSrcLen++;
240 
241 /* -------------------------------------------------------------------- */
242 /*      Allocate destination buffer plenty big.                         */
243 /* -------------------------------------------------------------------- */
244     char *pszResult;
245     int nDstBufSize, nDstLen;
246 
247     nDstBufSize = nSrcLen * 4 + 1;
248     pszResult = (char *) CPLMalloc(nDstBufSize); // nearly worst case.
249 
250 /* -------------------------------------------------------------------- */
251 /*      Convert, and confirm we had enough space.                       */
252 /* -------------------------------------------------------------------- */
253     nDstLen = utf8fromwc( pszResult, nDstBufSize, pwszSource, nSrcLen );
254     if( nDstLen >= nDstBufSize - 1 )
255     {
256         CPLAssert( FALSE ); // too small!
257         return NULL;
258     }
259 
260 /* -------------------------------------------------------------------- */
261 /*      If something other than UTF-8 was requested, recode now.        */
262 /* -------------------------------------------------------------------- */
263     if( strcmp(pszDstEncoding,CPL_ENC_UTF8) == 0 )
264         return pszResult;
265 
266     char *pszFinalResult =
267         CPLRecode( pszResult, CPL_ENC_UTF8, pszDstEncoding );
268 
269     CPLFree( pszResult );
270 
271     return pszFinalResult;
272 }
273 
274 /************************************************************************/
275 /*                          CPLRecodeToWChar()                          */
276 /************************************************************************/
277 
278 /**
279  * Convert UTF-8 string to a wchar_t string.
280  *
281  * Convert a 8bit, multi-byte per character input string into a wide
282  * character (wchar_t) string.  The only guaranteed supported source encodings
 * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 (LATIN1).  The only
284  * guaranteed supported destination encoding is CPL_ENC_UCS2.  Other source
285  * and destination encodings may be supported depending on the underlying
286  * implementation.
287  *
288  * Note that the wchar_t type varies in size on different systems. On
289  * win32 it is normally 2 bytes, and on unix 4 bytes.
290  *
291  * If an error occurs an error may, or may not be posted with CPLError().
292  *
293  * @param pszSource input multi-byte character string.
294  * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
295  * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
296  *
297  * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
298  * NULL on error.
299  */
300 
CPLRecodeToWChar(const char * pszSource,const char * pszSrcEncoding,const char * pszDstEncoding)301 wchar_t CPL_DLL *CPLRecodeToWChar( const char *pszSource,
302                                    const char *pszSrcEncoding,
303                                    const char *pszDstEncoding )
304 
305 {
306     char *pszUTF8Source = (char *) pszSource;
307 
308     if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) != 0
309         && strcmp(pszSrcEncoding,CPL_ENC_ASCII) != 0 )
310     {
311         pszUTF8Source = CPLRecode( pszSource, pszSrcEncoding, CPL_ENC_UTF8 );
312         if( pszUTF8Source == NULL )
313             return NULL;
314     }
315 
316 /* -------------------------------------------------------------------- */
317 /*      We try to avoid changes of character set.  We are just          */
318 /*      providing for unicode to unicode.                               */
319 /* -------------------------------------------------------------------- */
320     if( strcmp(pszDstEncoding,CPL_ENC_UCS2) != 0
321         && strcmp(pszDstEncoding,CPL_ENC_UCS4) != 0
322         && strcmp(pszDstEncoding,CPL_ENC_UTF16) != 0 )
323     {
324         CPLError( CE_Failure, CPLE_AppDefined,
325                   "Stub recoding implementation does not support\n"
326                   "CPLRecodeToWChar(...,%s,%s)",
327                   pszSrcEncoding, pszDstEncoding );
328         return NULL;
329     }
330 
331 /* -------------------------------------------------------------------- */
332 /*      Do the UTF-8 to UCS-2 recoding.                                 */
333 /* -------------------------------------------------------------------- */
334     int nSrcLen = strlen(pszUTF8Source);
335     wchar_t *pwszResult = (wchar_t *) CPLCalloc(sizeof(wchar_t),nSrcLen+1);
336 
337     utf8towc( pszUTF8Source, nSrcLen, pwszResult, nSrcLen+1 );
338 
339     if( pszUTF8Source != pszSource )
340         CPLFree( pszUTF8Source );
341 
342     return pwszResult;
343 }
344 
345 /************************************************************************/
346 /* ==================================================================== */
347 /*	UTF.C code from FLTK with some modifications.                   */
348 /* ==================================================================== */
349 /************************************************************************/
350 
/* Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
   they are instead turned into the Unicode REPLACEMENT CHARACTER, of
   value 0xfffd.
   If this is on, utf8decode will correctly map most (perhaps all)
   human-readable text that is in ISO-8859-1. This may allow you
   to completely ignore character sets in your code because virtually
   everything is either ISO-8859-1 or UTF-8.
*/
#define ERRORS_TO_ISO8859_1 1

/* Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
   Unicode index for Microsoft's CP1252 character set. You should
   also set ERRORS_TO_ISO8859_1. With this a huge amount of more
   available text (such as all web pages) is correctly converted
   to Unicode.
*/
#define ERRORS_TO_CP1252 1

/* A number of Unicode code points are in fact illegal and should not
   be produced by a UTF-8 converter. Turning this on will replace the
   bytes in those encodings with errors. If you do this then converting
   arbitrary 16-bit data to UTF-8 and then back is not an identity,
   which will probably break a lot of software.
*/
#define STRICT_RFC3629 0

#if ERRORS_TO_CP1252
// Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
// to Unicode. Indexed by (byte - 0x80). The table is only ever read,
// so it is declared const.
static const unsigned short cp1252[32] = {
  0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
  0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
  0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
  0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
};
#endif
387 
388 /************************************************************************/
389 /*                             utf8decode()                             */
390 /************************************************************************/
391 
392 /*
393     Decode a single UTF-8 encoded character starting at \e p. The
394     resulting Unicode value (in the range 0-0x10ffff) is returned,
    and \e len is set to the number of bytes in the UTF-8 encoding
396     (adding \e len to \e p will point at the next character).
397 
398     If \a p points at an illegal UTF-8 encoding, including one that
    would go past \e end, or where a code uses more bytes than
400     necessary, then *(unsigned char*)p is translated as though it is
401     in the Microsoft CP1252 character set and \e len is set to 1.
402     Treating errors this way allows this to decode almost any
403     ISO-8859-1 or CP1252 text that has been mistakenly placed where
404     UTF-8 is expected, and has proven very useful.
405 
406     If you want errors to be converted to error characters (as the
407     standards recommend), adding a test to see if the length is
408     unexpectedly 1 will work:
409 
410 \code
411     if (*p & 0x80) { // what should be a multibyte encoding
412       code = utf8decode(p,end,&len);
413       if (len<2) code = 0xFFFD; // Turn errors into REPLACEMENT CHARACTER
414     } else { // handle the 1-byte utf8 encoding:
415       code = *p;
416       len = 1;
417     }
418 \endcode
419 
420     Direct testing for the 1-byte case (as shown above) will also
421     speed up the scanning of strings where the majority of characters
422     are ASCII.
423 */
static unsigned utf8decode(const char* p, const char* end, int* len)
{
  const unsigned char* s = (const unsigned char*)p;
  const unsigned char lead = s[0];
  int need = 0;  // total bytes announced by the lead byte, 0 = not a lead
  int ok;

  // Single byte cases: plain ASCII, and optionally the CP1252 block.
  if (lead < 0x80) {
    *len = 1;
    return lead;
  }
#if ERRORS_TO_CP1252
  if (lead < 0xa0) {
    *len = 1;
    return cp1252[lead - 0x80];
  }
#endif

  // Classify the lead byte. 0x80..0xc1 (stray continuation bytes and
  // overlong 2-byte leads) and 0xf5..0xff never start a sequence.
  if (lead >= 0xc2 && lead < 0xe0) need = 2;
  else if (lead >= 0xe0 && lead < 0xf0) need = 3;
  else if (lead >= 0xf0 && lead <= 0xf4) need = 4;

  // Every multi-byte form needs a continuation byte right after the lead.
  ok = need != 0 && p + 1 < end && (s[1] & 0xc0) == 0x80;

  // Some leads narrow the range allowed for the second byte.
  if (ok) {
    switch (lead) {
      case 0xe0: ok = s[1] >= 0xa0; break;  // below this is overlong
      case 0xf0: ok = s[1] >= 0x90; break;  // below this is overlong
      case 0xf4: ok = s[1] <= 0x8f; break;  // above this is after 0x10ffff
#if STRICT_RFC3629
      // RFC 3629 says surrogate chars are illegal.
      case 0xed: ok = s[1] < 0xa0; break;
      // 0xfffe and 0xffff are also illegal characters
      case 0xef: ok = !(s[1] == 0xbf && s[2] >= 0xbe); break;
#endif
      default: break;
    }
  }

  // Remaining continuation bytes must exist before end and be 10xxxxxx.
  if (ok && need >= 3) ok = p + 2 < end && (s[2] & 0xc0) == 0x80;
  if (ok && need == 4) ok = p + 3 < end && (s[3] & 0xc0) == 0x80;
#if STRICT_RFC3629
  // RFC 3629 says all codes ending in fffe or ffff are illegal:
  if (ok && need == 4 &&
      (s[1] & 0xf) == 0xf && s[2] == 0xbf && s[3] >= 0xbe) ok = 0;
#endif

  if (!ok) {
    // Any failure consumes exactly one byte.
    *len = 1;
#if ERRORS_TO_ISO8859_1
    return lead;
#else
    return 0xfffd; // Unicode REPLACEMENT CHARACTER
#endif
  }

  // Assemble the code point from the payload bits of each byte.
  *len = need;
  if (need == 2)
    return ((lead & 0x1f) << 6) + (s[1] & 0x3f);
  if (need == 3)
    return ((lead & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
  return ((lead & 0x07) << 18) + ((s[1] & 0x3f) << 12) +
         ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
}
497 
498 /************************************************************************/
499 /*                              utf8fwd()                               */
500 /************************************************************************/
501 
502 /*
503   Move \a p forward until it points to the start of a UTF-8
504   character. If it already points at the start of one then it
505   is returned unchanged. Any UTF-8 errors are treated as though each
506   byte of the error is an individual character.
507 
508   \e start is the start of the string and is used to limit the
509   backwards search for the start of a utf8 character.
510 
  \e end is the end of the string and is assumed to be a break
  between characters. It is assumed to be greater than p.
513 
514   This function is for moving a pointer that was jumped to the
515   middle of a string, such as when doing a binary search for
516   a position. You should use either this or utf8back() depending
  on which direction your algorithm can handle the pointer
518   moving. Do not use this to scan strings, use utf8decode()
519   instead.
520 */
521 
522 #ifdef FUTURE_NEEDS
utf8fwd(const char * p,const char * start,const char * end)523 static const char* utf8fwd(const char* p, const char* start, const char* end)
524 {
525   const char* a;
526   int len;
527   // if we are not pointing at a continuation character, we are done:
528   if ((*p&0xc0) != 0x80) return p;
529   // search backwards for a 0xc0 starting the character:
530   for (a = p-1; ; --a) {
531     if (a < start) return p;
532     if (!(a[0]&0x80)) return p;
533     if ((a[0]&0x40)) break;
534   }
535   utf8decode(a,end,&len);
536   a += len;
537   if (a > p) return a;
538   return p;
539 }
540 #endif /* def FUTURE_NEEDS */
541 
542 /************************************************************************/
543 /*                              utf8back()                              */
544 /************************************************************************/
545 
546 /*
547   Move \a p backward until it points to the start of a UTF-8
548   character. If it already points at the start of one then it
549   is returned unchanged. Any UTF-8 errors are treated as though each
550   byte of the error is an individual character.
551 
552   \e start is the start of the string and is used to limit the
553   backwards search for the start of a UTF-8 character.
554 
  \e end is the end of the string and is assumed to be a break
  between characters. It is assumed to be greater than p.
557 
558   If you wish to decrement a UTF-8 pointer, pass p-1 to this.
559 */
560 
561 #ifdef FUTURE_NEEDS
utf8back(const char * p,const char * start,const char * end)562 static const char* utf8back(const char* p, const char* start, const char* end)
563 {
564   const char* a;
565   int len;
566   // if we are not pointing at a continuation character, we are done:
567   if ((*p&0xc0) != 0x80) return p;
568   // search backwards for a 0xc0 starting the character:
569   for (a = p-1; ; --a) {
570     if (a < start) return p;
571     if (!(a[0]&0x80)) return p;
572     if ((a[0]&0x40)) break;
573   }
574   utf8decode(a,end,&len);
575   if (a+len > p) return a;
576   return p;
577 }
578 #endif /* def FUTURE_NEEDS */
579 
580 /************************************************************************/
581 /*                             utf8bytes()                              */
582 /************************************************************************/
583 
584 /* Returns number of bytes that utf8encode() will use to encode the
585   character \a ucs. */
586 #ifdef FUTURE_NEEDS
static int utf8bytes(unsigned ucs) {
  if (ucs < 0x000080U) {
    return 1;
  } else if (ucs < 0x000800U) {
    return 2;
  } else if (ucs < 0x010000U) {
    return 3;
  } else if (ucs <= 0x10ffffU) {
    // 0x10ffff is itself a valid code point, so the bound is inclusive.
    return 4;
  } else {
    return 3; // length of the illegal character encoding
  }
}

/************************************************************************/
/*                             utf8encode()                             */
/************************************************************************/

/* Write the UTF-8 encoding of \e ucs into \e buf and return the
    number of bytes written. Up to 4 bytes may be written. If you know
    that \a ucs is less than 0x10000 then at most 3 bytes will be written.
    If you wish to speed this up, remember that anything less than 0x80
    is written as a single byte.

    If ucs is greater than 0x10ffff this is an illegal character
    according to RFC 3629. These are converted as though they are
    0xFFFD (REPLACEMENT CHARACTER).

    RFC 3629 also says many other values for \a ucs are illegal (in
    the range 0xd800 to 0xdfff, or ending with 0xfffe or
    0xffff). However I encode these as though they are legal, so that
    utf8encode/utf8decode will be the identity for all codes between 0
    and 0x10ffff.

    The byte counts returned here match utf8bytes() for every \a ucs.
*/
static int utf8encode(unsigned ucs, char* buf) {
  if (ucs < 0x000080U) {
    buf[0] = (char)ucs;
    return 1;
  } else if (ucs < 0x000800U) {
    buf[0] = (char)(0xc0 | (ucs >> 6));
    buf[1] = (char)(0x80 | (ucs & 0x3F));
    return 2;
  } else if (ucs < 0x010000U) {
    buf[0] = (char)(0xe0 | (ucs >> 12));
    buf[1] = (char)(0x80 | ((ucs >> 6) & 0x3F));
    buf[2] = (char)(0x80 | (ucs & 0x3F));
    return 3;
  } else if (ucs <= 0x0010ffffU) {
    // Inclusive bound: 0x10ffff is the last legal code point and must
    // round-trip instead of collapsing to the replacement character.
    buf[0] = (char)(0xf0 | (ucs >> 18));
    buf[1] = (char)(0x80 | ((ucs >> 12) & 0x3F));
    buf[2] = (char)(0x80 | ((ucs >> 6) & 0x3F));
    buf[3] = (char)(0x80 | (ucs & 0x3F));
    return 4;
  } else {
    // encode 0xfffd:
    buf[0] = (char)0xefU;
    buf[1] = (char)0xbfU;
    buf[2] = (char)0xbdU;
    return 3;
  }
}
650 #endif /* def FUTURE_NEEDS */
651 
652 /************************************************************************/
653 /*                              utf8towc()                              */
654 /************************************************************************/
655 
656 /*  Convert a UTF-8 sequence into an array of wchar_t. These
657     are used by some system calls, especially on Windows.
658 
659     \a src points at the UTF-8, and \a srclen is the number of bytes to
660     convert.
661 
662     \a dst points at an array to write, and \a dstlen is the number of
663     locations in this array. At most \a dstlen-1 words will be
664     written there, plus a 0 terminating word. Thus this function
665     will never overwrite the buffer and will always return a
666     zero-terminated string. If \a dstlen is zero then \a dst can be
667     null and no data is written, but the length is returned.
668 
669     The return value is the number of words that \e would be written
670     to \a dst if it were long enough, not counting the terminating
671     zero. If the return value is greater or equal to \a dstlen it
672     indicates truncation, you can then allocate a new array of size
673     return+1 and call this again.
674 
675     Errors in the UTF-8 are converted as though each byte in the
676     erroneous string is in the Microsoft CP1252 encoding. This allows
677     ISO-8859-1 text mistakenly identified as UTF-8 to be printed
678     correctly.
679 
680     Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
681     and most other systems. Where wchar_t is 16 bits, Unicode
682     characters in the range 0x10000 to 0x10ffff are converted to
683     "surrogate pairs" which take two words each (this is called UTF-16
684     encoding). If wchar_t is 32 bits this rather nasty problem is
685     avoided.
686 */
utf8towc(const char * src,unsigned srclen,wchar_t * dst,unsigned dstlen)687 static unsigned utf8towc(const char* src, unsigned srclen,
688                          wchar_t* dst, unsigned dstlen)
689 {
690   const char* p = src;
691   const char* e = src+srclen;
692   unsigned count = 0;
693   if (dstlen) for (;;) {
694     if (p >= e) {dst[count] = 0; return count;}
695     if (!(*p & 0x80)) { // ascii
696       dst[count] = *p++;
697     } else {
698       int len; unsigned ucs = utf8decode(p,e,&len);
699       p += len;
700 #ifdef _WIN32
701       if (ucs < 0x10000) {
702 	dst[count] = ucs;
703       } else {
704 	// make a surrogate pair:
705 	if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;}
706 	dst[count] = (((ucs-0x10000u)>>10)&0x3ff) | 0xd800;
707 	dst[++count] = (ucs&0x3ff) | 0xdc00;
708       }
709 #else
710       dst[count] = (wchar_t)ucs;
711 #endif
712     }
713     if (++count == dstlen) {dst[count-1] = 0; break;}
714   }
715   // we filled dst, measure the rest:
716   while (p < e) {
717     if (!(*p & 0x80)) p++;
718     else {
719 #ifdef _WIN32
720       int len; unsigned ucs = utf8decode(p,e,&len);
721       p += len;
722       if (ucs >= 0x10000) ++count;
723 #else
724       int len; utf8decode(p,e,&len);
725       p += len;
726 #endif
727     }
728     ++count;
729   }
730   return count;
731 }
732 
733 /************************************************************************/
734 /*                              utf8toa()                               */
735 /************************************************************************/
736 /* Convert a UTF-8 sequence into an array of 1-byte characters.
737 
738     If the UTF-8 decodes to a character greater than 0xff then it is
739     replaced with '?'.
740 
741     Errors in the UTF-8 are converted as individual bytes, same as
742     utf8decode() does. This allows ISO-8859-1 text mistakenly identified
    as UTF-8 to be printed correctly (and possibly CP1252 on Windows).
744 
745     \a src points at the UTF-8, and \a srclen is the number of bytes to
746     convert.
747 
748     Up to \a dstlen bytes are written to \a dst, including a null
749     terminator. The return value is the number of bytes that would be
750     written, not counting the null terminator. If greater or equal to
751     \a dstlen then if you malloc a new array of size n+1 you will have
752     the space needed for the entire string. If \a dstlen is zero then
753     nothing is written and this call just measures the storage space
754     needed.
755 */
utf8toa(const char * src,unsigned srclen,char * dst,unsigned dstlen)756 static unsigned utf8toa(const char* src, unsigned srclen,
757                         char* dst, unsigned dstlen)
758 {
759   const char* p = src;
760   const char* e = src+srclen;
761   unsigned count = 0;
762   if (dstlen) for (;;) {
763     unsigned char c;
764     if (p >= e) {dst[count] = 0; return count;}
765     c = *(unsigned char*)p;
766     if (c < 0xC2) { // ascii or bad code
767       dst[count] = c;
768       p++;
769     } else {
770       int len; unsigned ucs = utf8decode(p,e,&len);
771       p += len;
772       if (ucs < 0x100) dst[count] = ucs;
773       else dst[count] = '?';
774     }
775     if (++count >= dstlen) {dst[count-1] = 0; break;}
776   }
777   // we filled dst, measure the rest:
778   while (p < e) {
779     if (!(*p & 0x80)) p++;
780     else {
781       int len;
782       utf8decode(p,e,&len);
783       p += len;
784     }
785     ++count;
786   }
787   return count;
788 }
789 
/************************************************************************/
/*                             utf8fromwc()                             */
/************************************************************************/
/* Turn "wide characters" as returned by some system calls
    (especially on Windows) into UTF-8.

    Up to \a dstlen bytes are written to \a dst, including a null
    terminator. The return value is the number of bytes that would be
    written, not counting the null terminator. If it is greater than or
    equal to \a dstlen the output was truncated, and a buffer of
    (return value)+1 bytes is enough for the whole string. If \a dstlen
    is zero nothing is written and the call only measures the storage
    needed. A character is never split: if it does not fit together
    with the terminator, the output ends before it.

    \a srclen is the number of words in \a src to convert. On Windows
    this is not necessarily the number of characters, because the
    UTF-16 encoding used there may contain "surrogate pairs".
    On Unix wchar_t is 32 bits and each location is a character.

    On Unix a src word greater than 0x10ffff is an illegal character
    according to RFC 3629 and is converted as though it were 0xFFFD
    (REPLACEMENT CHARACTER). Characters in the range 0xd800 to 0xdfff,
    or ending with 0xfffe or 0xffff, are also illegal according to
    RFC 3629, but they are encoded as though they were legal so that
    utf8towc can return the original data.

    On Windows a surrogate pair is converted to a single character and
    UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate pairs
    are converted as though they were individual characters.

    Writing and measuring share one decoding path, so the length
    reported for a truncated or measure-only call always matches what
    a large enough buffer would receive.
*/
static unsigned utf8fromwc(char* dst, unsigned dstlen,
                           const wchar_t* src, unsigned srclen) {
  unsigned i = 0;
  unsigned count = 0;
  // Once set, nothing more is written to dst and the remaining input
  // is only measured. A zero dstlen means measuring from the start.
  int full = (dstlen == 0);

  while (i < srclen) {
    unsigned ucs = src[i++];
    unsigned len;
#ifdef _WIN32
    if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
        src[i] >= 0xdc00 && src[i] <= 0xdfff) {
      // surrogate pair: i already points at the low half
      unsigned ucs2 = src[i++];
      ucs = 0x10000U + ((ucs&0x3ff)<<10) + (ucs2&0x3ff);
    }
#else
    if (ucs > 0x10ffff) ucs = 0xfffd; // out of range: REPLACEMENT CHARACTER
#endif

    // number of UTF-8 bytes this character needs
    if (ucs < 0x80U) len = 1;
    else if (ucs < 0x800U) len = 2;
    else if (ucs < 0x10000U) len = 3;
    else len = 4;

    if (!full) {
      if (count+len >= dstlen) {
        // no room for this character plus the terminator: stop writing
        dst[count] = 0;
        full = 1;
      } else {
        char* out = dst+count;
        switch (len) {
        case 1:
          out[0] = (char)ucs;
          break;
        case 2:
          out[0] = (char)(0xc0 | (ucs >> 6));
          out[1] = (char)(0x80 | (ucs & 0x3F));
          break;
        case 3:
          out[0] = (char)(0xe0 | (ucs >> 12));
          out[1] = (char)(0x80 | ((ucs >> 6) & 0x3F));
          out[2] = (char)(0x80 | (ucs & 0x3F));
          break;
        default:
          out[0] = (char)(0xf0 | (ucs >> 18));
          out[1] = (char)(0x80 | ((ucs >> 12) & 0x3F));
          out[2] = (char)(0x80 | ((ucs >> 6) & 0x3F));
          out[3] = (char)(0x80 | (ucs & 0x3F));
          break;
        }
      }
    }
    count += len;
  }

  // everything fit: terminate the output
  if (!full) dst[count] = 0;
  return count;
}
887 
888 
/************************************************************************/
/*                             utf8froma()                              */
/************************************************************************/

/* Convert an ISO-8859-1 (ie normal c-string) byte stream to UTF-8.

    Bytes below 0x80 are copied as they are; every other byte becomes a
    two-byte UTF-8 sequence. Microsoft's CP1252 is NOT handled: the
    codes 0x80-0x9f are treated as the ISO-8859-1 values, not as the
    different characters CP1252 assigns to them.

    \a srclen is the number of bytes in \a src to convert.

    Up to \a dstlen bytes are written to \a dst, including a null
    terminator. The return value is the number of bytes that would be
    written, not counting the null terminator. If it is greater than or
    equal to \a dstlen the output was truncated, and a buffer of
    (return value)+1 bytes is enough for the whole string. If \a dstlen
    is zero nothing is written and the call only measures the storage
    needed.

    A return value equal to \a srclen means the input was pure ASCII
    and needed no conversion.
*/
static unsigned utf8froma(char* dst, unsigned dstlen,
                          const char* src, unsigned srclen) {
  const unsigned char* in = (const unsigned char*)src;
  const unsigned char* const stop = in + srclen;
  unsigned total = 0;
  // cleared as soon as dst cannot take the next character
  int writing = (dstlen != 0);

  for (; in < stop; ++in) {
    const unsigned char byte = *in;
    const unsigned need = (byte < 0x80U) ? 1 : 2;
    if (writing) {
      if (total + need >= dstlen) {
        // character plus terminator does not fit: end the output here
        dst[total] = 0;
        writing = 0;
      } else if (need == 1) {
        dst[total] = (char)byte;
      } else {
        // 2 bytes (note that a CP1252 translation could need 3 bytes!)
        dst[total] = (char)(0xc0 | (byte >> 6));
        dst[total+1] = (char)(0x80 | (byte & 0x3F));
      }
    }
    total += need;
  }

  if (writing) dst[total] = 0;
  return total;
}
942 
943 /*
944 ** For now we disable the rest which is locale() related.  We may need
945 ** parts of it later.
946 */
947 
948 #ifdef notdef
949 
950 #ifdef _WIN32
951 # include <windows.h>
952 #endif
953 
/*! Return true if the "locale" seems to indicate that UTF-8 encoding
    is used. If true the utf8tomb and utf8frommb don't do anything
    useful.

    <i>It is highly recommended that you change your system so this
    does return true.</i> On Windows the test is whether the active
    codepage is CP_UTF8. On Unix the first non-empty variable among
    $LC_ALL, $LC_CTYPE and $LANG (the POSIX precedence order) is
    examined, and the result is true if it contains the letters "utf"
    or "UTF". If none of them is set to a non-empty value, UTF-8 is
    assumed, because of the compatibility of UTF-8 with plain ASCII.

    The answer is computed on the first call and cached in a
    function-local static, so later changes to the environment are not
    noticed.
*/
int utf8locale(void) {
  static int ret = 2; // 2 = not determined yet
  if (ret == 2) {
#ifdef _WIN32
    ret = GetACP() == CP_UTF8;
#else
    // LC_ALL overrides LC_CTYPE, which overrides LANG
    static const char* const vars[] = {"LC_ALL", "LC_CTYPE", "LANG"};
    unsigned i;
    ret = 1; // assume UTF-8 if no locale
    for (i = 0; i < sizeof(vars)/sizeof(vars[0]); i++) {
      const char* s = getenv(vars[i]);
      if (s && *s) {
        ret = (strstr(s,"utf") || strstr(s,"UTF")) ? 1 : 0;
        break;
      }
    }
#endif
  }
  return ret;
}
983 
/*! Convert the UTF-8 used by FLTK to the locale-specific encoding
    used for filenames (and sometimes used for data in files).
    Unfortunately this has to be done by the caller wherever filenames
    are needed, on both Unix and Windows.

    Up to \a dstlen bytes are written to \a dst, including a null
    terminator. The return value is the number of bytes that would be
    written, not counting the null terminator. If it is greater than or
    equal to \a dstlen the output was truncated, and a buffer of
    (return value)+1 bytes is enough for the whole string. If \a dstlen
    is zero nothing is written and the call only measures the storage
    needed.

    If utf8locale() returns true, or if the conversion fails (including
    running out of memory), the data is not changed: it is copied and
    truncated as necessary to the destination buffer and \a srclen is
    returned.

    NOTE(review): on Windows a too-small \a dst yields an empty string
    rather than a truncated one, because WideCharToMultiByte reports an
    insufficient buffer as failure; the full length is still returned.
*/
unsigned utf8tomb(const char* src, unsigned srclen,
                  char* dst, unsigned dstlen)
{
  if (!utf8locale()) {
    wchar_t lbuf[1024];
    wchar_t* buf = lbuf;
    unsigned length = utf8towc(src, srclen, buf, 1024);
    if (length >= 1024) {
      // did not fit in the stack buffer: redo the conversion on the heap
      buf = (wchar_t*)(malloc((length+1)*sizeof(wchar_t)));
      if (buf) utf8towc(src, srclen, buf, length+1);
    }
    if (buf) {
#ifdef _WIN32
      unsigned ret = 0;
      if (dstlen) {
        // Convert into dstlen-1 bytes so the terminator always fits; the
        // call does not terminate the output when given an explicit length.
        if (dstlen > 1)
          ret = WideCharToMultiByte(GetACP(), 0, buf, length,
                                    dst, dstlen-1, 0, 0);
        dst[ret] = 0;
      }
      // A result of 0 means nothing was converted (buffer too small,
      // empty input or error); ask for the real length in that case and
      // when only measuring.
      if (dstlen <= 1 || ret == 0)
        ret = WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
      if (buf != lbuf) free((void*)buf);
      return ret;
#else
      // Measure first so the full length is reported even when dst
      // ends up truncated.
      size_t ret = wcstombs(0, buf, 0);
      if (ret != (size_t)-1 && dstlen) {
        // leave one byte free so the result can always be terminated
        size_t done = wcstombs(dst, buf, dstlen-1);
        dst[done == (size_t)-1 ? 0 : done] = 0;
      }
      if (buf != lbuf) free((void*)buf);
      if (ret != (size_t)-1) return (unsigned)ret;
      // on any errors we return the UTF-8 as raw text...
#endif
    }
  }
  // identity transform (nothing is written when dstlen is zero):
  if (dstlen) {
    unsigned n = (srclen < dstlen) ? srclen : dstlen-1;
    memcpy(dst, src, n);
    dst[n] = 0;
  }
  return srclen;
}
1056 
/*! Convert a filename from the locale-specific multibyte encoding
    to UTF-8 as used by FLTK.

    Up to \a dstlen bytes are written to \a dst, including a null
    terminator. The return value is the number of bytes that would be
    written, not counting the null terminator. If it is greater than or
    equal to \a dstlen the output was truncated, and a buffer of
    (return value)+1 bytes is enough for the whole string. If \a dstlen
    is zero nothing is written and the call only measures the storage
    needed.

    When utf8locale() returns true, or when the conversion fails
    (including running out of memory), the data is not changed: it is
    copied and truncated as necessary to the destination buffer and
    \a srclen is returned.
    You may also want to check if utf8test() returns non-zero, so that
    the filesystem can store filenames in UTF-8 encoding regardless of
    the locale.

    NOTE(review): the non-Windows path converts with mbstowcs(), which
    takes no source length, so \a src must be null-terminated there and
    \a srclen is only used by the identity fallback.
*/
unsigned utf8frommb(char* dst, unsigned dstlen,
                    const char* src, unsigned srclen)
{
  if (!utf8locale()) {
    wchar_t lbuf[1024];
    wchar_t* buf = lbuf;
    unsigned length = 0;
    int ok = 1;
#ifdef _WIN32
    if (srclen) {
      length = MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
      if (length == 0 || length >= 1024) {
        // 0 is returned both for errors and for a too-small buffer, so
        // ask for the required size and retry on the heap.
        length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
        if (length == 0) {
          ok = 0;
        } else {
          buf = (wchar_t*)(malloc(length*sizeof(wchar_t)));
          if (buf) MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
          else ok = 0;
        }
      }
    }
#else
    size_t n = mbstowcs(buf, src, 1024);
    if (n == (size_t)-1) {
      ok = 0;
    } else if (n >= 1024) {
      // stack buffer was filled: measure and convert on the heap
      n = mbstowcs(0, src, 0);
      if (n == (size_t)-1) {
        ok = 0;
      } else {
        // n wide characters plus the terminator mbstowcs appends
        buf = (wchar_t*)(malloc((n+1)*sizeof(wchar_t)));
        if (buf) mbstowcs(buf, src, n+1);
        else ok = 0;
      }
    }
    // the terminator is not part of the text to convert
    if (ok) length = (unsigned)n;
#endif
    if (ok) {
      unsigned ret = utf8fromwc(dst, dstlen, buf, length);
      if (buf != lbuf) free((void*)buf);
      return ret;
    }
    // errors in conversion return the input unchanged
  }
  // identity transform (nothing is written when dstlen is zero):
  if (dstlen) {
    unsigned n = (srclen < dstlen) ? srclen : dstlen-1;
    memcpy(dst, src, n);
    dst[n] = 0;
  }
  return srclen;
}
1123 
/*! Examine the first \a srclen bytes of \a src and classify them.

    Bytes without the high bit set are skipped one at a time; every
    other position is handed to utf8decode() and the sequence length it
    reports decides the result:
    - 0 as soon as a sequence is reported with a length below 2, i.e.
      utf8decode() did not accept it as a multi-byte sequence. Its rules
      apply; upstream notes say some values RFC 3629 considers illegal,
      such as 0xffff, are accepted.
    - 1 if no byte has the high bit set (plain ASCII), which includes
      the case where \a srclen is zero.
    - otherwise the largest sequence length seen: 2 for characters
      below 0x800, 3 for characters below 0x10000, 4 for characters in
      the 0x10000 to 0x10ffff range.

    Because there are many illegal sequences in UTF-8, text in another
    encoding is very unlikely to pass as UTF-8. This makes the function
    useful for deciding per filename whether it is UTF-8 or in the
    locale encoding.
*/
int utf8test(const char* src, unsigned srclen) {
  const char* const stop = src + srclen;
  const char* cur = src;
  int widest = 1;

  while (cur < stop) {
    int seqlen;
    if (!(*cur & 0x80)) {
      // single-byte character
      cur++;
      continue;
    }
    utf8decode(cur, stop, &seqlen);
    if (seqlen < 2) return 0;
    if (widest < seqlen) widest = seqlen;
    cur += seqlen;
  }
  return widest;
}
1160 
1161 #endif /* def notdef - disabled locale specific stuff */
1162 
1163 #endif /* defined(CPL_RECODE_STUB) */
1164