1 /////////////////////////////////////////////////////////////////////////////
2 // Name:        src/common/strconv.cpp
3 // Purpose:     Unicode conversion classes
4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 //              Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created:     29/01/98
8 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
9 //              (c) 2000-2003 Vadim Zeitlin
10 //              (c) 2004 Ryan Norton, Fredrik Roubert
11 // Licence:     wxWindows licence
12 /////////////////////////////////////////////////////////////////////////////
13 
14 // For compilers that support precompilation, includes "wx.h".
15 #include "wx/wxprec.h"
16 
17 #ifdef __BORLANDC__
18     #pragma hdrstop
19 #endif  //__BORLANDC__
20 
21 #ifndef WX_PRECOMP
22     #include "wx/intl.h"
23     #include "wx/log.h"
24     #include "wx/utils.h"
25     #include "wx/hashmap.h"
26 #endif
27 
28 #include "wx/strconv.h"
29 
30 #ifndef __WXWINCE__
31 #include <errno.h>
32 #endif
33 
34 #include <ctype.h>
35 #include <string.h>
36 #include <stdlib.h>
37 
38 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
39     #include "wx/msw/private.h"
40     #include "wx/msw/missing.h"
41     #define wxHAVE_WIN32_MB2WC
42 #endif
43 
44 #ifdef HAVE_ICONV
45     #include <iconv.h>
46     #include "wx/thread.h"
47 #endif
48 
49 #include "wx/encconv.h"
50 #include "wx/fontmap.h"
51 
52 #ifdef __DARWIN__
53 #include "wx/osx/core/private/strconv_cf.h"
54 #endif //def __DARWIN__
55 
56 
57 #define TRACE_STRCONV wxT("strconv")
58 
59 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
60 // be 4 bytes
61 #if SIZEOF_WCHAR_T == 2
62     #define WC_UTF16
63 #endif
64 
65 
66 // ============================================================================
67 // implementation
68 // ============================================================================
69 
// Helper of wxMBConv::ToWChar(): returns true if at least one of the n bytes
// starting at p is not NUL and false if all of them are NUL (which is
// trivially the case when n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( size_t i = 0; i < n; i++ )
    {
        if ( p[i] != '\0' )
            return true;
    }

    return false;
}
78 
79 // ----------------------------------------------------------------------------
80 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
81 // ----------------------------------------------------------------------------
82 
encode_utf16(wxUint32 input,wxUint16 * output)83 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
84 {
85     if (input <= 0xffff)
86     {
87         if (output)
88             *output = (wxUint16) input;
89 
90         return 1;
91     }
92     else if (input >= 0x110000)
93     {
94         return wxCONV_FAILED;
95     }
96     else
97     {
98         if (output)
99         {
100             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
101             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
102         }
103 
104         return 2;
105     }
106 }
107 
108 // Returns the next UTF-32 character from the wchar_t buffer terminated by the
109 // "end" pointer (the caller must ensure that on input "*pSrc < end") and
110 // advances the pointer to the character after this one.
111 //
112 // If an invalid or incomplete character is found, *pSrc is set to NULL, the
113 // caller must check for this.
wxDecodeSurrogate(const wxChar16 ** pSrc,const wxChar16 * end)114 static wxUint32 wxDecodeSurrogate(const wxChar16 **pSrc, const wxChar16* end)
115 {
116     const wxChar16*& src = *pSrc;
117 
118     // Is this a BMP character?
119     const wxUint16 u = *src++;
120     if ((u < 0xd800) || (u > 0xdfff))
121     {
122         // Yes, just return it.
123         return u;
124     }
125 
126     // No, we have the first half of a surrogate, check if we also have the
127     // second half (notice that this check does nothing if end == NULL, as it
128     // is allowed to be, and this is correct).
129     if ( src == end )
130     {
131         // No, we don't because this is the end of input.
132         src = NULL;
133         return 0;
134     }
135 
136     const wxUint16 u2 = *src++;
137     if ( (u2 < 0xdc00) || (u2 > 0xdfff) )
138     {
139         // No, it's not in the low surrogate range.
140         src = NULL;
141         return 0;
142     }
143 
144     // Yes, decode it and return the corresponding Unicode character.
145     return ((u - 0xd7c0) << 10) + (u2 - 0xdc00);
146 }
147 
148 // ----------------------------------------------------------------------------
149 // wxMBConv
150 // ----------------------------------------------------------------------------
151 
152 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const153 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
154                   const char *src, size_t srcLen) const
155 {
156     // although new conversion classes are supposed to implement this function
157     // directly, the existing ones only implement the old MB2WC() and so, to
158     // avoid to have to rewrite all conversion classes at once, we provide a
159     // default (but not efficient) implementation of this one in terms of the
160     // old function by copying the input to ensure that it's NUL-terminated and
161     // then using MB2WC() to convert it
162     //
163     // moreover, some conversion classes simply can't implement ToWChar()
164     // directly, the primary example is wxConvLibc: mbstowcs() only handles
165     // NUL-terminated strings
166 
167     // the number of chars [which would be] written to dst [if it were not NULL]
168     size_t dstWritten = 0;
169 
170     // the number of NULs terminating this string
171     size_t nulLen = 0;  // not really needed, but just to avoid warnings
172 
173     // if we were not given the input size we just have to assume that the
174     // string is properly terminated as we have no way of knowing how long it
175     // is anyhow, but if we do have the size check whether there are enough
176     // NULs at the end
177     wxCharBuffer bufTmp;
178     const char *srcEnd;
179     if ( srcLen != wxNO_LEN )
180     {
181         // we need to know how to find the end of this string
182         nulLen = GetMBNulLen();
183         if ( nulLen == wxCONV_FAILED )
184             return wxCONV_FAILED;
185 
186         // if there are enough NULs we can avoid the copy
187         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
188         {
189             // make a copy in order to properly NUL-terminate the string
190             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
191             char * const p = bufTmp.data();
192             memcpy(p, src, srcLen);
193             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
194                 *s = '\0';
195 
196             src = bufTmp;
197         }
198 
199         srcEnd = src + srcLen;
200     }
201     else // quit after the first loop iteration
202     {
203         srcEnd = NULL;
204     }
205 
206     // the idea of this code is straightforward: it converts a NUL-terminated
207     // chunk of the string during each iteration and updates the output buffer
208     // with the result
209     //
210     // all the complication come from the fact that this function, for
211     // historical reasons, must behave in 2 subtly different ways when it's
212     // called with a fixed number of characters and when it's called for the
213     // entire NUL-terminated string: in the former case (srcEnd != NULL) we
214     // must count all characters we convert, NUL or not; but in the latter we
215     // do not count the trailing NUL -- but still count all the NULs inside the
216     // string
217     //
218     // so for the (simple) former case we just always count the trailing NUL,
219     // but for the latter we need to wait until we see if there is going to be
220     // another loop iteration and only count it then
221     for ( ;; )
222     {
223         // try to convert the current chunk
224         size_t lenChunk = MB2WC(NULL, src, 0);
225         if ( lenChunk == wxCONV_FAILED )
226             return wxCONV_FAILED;
227 
228         dstWritten += lenChunk;
229         if ( !srcEnd )
230             dstWritten++;
231 
232         if ( dst )
233         {
234             if ( dstWritten > dstLen )
235                 return wxCONV_FAILED;
236 
237             // +1 is for trailing NUL
238             if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
239                 return wxCONV_FAILED;
240 
241             dst += lenChunk;
242             if ( !srcEnd )
243                 dst++;
244         }
245 
246         if ( !srcEnd )
247         {
248             // we convert just one chunk in this case as this is the entire
249             // string anyhow (and we don't count the trailing NUL in this case)
250             break;
251         }
252 
253         // advance the input pointer past the end of this chunk: notice that we
254         // will always stop before srcEnd because we know that the chunk is
255         // always properly NUL-terminated
256         while ( NotAllNULs(src, nulLen) )
257         {
258             // notice that we must skip over multiple bytes here as we suppose
259             // that if NUL takes 2 or 4 bytes, then all the other characters do
260             // too and so if advanced by a single byte we might erroneously
261             // detect sequences of NUL bytes in the middle of the input
262             src += nulLen;
263         }
264 
265         // if the buffer ends before this NUL, we shouldn't count it in our
266         // output so skip the code below
267         if ( src == srcEnd )
268             break;
269 
270         // do count this terminator as it's inside the buffer we convert
271         dstWritten++;
272         if ( dst )
273             dst++;
274 
275         src += nulLen; // skip the terminator itself
276 
277         if ( src >= srcEnd )
278             break;
279     }
280 
281     return dstWritten;
282 }
283 
284 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const285 wxMBConv::FromWChar(char *dst, size_t dstLen,
286                     const wchar_t *src, size_t srcLen) const
287 {
288     // the number of chars [which would be] written to dst [if it were not NULL]
289     size_t dstWritten = 0;
290 
291     // if we don't know its length we have no choice but to assume that it is
292     // NUL-terminated (notice that it can still be NUL-terminated even if
293     // explicit length is given but it doesn't change our return value)
294     const bool isNulTerminated = srcLen == wxNO_LEN;
295 
296     // make a copy of the input string unless it is already properly
297     // NUL-terminated
298     wxWCharBuffer bufTmp;
299     if ( isNulTerminated )
300     {
301         srcLen = wxWcslen(src) + 1;
302     }
303     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
304     {
305         // make a copy in order to properly NUL-terminate the string
306         bufTmp = wxWCharBuffer(srcLen);
307         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
308         src = bufTmp;
309     }
310 
311     const size_t lenNul = GetMBNulLen();
312     for ( const wchar_t * const srcEnd = src + srcLen;
313           src < srcEnd;
314           src++ /* skip L'\0' too */ )
315     {
316         // try to convert the current chunk
317         size_t lenChunk = WC2MB(NULL, src, 0);
318         if ( lenChunk == wxCONV_FAILED )
319             return wxCONV_FAILED;
320 
321         dstWritten += lenChunk;
322 
323         const wchar_t * const
324             chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);
325 
326         // our return value accounts for the trailing NUL(s), unlike that of
327         // WC2MB(), however don't do it for the last NUL we artificially added
328         // ourselves above
329         if ( chunkEnd < srcEnd )
330             dstWritten += lenNul;
331 
332         if ( dst )
333         {
334             if ( dstWritten > dstLen )
335                 return wxCONV_FAILED;
336 
337             // if we know that there is enough space in the destination buffer
338             // (because we accounted for lenNul in dstWritten above), we can
339             // convert directly in place -- but otherwise we need another
340             // temporary buffer to ensure that we don't overwrite the output
341             wxCharBuffer dstBuf;
342             char *dstTmp;
343             if ( chunkEnd == srcEnd )
344             {
345                 dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
346                 dstTmp = dstBuf.data();
347             }
348             else
349             {
350                 dstTmp = dst;
351             }
352 
353             if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
354                 return wxCONV_FAILED;
355 
356             if ( dstTmp != dst )
357             {
358                 // copy everything up to but excluding the terminating NUL(s)
359                 // into the real output buffer
360                 memcpy(dst, dstTmp, lenChunk);
361 
362                 // micro-optimization: if dstTmp != dst it means that chunkEnd
363                 // == srcEnd and so we're done, no need to update anything below
364                 break;
365             }
366 
367             dst += lenChunk;
368             if ( chunkEnd < srcEnd )
369                 dst += lenNul;
370         }
371 
372         src = chunkEnd;
373     }
374 
375     return dstWritten;
376 }
377 
MB2WC(wchar_t * outBuff,const char * inBuff,size_t outLen) const378 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
379 {
380     size_t rc = ToWChar(outBuff, outLen, inBuff);
381     if ( rc != wxCONV_FAILED )
382     {
383         // ToWChar() returns the buffer length, i.e. including the trailing
384         // NUL, while this method doesn't take it into account
385         rc--;
386     }
387 
388     return rc;
389 }
390 
WC2MB(char * outBuff,const wchar_t * inBuff,size_t outLen) const391 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
392 {
393     size_t rc = FromWChar(outBuff, outLen, inBuff);
394     if ( rc != wxCONV_FAILED )
395     {
396         rc -= GetMBNulLen();
397     }
398 
399     return rc;
400 }
401 
~wxMBConv()402 wxMBConv::~wxMBConv()
403 {
404     // nothing to do here (necessary for Darwin linking probably)
405 }
406 
cMB2WC(const char * psz) const407 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
408 {
409     if ( psz )
410     {
411         // calculate the length of the buffer needed first
412         const size_t nLen = ToWChar(NULL, 0, psz);
413         if ( nLen != wxCONV_FAILED )
414         {
415             // now do the actual conversion
416             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
417 
418             // +1 for the trailing NULL
419             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
420                 return buf;
421         }
422     }
423 
424     return wxWCharBuffer();
425 }
426 
cWC2MB(const wchar_t * pwz) const427 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
428 {
429     if ( pwz )
430     {
431         const size_t nLen = FromWChar(NULL, 0, pwz);
432         if ( nLen != wxCONV_FAILED )
433         {
434             wxCharBuffer buf(nLen - 1);
435             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
436                 return buf;
437         }
438     }
439 
440     return wxCharBuffer();
441 }
442 
443 const wxWCharBuffer
cMB2WC(const char * inBuff,size_t inLen,size_t * outLen) const444 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
445 {
446     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
447     if ( dstLen != wxCONV_FAILED )
448     {
449         // notice that we allocate space for dstLen+1 wide characters here
450         // because we want the buffer to always be NUL-terminated, even if the
451         // input isn't (as otherwise the caller has no way to know its length)
452         wxWCharBuffer wbuf(dstLen);
453         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
454         {
455             if ( outLen )
456             {
457                 *outLen = dstLen;
458 
459                 // we also need to handle NUL-terminated input strings
460                 // specially: for them the output is the length of the string
461                 // excluding the trailing NUL, however if we're asked to
462                 // convert a specific number of characters we return the length
463                 // of the resulting output even if it's NUL-terminated
464                 if ( inLen == wxNO_LEN )
465                     (*outLen)--;
466             }
467 
468             return wbuf;
469         }
470     }
471 
472     if ( outLen )
473         *outLen = 0;
474 
475     return wxWCharBuffer();
476 }
477 
478 const wxCharBuffer
cWC2MB(const wchar_t * inBuff,size_t inLen,size_t * outLen) const479 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
480 {
481     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
482     if ( dstLen != wxCONV_FAILED )
483     {
484         const size_t nulLen = GetMBNulLen();
485 
486         // as above, ensure that the buffer is always NUL-terminated, even if
487         // the input is not
488         wxCharBuffer buf(dstLen + nulLen - 1);
489         memset(buf.data() + dstLen, 0, nulLen);
490         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
491         {
492             if ( outLen )
493             {
494                 *outLen = dstLen;
495 
496                 if ( inLen == wxNO_LEN )
497                 {
498                     // in this case both input and output are NUL-terminated
499                     // and we're not supposed to count NUL
500                     *outLen -= nulLen;
501                 }
502             }
503 
504             return buf;
505         }
506     }
507 
508     if ( outLen )
509         *outLen = 0;
510 
511     return wxCharBuffer();
512 }
513 
cMB2WC(const wxScopedCharBuffer & buf) const514 const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
515 {
516     const size_t srcLen = buf.length();
517     if ( srcLen )
518     {
519         const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
520         if ( dstLen != wxCONV_FAILED )
521         {
522             wxWCharBuffer wbuf(dstLen);
523             wbuf.data()[dstLen] = L'\0';
524             if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
525                 return wbuf;
526         }
527     }
528 
529     return wxScopedWCharBuffer::CreateNonOwned(L"", 0);
530 }
531 
cWC2MB(const wxScopedWCharBuffer & wbuf) const532 const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
533 {
534     const size_t srcLen = wbuf.length();
535     if ( srcLen )
536     {
537         const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
538         if ( dstLen != wxCONV_FAILED )
539         {
540             wxCharBuffer buf(dstLen);
541             buf.data()[dstLen] = '\0';
542             if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
543                 return buf;
544         }
545     }
546 
547     return wxScopedCharBuffer::CreateNonOwned("", 0);
548 }
549 
550 // ----------------------------------------------------------------------------
551 // wxMBConvLibc
552 // ----------------------------------------------------------------------------
553 
MB2WC(wchar_t * buf,const char * psz,size_t n) const554 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
555 {
556     return wxMB2WC(buf, psz, n);
557 }
558 
WC2MB(char * buf,const wchar_t * psz,size_t n) const559 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
560 {
561     return wxWC2MB(buf, psz, n);
562 }
563 
564 // ----------------------------------------------------------------------------
565 // wxConvBrokenFileNames
566 // ----------------------------------------------------------------------------
567 
568 #ifdef __UNIX__
569 
wxConvBrokenFileNames(const wxString & charset)570 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
571 {
572     if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
573          wxStricmp(charset, wxT("UTF8")) == 0  )
574         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
575     else
576         m_conv = new wxCSConv(charset);
577 }
578 
579 #endif // __UNIX__
580 
581 // ----------------------------------------------------------------------------
582 // UTF-7
583 // ----------------------------------------------------------------------------
584 
585 // Implementation (C) 2004 Fredrik Roubert
586 //
587 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
588 
//
// BASE64 decoding table: indexed by the byte value, it gives the 6 bit value
// encoded by this character or 0xff if it is not a BASE64 character
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0x00..0x07
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0x08..0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0x10..0x17
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0x18..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0x20..0x27
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,     // '+' and '/'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,     // '0'..'7'
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // '8' and '9'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,     // 'A'..'G'
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,     // 'H'..'O'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,     // 'P'..'W'
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,     // 'X'..'Z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,     // 'a'..'g'
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,     // 'h'..'o'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,     // 'p'..'w'
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,     // 'x'..'z'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0x80..0x87
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0x88..0x8f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0x90..0x97
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0x98..0x9f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0xa0..0xa7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0xa8..0xaf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0xb0..0xb7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0xb8..0xbf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0xc0..0xc7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0xc8..0xcf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0xd0..0xd7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0xd8..0xdf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0xe0..0xe7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0xe8..0xef
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     // 0xf0..0xf7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff      // 0xf8..0xff
};
627 
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const628 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
629                              const char *src, size_t srcLen) const
630 {
631     DecoderState stateOrig,
632                 *statePtr;
633     if ( srcLen == wxNO_LEN )
634     {
635         // convert the entire string, up to and including the trailing NUL
636         srcLen = strlen(src) + 1;
637 
638         // when working on the entire strings we don't update nor use the shift
639         // state from the previous call
640         statePtr = &stateOrig;
641     }
642     else // when working with partial strings we do use the shift state
643     {
644         statePtr = const_cast<DecoderState *>(&m_stateDecoder);
645 
646         // also save the old state to be able to rollback to it on error
647         stateOrig = m_stateDecoder;
648     }
649 
650     // but to simplify the code below we use this variable in both cases
651     DecoderState& state = *statePtr;
652 
653 
654     // number of characters [which would have been] written to dst [if it were
655     // not NULL]
656     size_t len = 0;
657 
658     const char * const srcEnd = src + srcLen;
659 
660     while ( (src < srcEnd) && (!dst || (len < dstLen)) )
661     {
662         const unsigned char cc = *src++;
663 
664         if ( state.IsShifted() )
665         {
666             const unsigned char dc = utf7unb64[cc];
667             if ( dc == 0xff )
668             {
669                 // end of encoded part, check that nothing was left: there can
670                 // be up to 4 bits of 0 padding but nothing else (we also need
671                 // to check isLSB as we count bits modulo 8 while a valid UTF-7
672                 // encoded sequence must contain an integral number of UTF-16
673                 // characters)
674                 if ( state.isLSB || state.bit > 4 ||
675                         (state.accum & ((1 << state.bit) - 1)) )
676                 {
677                     if ( !len )
678                         state = stateOrig;
679 
680                     return wxCONV_FAILED;
681                 }
682 
683                 state.ToDirect();
684 
685                 // re-parse this character normally below unless it's '-' which
686                 // is consumed by the decoder
687                 if ( cc == '-' )
688                     continue;
689             }
690             else // valid encoded character
691             {
692                 // mini base64 decoder: each character is 6 bits
693                 state.bit += 6;
694                 state.accum <<= 6;
695                 state.accum += dc;
696 
697                 if ( state.bit >= 8 )
698                 {
699                     // got the full byte, consume it
700                     state.bit -= 8;
701                     unsigned char b = (state.accum >> state.bit) & 0x00ff;
702 
703                     if ( state.isLSB )
704                     {
705                         // we've got the full word, output it
706                         if ( dst )
707                             *dst++ = (state.msb << 8) | b;
708                         len++;
709                         state.isLSB = false;
710                     }
711                     else // MSB
712                     {
713                         // just store it while we wait for LSB
714                         state.msb = b;
715                         state.isLSB = true;
716                     }
717                 }
718             }
719         }
720 
721         if ( state.IsDirect() )
722         {
723             // start of an encoded segment?
724             if ( cc == '+' )
725             {
726                 // Can't end with a plus sign.
727                 if ( src == srcEnd )
728                     return wxCONV_FAILED;
729 
730                 if ( *src == '-' )
731                 {
732                     // just the encoded plus sign, don't switch to shifted mode
733                     if ( dst )
734                         *dst++ = '+';
735                     len++;
736                     src++;
737                 }
738                 else if ( utf7unb64[(unsigned)*src] == 0xff )
739                 {
740                     // empty encoded chunks are not allowed
741                     if ( !len )
742                         state = stateOrig;
743 
744                     return wxCONV_FAILED;
745                 }
746                 else // base-64 encoded chunk follows
747                 {
748                     state.ToShifted();
749                 }
750             }
751             else // not '+'
752             {
753                 // only printable 7 bit ASCII characters (with the exception of
754                 // NUL, TAB, CR and LF) can be used directly
755                 if ( cc >= 0x7f || (cc < ' ' &&
756                       !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
757                     return wxCONV_FAILED;
758 
759                 if ( dst )
760                     *dst++ = cc;
761                 len++;
762             }
763         }
764     }
765 
766     if ( !len )
767     {
768         // as we didn't read any characters we should be called with the same
769         // data (followed by some more new data) again later so don't save our
770         // state
771         state = stateOrig;
772 
773         return wxCONV_FAILED;
774     }
775 
776     return len;
777 }
778 
//
// BASE64 encoding table: maps each 6 bit value to the character encoding it
//
static const unsigned char utf7enb64[64] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',     //  0..7
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',     //  8..15
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',     // 16..23
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',     // 24..31
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',     // 32..39
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',     // 40..47
    'w', 'x', 'y', 'z', '0', '1', '2', '3',     // 48..55
    '4', '5', '6', '7', '8', '9', '+', '/'      // 56..63
};
793 
//
// UTF-7 encoding table: gives the class of each of the 7 bit characters
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,     // 0x00..0x0f
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     // 0x10..0x1f
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,     // ' '..'/'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,     // '0'..'?'
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // '@'..'O'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,     // 'P'..'_'
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // '`'..'o'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3      // 'p'..0x7f
};
813 
wxIsUTF7Direct(wchar_t wc)814 static inline bool wxIsUTF7Direct(wchar_t wc)
815 {
816     return wc < 0x80 && utf7encode[wc] < 1;
817 }
818 
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const819 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
820                                const wchar_t *src, size_t srcLen) const
821 {
822     EncoderState stateOrig,
823                 *statePtr;
824     if ( srcLen == wxNO_LEN )
825     {
826         // we don't apply the stored state when operating on entire strings at
827         // once
828         statePtr = &stateOrig;
829 
830         srcLen = wxWcslen(src) + 1;
831     }
832     else // do use the mode we left the output in previously
833     {
834         stateOrig = m_stateEncoder;
835         statePtr = const_cast<EncoderState *>(&m_stateEncoder);
836     }
837 
838     EncoderState& state = *statePtr;
839 
840 
841     size_t len = 0;
842 
843     const wchar_t * const srcEnd = src + srcLen;
844     while ( src < srcEnd && (!dst || len < dstLen) )
845     {
846         wchar_t cc = *src++;
847         if ( wxIsUTF7Direct(cc) )
848         {
849             if ( state.IsShifted() )
850             {
851                 // pad with zeros the last encoded block if necessary
852                 if ( state.bit )
853                 {
854                     if ( dst )
855                         *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
856                     len++;
857                 }
858 
859                 state.ToDirect();
860 
861                 if ( dst )
862                     *dst++ = '-';
863                 len++;
864             }
865 
866             if ( dst )
867                 *dst++ = (char)cc;
868             len++;
869         }
870         else if ( cc == '+' && state.IsDirect() )
871         {
872             if ( dst )
873             {
874                 *dst++ = '+';
875                 *dst++ = '-';
876             }
877 
878             len += 2;
879         }
880 #ifndef WC_UTF16
881         else if (((wxUint32)cc) > 0xffff)
882         {
883             // no surrogate pair generation (yet?)
884             return wxCONV_FAILED;
885         }
886 #endif
887         else
888         {
889             if ( state.IsDirect() )
890             {
891                 state.ToShifted();
892 
893                 if ( dst )
894                     *dst++ = '+';
895                 len++;
896             }
897 
898             // BASE64 encode string
899             for ( ;; )
900             {
901                 for ( unsigned lsb = 0; lsb < 2; lsb++ )
902                 {
903                     state.accum <<= 8;
904                     state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
905 
906                     for (state.bit += 8; state.bit >= 6; )
907                     {
908                         state.bit -= 6;
909                         if ( dst )
910                             *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
911                         len++;
912                     }
913                 }
914 
915                 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
916                     break;
917 
918                 src++;
919             }
920         }
921     }
922 
923     // we need to restore the original encoder state if we were called just to
924     // calculate the amount of space needed as we will presumably be called
925     // again to really convert the data now
926     if ( !dst )
927         state = stateOrig;
928 
929     return len;
930 }
931 
932 // ----------------------------------------------------------------------------
933 // UTF-8
934 // ----------------------------------------------------------------------------
935 
// utf8_max[i] is the largest value which can be stored in a sequence of
// (i + 1) bytes. The lenient UTF-8 converter below uses it both to choose the
// length of the sequence when encoding and to detect overlong sequences when
// decoding. No 32-bit value exceeds the last element, so a search through
// this table always terminates.
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area range we use to (temporarily) remap the
// bytes which are invalid in a UTF-8 encoded string: the byte b is mapped to
// wxUnicodePUA + b and wxUnicodePUAEnd is one past the last value used
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
943 
// this table gives the total length (in bytes) of a UTF-8 sequence from its
// first byte; a 0 entry means that the byte can't start a sequence, the strict
// decoder below treats such bytes as an error:
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid:
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
978 
979 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const980 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
981                             const char *src, size_t srcLen) const
982 {
983     wchar_t *out = dstLen ? dst : NULL;
984     size_t written = 0;
985 
986     if ( srcLen == wxNO_LEN )
987         srcLen = strlen(src) + 1;
988 
989     for ( const char *p = src; ; p++ )
990     {
991         if ( (srcLen == wxNO_LEN ? !*p : !srcLen) )
992         {
993             // all done successfully, just add the trailing NULL if we are not
994             // using explicit length
995             if ( srcLen == wxNO_LEN )
996             {
997                 if ( out )
998                 {
999                     if ( !dstLen )
1000                         break;
1001 
1002                     *out = L'\0';
1003                 }
1004 
1005                 written++;
1006             }
1007 
1008             return written;
1009         }
1010 
1011         if ( out && !dstLen-- )
1012             break;
1013 
1014         wxUint32 code;
1015         unsigned char c = *p;
1016 
1017         if ( c < 0x80 )
1018         {
1019             if ( srcLen == 0 ) // the test works for wxNO_LEN too
1020                 break;
1021 
1022             if ( srcLen != wxNO_LEN )
1023                 srcLen--;
1024 
1025             code = c;
1026         }
1027         else
1028         {
1029             unsigned len = tableUtf8Lengths[c];
1030             if ( !len )
1031                 break;
1032 
1033             if ( srcLen < len ) // the test works for wxNO_LEN too
1034                 break;
1035 
1036             if ( srcLen != wxNO_LEN )
1037                 srcLen -= len;
1038 
1039             //   Char. number range   |        UTF-8 octet sequence
1040             //      (hexadecimal)     |              (binary)
1041             //  ----------------------+----------------------------------------
1042             //  0000 0000 - 0000 007F | 0xxxxxxx
1043             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1044             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1045             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1046             //
1047             //  Code point value is stored in bits marked with 'x',
1048             //  lowest-order bit of the value on the right side in the diagram
1049             //  above.                                         (from RFC 3629)
1050 
1051             // mask to extract lead byte's value ('x' bits above), by sequence
1052             // length:
1053             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1054 
1055             len--; // it's more convenient to work with 0-based length here
1056 
1057             code = c & leadValueMask[len];
1058 
1059             // all remaining bytes, if any, are handled in the same way
1060             // regardless of sequence's length:
1061             for ( ; len; --len )
1062             {
1063                 c = *++p;
1064                 if ( (c & 0xC0) != 0x80 )
1065                     return wxCONV_FAILED;
1066 
1067                 code <<= 6;
1068                 code |= c & 0x3F;
1069             }
1070         }
1071 
1072 #ifdef WC_UTF16
1073         // cast is ok because wchar_t == wxUint16 if WC_UTF16
1074         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1075         {
1076             if ( out )
1077                 out++;
1078             written++;
1079         }
1080 #else // !WC_UTF16
1081         if ( out )
1082             *out = code;
1083 #endif // WC_UTF16/!WC_UTF16
1084 
1085         if ( out )
1086             out++;
1087 
1088         written++;
1089     }
1090 
1091     return wxCONV_FAILED;
1092 }
1093 
1094 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1095 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1096                               const wchar_t *src, size_t srcLen) const
1097 {
1098     char *out = dstLen ? dst : NULL;
1099     size_t written = 0;
1100 
1101     const wchar_t* const end = srcLen == wxNO_LEN ? NULL : src + srcLen;
1102     for ( const wchar_t *wp = src; ; )
1103     {
1104         if ( end ? wp == end : !*wp )
1105         {
1106             // all done successfully, just add the trailing NULL if we are not
1107             // using explicit length
1108             if ( srcLen == wxNO_LEN )
1109             {
1110                 if ( out )
1111                 {
1112                     if ( !dstLen )
1113                         break;
1114 
1115                     *out = '\0';
1116                 }
1117 
1118                 written++;
1119             }
1120 
1121             return written;
1122         }
1123 
1124         wxUint32 code;
1125 #ifdef WC_UTF16
1126         code = wxDecodeSurrogate(&wp, end);
1127         if ( !wp )
1128             return wxCONV_FAILED;
1129 #else // wchar_t is UTF-32
1130         code = *wp++ & 0x7fffffff;
1131 #endif
1132 
1133         unsigned len;
1134         if ( code <= 0x7F )
1135         {
1136             len = 1;
1137             if ( out )
1138             {
1139                 if ( dstLen < len )
1140                     break;
1141 
1142                 out[0] = (char)code;
1143             }
1144         }
1145         else if ( code <= 0x07FF )
1146         {
1147             len = 2;
1148             if ( out )
1149             {
1150                 if ( dstLen < len )
1151                     break;
1152 
1153                 // NB: this line takes 6 least significant bits, encodes them as
1154                 // 10xxxxxx and discards them so that the next byte can be encoded:
1155                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1156                 out[0] = 0xC0 | code;
1157             }
1158         }
1159         else if ( code < 0xFFFF )
1160         {
1161             len = 3;
1162             if ( out )
1163             {
1164                 if ( dstLen < len )
1165                     break;
1166 
1167                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1168                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1169                 out[0] = 0xE0 | code;
1170             }
1171         }
1172         else if ( code <= 0x10FFFF )
1173         {
1174             len = 4;
1175             if ( out )
1176             {
1177                 if ( dstLen < len )
1178                     break;
1179 
1180                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
1181                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1182                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1183                 out[0] = 0xF0 | code;
1184             }
1185         }
1186         else
1187         {
1188             wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1189             break;
1190         }
1191 
1192         if ( out )
1193         {
1194             out += len;
1195             dstLen -= len;
1196         }
1197 
1198         written += len;
1199     }
1200 
1201     // we only get here if an error occurs during decoding
1202     return wxCONV_FAILED;
1203 }
1204 
// Lenient UTF-8 decoder: unlike the strict version (to which it defers when
// m_options is MAP_INVALID_UTF8_NOT), bytes not forming valid UTF-8 are not an
// error but are mapped according to m_options, either to characters in the
// wxUnicodePUA range or to 4 character octal escapes ("\ooo").
//
// Parameters:
//  buf, n       output buffer and its capacity in wchar_t units; if buf is
//               NULL only the length is computed
//  psz, srcLen  input bytes; srcLen may be wxNO_LEN for NUL-terminated input
//
// Returns the number of wchar_t units produced, including the trailing NUL
// for the NUL-terminated input.
size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
                             const char *psz, size_t srcLen) const
{
    if ( m_options == MAP_INVALID_UTF8_NOT )
        return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);

    size_t len = 0;

    // The length can be either given explicitly or computed implicitly for the
    // NUL-terminated strings.
    const bool isNulTerminated = srcLen == wxNO_LEN;
    while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
    {
        // remember the start of the sequence to be able to remap all of its
        // bytes if it turns out to be invalid
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;

        // count the number of leading 1 bits in the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;

        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // cnt is now the number of continuation bytes expected; 0 means
            // that the first byte was itself a continuation byte (10xxxxxx)
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                invalid = true;
            }
            else
            {
                // index into utf8_max of the longest shorter sequence
                unsigned ocnt = cnt - 1;
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    if (!isNulTerminated && !srcLen)
                    {
                        // invalid UTF-8 sequence ending before the end of code
                        // point.
                        invalid = true;
                        break;
                    }

                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        invalid = true;
                        break;
                    }

                    psz++;
                    if (!isNulTerminated)
                        srcLen--;
                    res = (res << 6) | (cc & 0x3f);
                }

                // a value which would have fit into a shorter sequence means
                // that this one is overlong
                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == wxCONV_FAILED)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }

            if (invalid)
            {
                // remap each byte consumed so far, from opsz up to psz
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != wxCONV_FAILED);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
                        // NOTE(review): the escape is only written if all 4 of
                        // its characters fit but len is advanced by 4 in any
                        // case, so the returned length may then exceed n
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }

                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return wxCONV_FAILED;
                }
            }
        }
    }

    if ( isNulTerminated )
    {
        // Add the trailing NUL in this case if we have a large enough buffer.
        if ( buf && (len < n) )
            *buf = 0;

        // And count it in any case.
        len++;
    }

    return len;
}
1372 
// Return true if the given wide character is one of the octal digits '0'..'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1377 
FromWChar(char * buf,size_t n,const wchar_t * psz,size_t srcLen) const1378 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1379                                const wchar_t *psz, size_t srcLen) const
1380 {
1381     if ( m_options == MAP_INVALID_UTF8_NOT )
1382         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1383 
1384     size_t len = 0;
1385 
1386     // The length can be either given explicitly or computed implicitly for the
1387     // NUL-terminated strings.
1388     const wchar_t* const end = srcLen == wxNO_LEN ? NULL : psz + srcLen;
1389     while ((end ? psz < end : *psz) && ((!buf) || (len < n)))
1390     {
1391         wxUint32 cc;
1392 
1393 #ifdef WC_UTF16
1394         cc = wxDecodeSurrogate(&psz, end);
1395         if ( !psz )
1396             return wxCONV_FAILED;
1397 #else
1398         cc = (*psz++) & 0x7fffffff;
1399 #endif
1400 
1401         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1402                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1403         {
1404             if (buf)
1405                 *buf++ = (char)(cc - wxUnicodePUA);
1406             len++;
1407         }
1408         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1409                     && cc == L'\\' && psz[0] == L'\\' )
1410         {
1411             if (buf)
1412                 *buf++ = (char)cc;
1413             psz++;
1414             len++;
1415         }
1416         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1417                     cc == L'\\' &&
1418                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1419         {
1420             if (buf)
1421             {
1422                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1423                                  (psz[1] - L'0') * 010 +
1424                                  (psz[2] - L'0'));
1425             }
1426 
1427             psz += 3;
1428             len++;
1429         }
1430         else
1431         {
1432             unsigned cnt;
1433             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1434             {
1435             }
1436 
1437             if (!cnt)
1438             {
1439                 // plain ASCII char
1440                 if (buf)
1441                     *buf++ = (char) cc;
1442                 len++;
1443             }
1444             else
1445             {
1446                 len += cnt + 1;
1447                 if (buf)
1448                 {
1449                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1450                     while (cnt--)
1451                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1452                 }
1453             }
1454         }
1455     }
1456 
1457     if ( !end )
1458     {
1459         // Add the trailing NUL in this case if we have a large enough buffer.
1460         if ( buf && (len < n) )
1461             *buf = 0;
1462 
1463         // And count it in any case.
1464         len++;
1465     }
1466 
1467     return len;
1468 }
1469 
1470 // ============================================================================
1471 // UTF-16
1472 // ============================================================================
1473 
// The code below is written in terms of a "straight" converter, which copies
// the 16 bit units unchanged, and a "swap" one, which swaps the bytes of each
// unit: map these names to the BE and LE classes depending on whether
// WORDS_BIGENDIAN is defined.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
1481 
1482 /* static */
GetLength(const char * src,size_t srcLen)1483 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1484 {
1485     if ( srcLen == wxNO_LEN )
1486     {
1487         // count the number of bytes in input, including the trailing NULs
1488         const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1489         for ( srcLen = 1; *inBuff++; srcLen++ )
1490             ;
1491 
1492         srcLen *= BYTES_PER_CHAR;
1493     }
1494     else // we already have the length
1495     {
1496         // we can only convert an entire number of UTF-16 characters
1497         if ( srcLen % BYTES_PER_CHAR )
1498             return wxCONV_FAILED;
1499     }
1500 
1501     return srcLen;
1502 }
1503 
1504 // case when in-memory representation is UTF-16 too
1505 #ifdef WC_UTF16
1506 
1507 // ----------------------------------------------------------------------------
1508 // conversions without endianness change
1509 // ----------------------------------------------------------------------------
1510 
1511 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1512 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1513                                const char *src, size_t srcLen) const
1514 {
1515     // set up the scene for using memcpy() (which is presumably more efficient
1516     // than copying the bytes one by one)
1517     srcLen = GetLength(src, srcLen);
1518     if ( srcLen == wxNO_LEN )
1519         return wxCONV_FAILED;
1520 
1521     const size_t inLen = srcLen / BYTES_PER_CHAR;
1522     if ( dst )
1523     {
1524         if ( dstLen < inLen )
1525             return wxCONV_FAILED;
1526 
1527         memcpy(dst, src, srcLen);
1528     }
1529 
1530     return inLen;
1531 }
1532 
1533 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1534 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1535                                  const wchar_t *src, size_t srcLen) const
1536 {
1537     if ( srcLen == wxNO_LEN )
1538         srcLen = wxWcslen(src) + 1;
1539 
1540     srcLen *= BYTES_PER_CHAR;
1541 
1542     if ( dst )
1543     {
1544         if ( dstLen < srcLen )
1545             return wxCONV_FAILED;
1546 
1547         memcpy(dst, src, srcLen);
1548     }
1549 
1550     return srcLen;
1551 }
1552 
1553 // ----------------------------------------------------------------------------
1554 // endian-reversing conversions
1555 // ----------------------------------------------------------------------------
1556 
1557 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1558 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1559                            const char *src, size_t srcLen) const
1560 {
1561     srcLen = GetLength(src, srcLen);
1562     if ( srcLen == wxNO_LEN )
1563         return wxCONV_FAILED;
1564 
1565     srcLen /= BYTES_PER_CHAR;
1566 
1567     if ( dst )
1568     {
1569         if ( dstLen < srcLen )
1570             return wxCONV_FAILED;
1571 
1572         const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1573         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1574         {
1575             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1576         }
1577     }
1578 
1579     return srcLen;
1580 }
1581 
1582 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1583 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1584                              const wchar_t *src, size_t srcLen) const
1585 {
1586     if ( srcLen == wxNO_LEN )
1587         srcLen = wxWcslen(src) + 1;
1588 
1589     srcLen *= BYTES_PER_CHAR;
1590 
1591     if ( dst )
1592     {
1593         if ( dstLen < srcLen )
1594             return wxCONV_FAILED;
1595 
1596         wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1597         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1598         {
1599             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1600         }
1601     }
1602 
1603     return srcLen;
1604 }
1605 
1606 #else // !WC_UTF16: wchar_t is UTF-32
1607 
1608 // ----------------------------------------------------------------------------
1609 // conversions without endianness change
1610 // ----------------------------------------------------------------------------
1611 
1612 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1613 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1614                                const char *src, size_t srcLen) const
1615 {
1616     srcLen = GetLength(src, srcLen);
1617     if ( srcLen == wxNO_LEN )
1618         return wxCONV_FAILED;
1619 
1620     const size_t inLen = srcLen / BYTES_PER_CHAR;
1621     size_t outLen = 0;
1622     const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1623     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1624     {
1625         const wxUint32 ch = wxDecodeSurrogate(&inBuff, inEnd);
1626         if ( !inBuff )
1627             return wxCONV_FAILED;
1628 
1629         outLen++;
1630 
1631         if ( dst )
1632         {
1633             if ( outLen > dstLen )
1634                 return wxCONV_FAILED;
1635 
1636             *dst++ = ch;
1637         }
1638     }
1639 
1640 
1641     return outLen;
1642 }
1643 
1644 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1645 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1646                                  const wchar_t *src, size_t srcLen) const
1647 {
1648     if ( srcLen == wxNO_LEN )
1649         srcLen = wxWcslen(src) + 1;
1650 
1651     size_t outLen = 0;
1652     wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1653     for ( size_t n = 0; n < srcLen; n++ )
1654     {
1655         wxUint16 cc[2] = { 0 };
1656         const size_t numChars = encode_utf16(*src++, cc);
1657         if ( numChars == wxCONV_FAILED )
1658             return wxCONV_FAILED;
1659 
1660         outLen += numChars * BYTES_PER_CHAR;
1661         if ( outBuff )
1662         {
1663             if ( outLen > dstLen )
1664                 return wxCONV_FAILED;
1665 
1666             *outBuff++ = cc[0];
1667             if ( numChars == 2 )
1668             {
1669                 // second character of a surrogate
1670                 *outBuff++ = cc[1];
1671             }
1672         }
1673     }
1674 
1675     return outLen;
1676 }
1677 
1678 // ----------------------------------------------------------------------------
1679 // endian-reversing conversions
1680 // ----------------------------------------------------------------------------
1681 
1682 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1683 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1684                            const char *src, size_t srcLen) const
1685 {
1686     srcLen = GetLength(src, srcLen);
1687     if ( srcLen == wxNO_LEN )
1688         return wxCONV_FAILED;
1689 
1690     const size_t inLen = srcLen / BYTES_PER_CHAR;
1691     size_t outLen = 0;
1692     const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1693     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1694     {
1695         wxUint16 tmp[2];
1696         const wxUint16* tmpEnd = tmp;
1697 
1698         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1699         tmpEnd++;
1700 
1701         if ( inBuff + 1 < inEnd )
1702         {
1703             // Normal case, we have a next character to decode.
1704             tmp[1] = wxUINT16_SWAP_ALWAYS(inBuff[1]);
1705             tmpEnd++;
1706         }
1707 
1708         const wxUint16* p = tmp;
1709         const wxUint32 ch = wxDecodeSurrogate(&p, tmpEnd);
1710         if ( !p )
1711             return wxCONV_FAILED;
1712 
1713         // Move the real pointer by the same amount as "p" was updated by.
1714         inBuff += p - tmp;
1715 
1716         outLen++;
1717 
1718         if ( dst )
1719         {
1720             if ( outLen > dstLen )
1721                 return wxCONV_FAILED;
1722 
1723             *dst++ = ch;
1724         }
1725     }
1726 
1727 
1728     return outLen;
1729 }
1730 
1731 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1732 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1733                              const wchar_t *src, size_t srcLen) const
1734 {
1735     if ( srcLen == wxNO_LEN )
1736         srcLen = wxWcslen(src) + 1;
1737 
1738     size_t outLen = 0;
1739     wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1740     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1741     {
1742         wxUint16 cc[2] = { 0 };
1743         const size_t numChars = encode_utf16(*src, cc);
1744         if ( numChars == wxCONV_FAILED )
1745             return wxCONV_FAILED;
1746 
1747         outLen += numChars * BYTES_PER_CHAR;
1748         if ( outBuff )
1749         {
1750             if ( outLen > dstLen )
1751                 return wxCONV_FAILED;
1752 
1753             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1754             if ( numChars == 2 )
1755             {
1756                 // second character of a surrogate
1757                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1758             }
1759         }
1760     }
1761 
1762     return outLen;
1763 }
1764 
1765 #endif // WC_UTF16/!WC_UTF16
1766 
1767 
1768 // ============================================================================
1769 // UTF-32
1770 // ============================================================================
1771 
// As for UTF-16 above, map the "straight" and "swap" converter names used in
// the code below to the BE and LE classes depending on whether
// WORDS_BIGENDIAN is defined.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// global converter objects for UTF-32 in each of the byte orders
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1783 
1784 /* static */
GetLength(const char * src,size_t srcLen)1785 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1786 {
1787     if ( srcLen == wxNO_LEN )
1788     {
1789         // count the number of bytes in input, including the trailing NULs
1790         const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1791         for ( srcLen = 1; *inBuff++; srcLen++ )
1792             ;
1793 
1794         srcLen *= BYTES_PER_CHAR;
1795     }
1796     else // we already have the length
1797     {
1798         // we can only convert an entire number of UTF-32 characters
1799         if ( srcLen % BYTES_PER_CHAR )
1800             return wxCONV_FAILED;
1801     }
1802 
1803     return srcLen;
1804 }
1805 
1806 // case when in-memory representation is UTF-16
1807 #ifdef WC_UTF16
1808 
1809 // ----------------------------------------------------------------------------
1810 // conversions without endianness change
1811 // ----------------------------------------------------------------------------
1812 
1813 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1814 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1815                                const char *src, size_t srcLen) const
1816 {
1817     srcLen = GetLength(src, srcLen);
1818     if ( srcLen == wxNO_LEN )
1819         return wxCONV_FAILED;
1820 
1821     const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1822     const size_t inLen = srcLen / BYTES_PER_CHAR;
1823     size_t outLen = 0;
1824     for ( size_t n = 0; n < inLen; n++ )
1825     {
1826         wxUint16 cc[2] = { 0 };
1827         const size_t numChars = encode_utf16(*inBuff++, cc);
1828         if ( numChars == wxCONV_FAILED )
1829             return wxCONV_FAILED;
1830 
1831         outLen += numChars;
1832         if ( dst )
1833         {
1834             if ( outLen > dstLen )
1835                 return wxCONV_FAILED;
1836 
1837             *dst++ = cc[0];
1838             if ( numChars == 2 )
1839             {
1840                 // second character of a surrogate
1841                 *dst++ = cc[1];
1842             }
1843         }
1844     }
1845 
1846     return outLen;
1847 }
1848 
1849 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1850 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1851                                  const wchar_t *src, size_t srcLen) const
1852 {
1853     if ( srcLen == wxNO_LEN )
1854         srcLen = wxWcslen(src) + 1;
1855 
1856     if ( !dst )
1857     {
1858         // optimization: return maximal space which could be needed for this
1859         // string instead of the exact amount which could be less if there are
1860         // any surrogates in the input
1861         //
1862         // we consider that surrogates are rare enough to make it worthwhile to
1863         // avoid running the loop below at the cost of slightly extra memory
1864         // consumption
1865         return srcLen * BYTES_PER_CHAR;
1866     }
1867 
1868     wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1869     size_t outLen = 0;
1870     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1871     {
1872         const wxUint32 ch = wxDecodeSurrogate(&src, srcEnd);
1873         if ( !src )
1874             return wxCONV_FAILED;
1875 
1876         outLen += BYTES_PER_CHAR;
1877 
1878         if ( outLen > dstLen )
1879             return wxCONV_FAILED;
1880 
1881         *outBuff++ = ch;
1882     }
1883 
1884     return outLen;
1885 }
1886 
1887 // ----------------------------------------------------------------------------
1888 // endian-reversing conversions
1889 // ----------------------------------------------------------------------------
1890 
1891 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1892 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1893                            const char *src, size_t srcLen) const
1894 {
1895     srcLen = GetLength(src, srcLen);
1896     if ( srcLen == wxNO_LEN )
1897         return wxCONV_FAILED;
1898 
1899     const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1900     const size_t inLen = srcLen / BYTES_PER_CHAR;
1901     size_t outLen = 0;
1902     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1903     {
1904         wxUint16 cc[2] = { 0 };
1905         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1906         if ( numChars == wxCONV_FAILED )
1907             return wxCONV_FAILED;
1908 
1909         outLen += numChars;
1910         if ( dst )
1911         {
1912             if ( outLen > dstLen )
1913                 return wxCONV_FAILED;
1914 
1915             *dst++ = cc[0];
1916             if ( numChars == 2 )
1917             {
1918                 // second character of a surrogate
1919                 *dst++ = cc[1];
1920             }
1921         }
1922     }
1923 
1924     return outLen;
1925 }
1926 
1927 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1928 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1929                              const wchar_t *src, size_t srcLen) const
1930 {
1931     if ( srcLen == wxNO_LEN )
1932         srcLen = wxWcslen(src) + 1;
1933 
1934     if ( !dst )
1935     {
1936         // optimization: return maximal space which could be needed for this
1937         // string instead of the exact amount which could be less if there are
1938         // any surrogates in the input
1939         //
1940         // we consider that surrogates are rare enough to make it worthwhile to
1941         // avoid running the loop below at the cost of slightly extra memory
1942         // consumption
1943         return srcLen*BYTES_PER_CHAR;
1944     }
1945 
1946     wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1947     size_t outLen = 0;
1948     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1949     {
1950         const wxUint32 ch = wxDecodeSurrogate(&src, srcEnd);
1951         if ( !src )
1952             return wxCONV_FAILED;
1953 
1954         outLen += BYTES_PER_CHAR;
1955 
1956         if ( outLen > dstLen )
1957             return wxCONV_FAILED;
1958 
1959         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1960     }
1961 
1962     return outLen;
1963 }
1964 
1965 #else // !WC_UTF16: wchar_t is UTF-32
1966 
1967 // ----------------------------------------------------------------------------
1968 // conversions without endianness change
1969 // ----------------------------------------------------------------------------
1970 
1971 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1972 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1973                                const char *src, size_t srcLen) const
1974 {
1975     // use memcpy() as it should be much faster than hand-written loop
1976     srcLen = GetLength(src, srcLen);
1977     if ( srcLen == wxNO_LEN )
1978         return wxCONV_FAILED;
1979 
1980     const size_t inLen = srcLen/BYTES_PER_CHAR;
1981     if ( dst )
1982     {
1983         if ( dstLen < inLen )
1984             return wxCONV_FAILED;
1985 
1986         memcpy(dst, src, srcLen);
1987     }
1988 
1989     return inLen;
1990 }
1991 
1992 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1993 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1994                                  const wchar_t *src, size_t srcLen) const
1995 {
1996     if ( srcLen == wxNO_LEN )
1997         srcLen = wxWcslen(src) + 1;
1998 
1999     srcLen *= BYTES_PER_CHAR;
2000 
2001     if ( dst )
2002     {
2003         if ( dstLen < srcLen )
2004             return wxCONV_FAILED;
2005 
2006         memcpy(dst, src, srcLen);
2007     }
2008 
2009     return srcLen;
2010 }
2011 
2012 // ----------------------------------------------------------------------------
2013 // endian-reversing conversions
2014 // ----------------------------------------------------------------------------
2015 
2016 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const2017 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2018                            const char *src, size_t srcLen) const
2019 {
2020     srcLen = GetLength(src, srcLen);
2021     if ( srcLen == wxNO_LEN )
2022         return wxCONV_FAILED;
2023 
2024     srcLen /= BYTES_PER_CHAR;
2025 
2026     if ( dst )
2027     {
2028         if ( dstLen < srcLen )
2029             return wxCONV_FAILED;
2030 
2031         const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
2032         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
2033         {
2034             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
2035         }
2036     }
2037 
2038     return srcLen;
2039 }
2040 
2041 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const2042 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2043                              const wchar_t *src, size_t srcLen) const
2044 {
2045     if ( srcLen == wxNO_LEN )
2046         srcLen = wxWcslen(src) + 1;
2047 
2048     srcLen *= BYTES_PER_CHAR;
2049 
2050     if ( dst )
2051     {
2052         if ( dstLen < srcLen )
2053             return wxCONV_FAILED;
2054 
2055         wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
2056         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
2057         {
2058             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
2059         }
2060     }
2061 
2062     return srcLen;
2063 }
2064 
2065 #endif // WC_UTF16/!WC_UTF16
2066 
2067 
2068 // ============================================================================
2069 // The classes doing conversion using the iconv_xxx() functions
2070 // ============================================================================
2071 
2072 #ifdef HAVE_ICONV
2073 
2074 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2075 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
2076 //     (unless there's yet another bug in glibc) the only case when iconv()
2077 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
2078 //     left in the input buffer -- when _real_ error occurs,
2079 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2080 //     iconv() failure.
2081 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if the iconv() call which returned cres
// and left bufLeft unconverted input bytes should be treated as a failure.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// cast the address of an input pointer to the type of the second iconv()
// argument; ICONV_CONST is not defined in this part of the file, presumably
// it comes from the build configuration -- TODO confirm
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// value against which iconv_t handles are compared to detect invalid ones
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP swaps the bytes of a single wchar_t and WC_ENC is the font
// encoding used to look up the charset names for wchar_t; both depend on
// the size of wchar_t
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
2103 
2104 // ----------------------------------------------------------------------------
2105 // wxMBConv_iconv: encapsulates an iconv character set
2106 // ----------------------------------------------------------------------------
2107 
// Conversion between a multibyte charset and wchar_t implemented using
// iconv(): the object keeps one iconv_t handle per conversion direction,
// both opened by the constructor and closed by the destructor.
class wxMBConv_iconv : public wxMBConv
{
public:
    // try to open the conversions for the charset with the given iconv name,
    // call IsOk() to check whether both directions are available
    wxMBConv_iconv(const char *name);
    virtual ~wxMBConv_iconv();

    // implement base class virtual methods
    virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
                           const char *src, size_t srcLen = wxNO_LEN) const;
    virtual size_t FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen = wxNO_LEN) const;
    virtual size_t GetMBNulLen() const;

#if wxUSE_UNICODE_UTF8
    // true if the charset name is one of the UTF-8 spellings
    virtual bool IsUTF8() const;
#endif

    // create a new converter for the same charset name (opening its own
    // iconv handles) and copy the cached NUL length to it
    virtual wxMBConv *Clone() const
    {
        wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
        p->m_minMBCharWidth = m_minMBCharWidth;
        return p;
    }

    // true only if the handles for both conversion directions are valid
    bool IsOk() const
        { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }

protected:
    // the iconv handlers used to translate from multibyte
    // to wide char and in the other direction
    iconv_t m2w,
            w2m;

#if wxUSE_THREADS
    // guards access to m2w and w2m objects
    wxMutex m_iconvMutex;
#endif

private:
    // the name (for iconv_open()) of a wide char charset -- if none is
    // available on this machine, it will remain empty
    static wxString ms_wcCharsetName;

    // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
    // different endian-ness than the native one
    static bool ms_wcNeedsSwap;


    // name of the encoding handled by this conversion: this is a copy made
    // with wxStrdup() in the constructor and released with free() in the
    // destructor
    const char *m_name;

    // cached result of GetMBNulLen(); set to 0 meaning "unknown"
    // initially
    size_t m_minMBCharWidth;
};
2163 
2164 // make the constructor available for unit testing
new_wxMBConv_iconv(const char * name)2165 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2166 {
2167     wxMBConv_iconv* result = new wxMBConv_iconv( name );
2168     if ( !result->IsOk() )
2169     {
2170         delete result;
2171         return 0;
2172     }
2173 
2174     return result;
2175 }
2176 
2177 wxString wxMBConv_iconv::ms_wcCharsetName;
2178 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2179 
wxMBConv_iconv(const char * name)2180 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2181               : m_name(wxStrdup(name))
2182 {
2183     m_minMBCharWidth = 0;
2184 
2185     // check for charset that represents wchar_t:
2186     if ( ms_wcCharsetName.empty() )
2187     {
2188         wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
2189 
2190 #if wxUSE_FONTMAP
2191         const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2192 #else // !wxUSE_FONTMAP
2193         static const wxChar *const names_static[] =
2194         {
2195 #if SIZEOF_WCHAR_T == 4
2196             wxT("UCS-4"),
2197 #elif SIZEOF_WCHAR_T == 2
2198             wxT("UCS-2"),
2199 #endif
2200             NULL
2201         };
2202         const wxChar *const *names = names_static;
2203 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2204 
2205         for ( ; *names && ms_wcCharsetName.empty(); ++names )
2206         {
2207             const wxString nameCS(*names);
2208 
2209             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2210             wxString nameXE(nameCS);
2211 
2212 #ifdef WORDS_BIGENDIAN
2213                 nameXE += wxT("BE");
2214 #else // little endian
2215                 nameXE += wxT("LE");
2216 #endif
2217 
2218             wxLogTrace(TRACE_STRCONV, wxT("  trying charset \"%s\""),
2219                        nameXE.c_str());
2220 
2221             m2w = iconv_open(nameXE.ToAscii(), name);
2222             if ( m2w == ICONV_T_INVALID )
2223             {
2224                 // try charset w/o bytesex info (e.g. "UCS4")
2225                 wxLogTrace(TRACE_STRCONV, wxT("  trying charset \"%s\""),
2226                            nameCS.c_str());
2227                 m2w = iconv_open(nameCS.ToAscii(), name);
2228 
2229                 // and check for bytesex ourselves:
2230                 if ( m2w != ICONV_T_INVALID )
2231                 {
2232                     char    buf[2], *bufPtr;
2233                     wchar_t wbuf[2];
2234                     size_t  insz, outsz;
2235                     size_t  res;
2236 
2237                     buf[0] = 'A';
2238                     buf[1] = 0;
2239                     wbuf[0] = 0;
2240                     insz = 2;
2241                     outsz = SIZEOF_WCHAR_T * 2;
2242                     char* wbufPtr = (char*)wbuf;
2243                     bufPtr = buf;
2244 
2245                     res = iconv(
2246                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2247                         &wbufPtr, &outsz);
2248 
2249                     if (ICONV_FAILED(res, insz))
2250                     {
2251                         wxLogLastError(wxT("iconv"));
2252                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2253                                    nameCS.c_str());
2254                     }
2255                     else // ok, can convert to this encoding, remember it
2256                     {
2257                         ms_wcCharsetName = nameCS;
2258                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2259                     }
2260                 }
2261             }
2262             else // use charset not requiring byte swapping
2263             {
2264                 ms_wcCharsetName = nameXE;
2265             }
2266         }
2267 
2268         wxLogTrace(TRACE_STRCONV,
2269                    wxT("iconv wchar_t charset is \"%s\"%s"),
2270                    ms_wcCharsetName.empty() ? wxString("<none>")
2271                                             : ms_wcCharsetName,
2272                    ms_wcNeedsSwap ? wxT(" (needs swap)")
2273                                   : wxT(""));
2274     }
2275     else // we already have ms_wcCharsetName
2276     {
2277         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2278     }
2279 
2280     if ( ms_wcCharsetName.empty() )
2281     {
2282         w2m = ICONV_T_INVALID;
2283     }
2284     else
2285     {
2286         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2287         if ( w2m == ICONV_T_INVALID )
2288         {
2289             wxLogTrace(TRACE_STRCONV,
2290                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2291                        ms_wcCharsetName.c_str(), name);
2292         }
2293     }
2294 }
2295 
~wxMBConv_iconv()2296 wxMBConv_iconv::~wxMBConv_iconv()
2297 {
2298     free(const_cast<char *>(m_name));
2299 
2300     if ( m2w != ICONV_T_INVALID )
2301         iconv_close(m2w);
2302     if ( w2m != ICONV_T_INVALID )
2303         iconv_close(w2m);
2304 }
2305 
2306 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const2307 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2308                         const char *src, size_t srcLen) const
2309 {
2310     if ( srcLen == wxNO_LEN )
2311     {
2312         // find the string length: notice that must be done differently for
2313         // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2314         // consecutive NULs
2315         const size_t nulLen = GetMBNulLen();
2316         switch ( nulLen )
2317         {
2318             default:
2319                 return wxCONV_FAILED;
2320 
2321             case 1:
2322                 srcLen = strlen(src); // arguably more optimized than our version
2323                 break;
2324 
2325             case 2:
2326             case 4:
2327                 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2328                 // but they also have to start at character boundary and not
2329                 // span two adjacent characters
2330                 const char *p;
2331                 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2332                     ;
2333                 srcLen = p - src;
2334                 break;
2335         }
2336 
2337         // when we're determining the length of the string ourselves we count
2338         // the terminating NUL(s) as part of it and always NUL-terminate the
2339         // output
2340         srcLen += nulLen;
2341     }
2342 
2343     // we express length in the number of (wide) characters but iconv always
2344     // counts buffer sizes it in bytes
2345     dstLen *= SIZEOF_WCHAR_T;
2346 
2347 #if wxUSE_THREADS
2348     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2349     //     Unfortunately there are a couple of global wxCSConv objects such as
2350     //     wxConvLocal that are used all over wx code, so we have to make sure
2351     //     the handle is used by at most one thread at the time. Otherwise
2352     //     only a few wx classes would be safe to use from non-main threads
2353     //     as MB<->WC conversion would fail "randomly".
2354     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2355 #endif // wxUSE_THREADS
2356 
2357     size_t res, cres;
2358     const char *pszPtr = src;
2359 
2360     if ( dst )
2361     {
2362         char* bufPtr = (char*)dst;
2363 
2364         // have destination buffer, convert there
2365         size_t dstLenOrig = dstLen;
2366         cres = iconv(m2w,
2367                      ICONV_CHAR_CAST(&pszPtr), &srcLen,
2368                      &bufPtr, &dstLen);
2369 
2370         // convert the number of bytes converted as returned by iconv to the
2371         // number of (wide) characters converted that we need
2372         res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2373 
2374         if (ms_wcNeedsSwap)
2375         {
2376             // convert to native endianness
2377             for ( unsigned i = 0; i < res; i++ )
2378                 dst[i] = WC_BSWAP(dst[i]);
2379         }
2380     }
2381     else // no destination buffer
2382     {
2383         // convert using temp buffer to calculate the size of the buffer needed
2384         wchar_t tbuf[256];
2385         res = 0;
2386 
2387         do
2388         {
2389             char* bufPtr = (char*)tbuf;
2390             dstLen = 8 * SIZEOF_WCHAR_T;
2391 
2392             cres = iconv(m2w,
2393                          ICONV_CHAR_CAST(&pszPtr), &srcLen,
2394                          &bufPtr, &dstLen );
2395 
2396             res += 8 - (dstLen / SIZEOF_WCHAR_T);
2397         }
2398         while ((cres == (size_t)-1) && (errno == E2BIG));
2399     }
2400 
2401     if (ICONV_FAILED(cres, srcLen))
2402     {
2403         //VS: it is ok if iconv fails, hence trace only
2404         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2405         return wxCONV_FAILED;
2406     }
2407 
2408     return res;
2409 }
2410 
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const2411 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2412                                  const wchar_t *src, size_t srcLen) const
2413 {
2414 #if wxUSE_THREADS
2415     // NB: explained in MB2WC
2416     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2417 #endif
2418 
2419     if ( srcLen == wxNO_LEN )
2420         srcLen = wxWcslen(src) + 1;
2421 
2422     size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2423     size_t outbuflen = dstLen;
2424     size_t res, cres;
2425 
2426     wchar_t *tmpbuf = 0;
2427 
2428     if (ms_wcNeedsSwap)
2429     {
2430         // need to copy to temp buffer to switch endianness
2431         // (doing WC_BSWAP twice on the original buffer won't work, as it
2432         //  could be in read-only memory, or be accessed in some other thread)
2433         tmpbuf = (wchar_t *)malloc(inbuflen);
2434         for ( size_t i = 0; i < srcLen; i++ )
2435             tmpbuf[i] = WC_BSWAP(src[i]);
2436 
2437         src = tmpbuf;
2438     }
2439 
2440     char* inbuf = (char*)src;
2441     if ( dst )
2442     {
2443         // have destination buffer, convert there
2444         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2445 
2446         res = dstLen - outbuflen;
2447     }
2448     else // no destination buffer
2449     {
2450         // convert using temp buffer to calculate the size of the buffer needed
2451         char tbuf[256];
2452         res = 0;
2453         do
2454         {
2455             dst = tbuf;
2456             outbuflen = WXSIZEOF(tbuf);
2457 
2458             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2459 
2460             res += WXSIZEOF(tbuf) - outbuflen;
2461         }
2462         while ((cres == (size_t)-1) && (errno == E2BIG));
2463     }
2464 
2465     if (ms_wcNeedsSwap)
2466     {
2467         free(tmpbuf);
2468     }
2469 
2470     if (ICONV_FAILED(cres, inbuflen))
2471     {
2472         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2473         return wxCONV_FAILED;
2474     }
2475 
2476     return res;
2477 }
2478 
GetMBNulLen() const2479 size_t wxMBConv_iconv::GetMBNulLen() const
2480 {
2481     if ( m_minMBCharWidth == 0 )
2482     {
2483         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2484 
2485 #if wxUSE_THREADS
2486         // NB: explained in MB2WC
2487         wxMutexLocker lock(self->m_iconvMutex);
2488 #endif
2489 
2490         const wchar_t *wnul = L"";
2491         char buf[8]; // should be enough for NUL in any encoding
2492         size_t inLen = sizeof(wchar_t),
2493                outLen = WXSIZEOF(buf);
2494         char *inBuff = (char *)wnul;
2495         char *outBuff = buf;
2496         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2497         {
2498             self->m_minMBCharWidth = (size_t)-1;
2499         }
2500         else // ok
2501         {
2502             self->m_minMBCharWidth = outBuff - buf;
2503         }
2504     }
2505 
2506     return m_minMBCharWidth;
2507 }
2508 
2509 #if wxUSE_UNICODE_UTF8
IsUTF8() const2510 bool wxMBConv_iconv::IsUTF8() const
2511 {
2512     return wxStricmp(m_name, "UTF-8") == 0 ||
2513            wxStricmp(m_name, "UTF8") == 0;
2514 }
2515 #endif
2516 
2517 #endif // HAVE_ICONV
2518 
2519 
2520 // ============================================================================
2521 // Win32 conversion classes
2522 // ============================================================================
2523 
2524 #ifdef wxHAVE_WIN32_MB2WC
2525 
2526 // from utils.cpp
2527 #if wxUSE_FONTMAP
2528 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2529 #endif
2530 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2531 
2532 class wxMBConv_win32 : public wxMBConv
2533 {
2534 public:
wxMBConv_win32()2535     wxMBConv_win32()
2536     {
2537         m_CodePage = CP_ACP;
2538         m_minMBCharWidth = 0;
2539     }
2540 
wxMBConv_win32(const wxMBConv_win32 & conv)2541     wxMBConv_win32(const wxMBConv_win32& conv)
2542         : wxMBConv()
2543     {
2544         m_CodePage = conv.m_CodePage;
2545         m_minMBCharWidth = conv.m_minMBCharWidth;
2546     }
2547 
2548 #if wxUSE_FONTMAP
wxMBConv_win32(const char * name)2549     wxMBConv_win32(const char* name)
2550     {
2551         m_CodePage = wxCharsetToCodepage(name);
2552         m_minMBCharWidth = 0;
2553     }
2554 #endif // wxUSE_FONTMAP
2555 
wxMBConv_win32(wxFontEncoding encoding)2556     wxMBConv_win32(wxFontEncoding encoding)
2557     {
2558         m_CodePage = wxEncodingToCodepage(encoding);
2559         m_minMBCharWidth = 0;
2560     }
2561 
MB2WC(wchar_t * buf,const char * psz,size_t n) const2562     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2563     {
2564         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2565         // the behaviour is not compatible with the Unix version (using iconv)
2566         // and break the library itself, e.g. wxTextInputStream::NextChar()
2567         // wouldn't work if reading an incomplete MB char didn't result in an
2568         // error
2569         //
2570         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2571         // Win XP or newer and it is not supported for UTF-[78] so we always
2572         // use our own conversions in this case. See
2573         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2574         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2575         if ( m_CodePage == CP_UTF8 )
2576         {
2577             return wxMBConvUTF8().MB2WC(buf, psz, n);
2578         }
2579 
2580         if ( m_CodePage == CP_UTF7 )
2581         {
2582             return wxMBConvUTF7().MB2WC(buf, psz, n);
2583         }
2584 
2585         int flags = 0;
2586         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2587                 IsAtLeastWin2kSP4() )
2588         {
2589             flags = MB_ERR_INVALID_CHARS;
2590         }
2591 
2592         const size_t len = ::MultiByteToWideChar
2593                              (
2594                                 m_CodePage,     // code page
2595                                 flags,          // flags: fall on error
2596                                 psz,            // input string
2597                                 -1,             // its length (NUL-terminated)
2598                                 buf,            // output string
2599                                 buf ? n : 0     // size of output buffer
2600                              );
2601         if ( !len )
2602         {
2603             // function totally failed
2604             return wxCONV_FAILED;
2605         }
2606 
2607         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2608         // check if we succeeded, by doing a double trip:
2609         if ( !flags && buf )
2610         {
2611             const size_t mbLen = strlen(psz);
2612             wxCharBuffer mbBuf(mbLen);
2613             if ( ::WideCharToMultiByte
2614                    (
2615                       m_CodePage,
2616                       0,
2617                       buf,
2618                       -1,
2619                       mbBuf.data(),
2620                       mbLen + 1,        // size in bytes, not length
2621                       NULL,
2622                       NULL
2623                    ) == 0 ||
2624                   strcmp(mbBuf, psz) != 0 )
2625             {
2626                 // we didn't obtain the same thing we started from, hence
2627                 // the conversion was lossy and we consider that it failed
2628                 return wxCONV_FAILED;
2629             }
2630         }
2631 
2632         // note that it returns count of written chars for buf != NULL and size
2633         // of the needed buffer for buf == NULL so in either case the length of
2634         // the string (which never includes the terminating NUL) is one less
2635         return len - 1;
2636     }
2637 
    // Convert the NUL-terminated wide string pwz to this code page.
    //
    // If buf is NULL only the length is computed, otherwise at most n bytes
    // are stored in it. Returns the length of the result, not counting the
    // terminating NUL, or wxCONV_FAILED if the conversion failed or if it
    // was detected to be lossy.
    virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
    {
        /*
            we have a problem here: by default, WideCharToMultiByte() may
            replace characters unrepresentable in the target code page with bad
            quality approximations such as turning "1/2" symbol (U+00BD) into
            "1" for the code pages which don't have it and we, obviously, want
            to avoid this at any price

            the trouble is that this function does it _silently_, i.e. it won't
            even tell us whether it did or not... Win98/2000 and higher provide
            WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
            we have to resort to a round trip, i.e. check that converting back
            results in the same string -- this is, of course, expensive but
            otherwise we simply can't be sure to not garble the data.
         */

        // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
        // it doesn't work with CJK encodings (which we test for rather roughly
        // here...) nor with UTF-7/8 nor, of course, with Windows versions not
        // supporting it
        BOOL usedDef wxDUMMY_INITIALIZE(false);
        BOOL *pUsedDef;
        int flags;
        if ( CanUseNoBestFit() && m_CodePage < 50000 )
        {
            // it's our lucky day
            flags = WC_NO_BEST_FIT_CHARS;
            pUsedDef = &usedDef;
        }
        else // old system or unsupported encoding
        {
            flags = 0;
            pUsedDef = NULL;
        }

        const size_t len = ::WideCharToMultiByte
                             (
                                m_CodePage,     // code page
                                flags,          // either none or no best fit
                                pwz,            // input string
                                -1,             // it is (wide) NUL-terminated
                                buf,            // output buffer
                                buf ? n : 0,    // and its size
                                NULL,           // default "replacement" char
                                pUsedDef        // [out] was it used?
                             );

        if ( !len )
        {
            // function totally failed
            return wxCONV_FAILED;
        }

        // we did something, check if we really succeeded
        if ( flags )
        {
            // check if the conversion failed, i.e. if any replacements
            // were done
            if ( usedDef )
                return wxCONV_FAILED;
        }
        else // we must resort to double tripping...
        {
            // first we need to ensure that we really have the MB data: this is
            // not the case if we're called with NULL buffer, in which case we
            // need to do the conversion yet again
            wxCharBuffer bufDef;
            if ( !buf )
            {
                // from here on buf points into bufDef, a temporary buffer
                // local to this block
                bufDef = wxCharBuffer(len);
                buf = bufDef.data();
                if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
                                            buf, len, NULL, NULL) )
                    return wxCONV_FAILED;
            }

            // n is reused as the size of the wide buffer for the conversion
            // back. NOTE(review): when the caller passes a NULL buf together
            // with a non-zero n, n isn't derived from the input length here
            // -- confirm it is always big enough for the round trip
            if ( !n )
                n = wcslen(pwz);
            wxWCharBuffer wcBuf(n);
            if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
                    wcscmp(wcBuf, pwz) != 0 )
            {
                // we didn't obtain the same thing we started from, hence
                // the conversion was lossy and we consider that it failed
                return wxCONV_FAILED;
            }
        }

        // as for the converse conversion, WideCharToMultiByte() counts the
        // terminating NUL both when converting and when only computing the
        // size, hence "len - 1" for the string length
        return len - 1;
    }
2730 
GetMBNulLen() const2731     virtual size_t GetMBNulLen() const
2732     {
2733         if ( m_minMBCharWidth == 0 )
2734         {
2735             int len = ::WideCharToMultiByte
2736                         (
2737                             m_CodePage,     // code page
2738                             0,              // no flags
2739                             L"",            // input string
2740                             1,              // translate just the NUL
2741                             NULL,           // output buffer
2742                             0,              // and its size
2743                             NULL,           // no replacement char
2744                             NULL            // [out] don't care if it was used
2745                         );
2746 
2747             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2748             switch ( len )
2749             {
2750                 default:
2751                     wxLogDebug(wxT("Unexpected NUL length %d"), len);
2752                     self->m_minMBCharWidth = (size_t)-1;
2753                     break;
2754 
2755                 case 0:
2756                     self->m_minMBCharWidth = (size_t)-1;
2757                     break;
2758 
2759                 case 1:
2760                 case 2:
2761                 case 4:
2762                     self->m_minMBCharWidth = len;
2763                     break;
2764             }
2765         }
2766 
2767         return m_minMBCharWidth;
2768     }
2769 
Clone() const2770     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2771 
IsOk() const2772     bool IsOk() const { return m_CodePage != -1; }
2773 
2774 private:
CanUseNoBestFit()2775     static bool CanUseNoBestFit()
2776     {
2777         static int s_isWin98Or2k = -1;
2778 
2779         if ( s_isWin98Or2k == -1 )
2780         {
2781             int verMaj, verMin;
2782             switch ( wxGetOsVersion(&verMaj, &verMin) )
2783             {
2784                 case wxOS_WINDOWS_9X:
2785                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2786                     break;
2787 
2788                 case wxOS_WINDOWS_NT:
2789                     s_isWin98Or2k = verMaj >= 5;
2790                     break;
2791 
2792                 default:
2793                     // unknown: be conservative by default
2794                     s_isWin98Or2k = 0;
2795                     break;
2796             }
2797 
2798             wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
2799         }
2800 
2801         return s_isWin98Or2k == 1;
2802     }
2803 
IsAtLeastWin2kSP4()2804     static bool IsAtLeastWin2kSP4()
2805     {
2806 #ifdef __WXWINCE__
2807         return false;
2808 #else
2809         static int s_isAtLeastWin2kSP4 = -1;
2810 
2811         if ( s_isAtLeastWin2kSP4 == -1 )
2812         {
2813             OSVERSIONINFOEX ver;
2814 
2815             memset(&ver, 0, sizeof(ver));
2816             ver.dwOSVersionInfoSize = sizeof(ver);
2817             GetVersionEx((OSVERSIONINFO*)&ver);
2818 
2819             s_isAtLeastWin2kSP4 =
2820               ((ver.dwMajorVersion > 5) || // Vista+
2821                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2822                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2823                ver.wServicePackMajor >= 4)) // 2000 SP4+
2824               ? 1 : 0;
2825         }
2826 
2827         return s_isAtLeastWin2kSP4 == 1;
2828 #endif
2829     }
2830 
2831 
2832     // the code page we're working with
2833     long m_CodePage;
2834 
2835     // cached result of GetMBNulLen(), set to 0 initially meaning
2836     // "unknown"
2837     size_t m_minMBCharWidth;
2838 };
2839 
2840 #endif // wxHAVE_WIN32_MB2WC
2841 
2842 
2843 // ============================================================================
2844 // wxEncodingConverter based conversion classes
2845 // ============================================================================
2846 
2847 #if wxUSE_FONTMAP
2848 
2849 class wxMBConv_wxwin : public wxMBConv
2850 {
2851 private:
Init()2852     void Init()
2853     {
2854         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2855         // The wxMBConv_cf class does a better job.
2856         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2857                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2858                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2859     }
2860 
2861 public:
2862     // temporarily just use wxEncodingConverter stuff,
2863     // so that it works while a better implementation is built
wxMBConv_wxwin(const char * name)2864     wxMBConv_wxwin(const char* name)
2865     {
2866         if (name)
2867             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2868         else
2869             m_enc = wxFONTENCODING_SYSTEM;
2870 
2871         Init();
2872     }
2873 
wxMBConv_wxwin(wxFontEncoding enc)2874     wxMBConv_wxwin(wxFontEncoding enc)
2875     {
2876         m_enc = enc;
2877 
2878         Init();
2879     }
2880 
MB2WC(wchar_t * buf,const char * psz,size_t WXUNUSED (n)) const2881     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2882     {
2883         size_t inbuf = strlen(psz);
2884         if (buf)
2885         {
2886             if (!m2w.Convert(psz, buf))
2887                 return wxCONV_FAILED;
2888         }
2889         return inbuf;
2890     }
2891 
WC2MB(char * buf,const wchar_t * psz,size_t WXUNUSED (n)) const2892     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2893     {
2894         const size_t inbuf = wxWcslen(psz);
2895         if (buf)
2896         {
2897             if (!w2m.Convert(psz, buf))
2898                 return wxCONV_FAILED;
2899         }
2900 
2901         return inbuf;
2902     }
2903 
GetMBNulLen() const2904     virtual size_t GetMBNulLen() const
2905     {
2906         switch ( m_enc )
2907         {
2908             case wxFONTENCODING_UTF16BE:
2909             case wxFONTENCODING_UTF16LE:
2910                 return 2;
2911 
2912             case wxFONTENCODING_UTF32BE:
2913             case wxFONTENCODING_UTF32LE:
2914                 return 4;
2915 
2916             default:
2917                 return 1;
2918         }
2919     }
2920 
Clone() const2921     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2922 
IsOk() const2923     bool IsOk() const { return m_ok; }
2924 
2925 public:
2926     wxFontEncoding m_enc;
2927     wxEncodingConverter m2w, w2m;
2928 
2929 private:
2930     // were we initialized successfully?
2931     bool m_ok;
2932 
2933     wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
2934 };
2935 
2936 // make the constructors available for unit testing
new_wxMBConv_wxwin(const char * name)2937 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2938 {
2939     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2940     if ( !result->IsOk() )
2941     {
2942         delete result;
2943         return 0;
2944     }
2945 
2946     return result;
2947 }
2948 
2949 #endif // wxUSE_FONTMAP
2950 
2951 // ============================================================================
2952 // wxCSConv implementation
2953 // ============================================================================
2954 
Init()2955 void wxCSConv::Init()
2956 {
2957     m_name = NULL;
2958     m_convReal =  NULL;
2959 }
2960 
SetEncoding(wxFontEncoding encoding)2961 void wxCSConv::SetEncoding(wxFontEncoding encoding)
2962 {
2963     switch ( encoding )
2964     {
2965         case wxFONTENCODING_MAX:
2966         case wxFONTENCODING_SYSTEM:
2967             if ( m_name )
2968             {
2969                 // It's ok to not have encoding value if we have a name for it.
2970                 m_encoding = wxFONTENCODING_SYSTEM;
2971             }
2972             else // No name neither.
2973             {
2974                 // Fall back to the system default encoding in this case (not
2975                 // sure how much sense does this make but this is how the old
2976                 // code used to behave).
2977 #if wxUSE_INTL
2978                 m_encoding = wxLocale::GetSystemEncoding();
2979                 if ( m_encoding == wxFONTENCODING_SYSTEM )
2980 #endif // wxUSE_INTL
2981                     m_encoding = wxFONTENCODING_ISO8859_1;
2982             }
2983             break;
2984 
2985         case wxFONTENCODING_DEFAULT:
2986             // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2987             m_encoding = wxFONTENCODING_ISO8859_1;
2988             break;
2989 
2990         default:
2991             // Just use the provided encoding.
2992             m_encoding = encoding;
2993     }
2994 }
2995 
wxCSConv(const wxString & charset)2996 wxCSConv::wxCSConv(const wxString& charset)
2997 {
2998     Init();
2999 
3000     if ( !charset.empty() )
3001     {
3002         SetName(charset.ToAscii());
3003     }
3004 
3005 #if wxUSE_FONTMAP
3006     SetEncoding(wxFontMapperBase::GetEncodingFromName(charset));
3007 #else
3008     SetEncoding(wxFONTENCODING_SYSTEM);
3009 #endif
3010 
3011     m_convReal = DoCreate();
3012 }
3013 
wxCSConv(wxFontEncoding encoding)3014 wxCSConv::wxCSConv(wxFontEncoding encoding)
3015 {
3016     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3017     {
3018         wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
3019 
3020         encoding = wxFONTENCODING_SYSTEM;
3021     }
3022 
3023     Init();
3024 
3025     SetEncoding(encoding);
3026 
3027     m_convReal = DoCreate();
3028 }
3029 
~wxCSConv()3030 wxCSConv::~wxCSConv()
3031 {
3032     Clear();
3033 }
3034 
// Copy constructor: duplicates the name and encoding of the other object and
// creates a new real converter for them instead of sharing the existing one.
wxCSConv::wxCSConv(const wxCSConv& conv)
        : wxMBConv()
{
    const char * const otherName = conv.m_name;
    const wxFontEncoding otherEncoding = conv.m_encoding;

    Init();

    // the name must be set first as SetEncoding() checks for its presence
    SetName(otherName);
    SetEncoding(otherEncoding);

    m_convReal = DoCreate();
}
3045 
// Assignment operator: takes over the name and encoding of the other object
// and creates a new real converter for them.
//
// Returns a reference to this object.
wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
{
    // Self-assignment must be a no-op: Clear() frees and resets m_name, so
    // without this check we would then copy the name from ourselves after
    // already having lost it.
    if ( this != &conv )
    {
        Clear();

        // the name must be set first as SetEncoding() checks for its presence
        SetName(conv.m_name);
        SetEncoding(conv.m_encoding);

        m_convReal = DoCreate();
    }

    return *this;
}
3057 
Clear()3058 void wxCSConv::Clear()
3059 {
3060     free(m_name);
3061     m_name = NULL;
3062 
3063     wxDELETE(m_convReal);
3064 }
3065 
SetName(const char * charset)3066 void wxCSConv::SetName(const char *charset)
3067 {
3068     if ( charset )
3069         m_name = wxStrdup(charset);
3070 }
3071 
#if wxUSE_FONTMAP

// Hash map type with wxFontEncoding keys and wxString values.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// Cache used by wxCSConv::DoCreate() in iconv builds: for an encoding it
// stores the name for which an iconv converter could be created or an empty
// string if this failed for all the names of this encoding.
static wxEncodingNameCache gs_nameCache;
#endif
3079 
DoCreate() const3080 wxMBConv *wxCSConv::DoCreate() const
3081 {
3082 #if wxUSE_FONTMAP
3083     wxLogTrace(TRACE_STRCONV,
3084                wxT("creating conversion for %s"),
3085                (m_name ? m_name
3086                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
3087 #endif // wxUSE_FONTMAP
3088 
3089     // check for the special case of ASCII or ISO8859-1 charset: as we have
3090     // special knowledge of it anyhow, we don't need to create a special
3091     // conversion object
3092     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3093     {
3094         // don't convert at all
3095         return NULL;
3096     }
3097 
3098     // we trust OS to do conversion better than we can so try external
3099     // conversion methods first
3100     //
3101     // the full order is:
3102     //      1. OS conversion (iconv() under Unix or Win32 API)
3103     //      2. hard coded conversions for UTF
3104     //      3. wxEncodingConverter as fall back
3105 
3106     // step (1)
3107 #ifdef HAVE_ICONV
3108 #if !wxUSE_FONTMAP
3109     if ( m_name )
3110 #endif // !wxUSE_FONTMAP
3111     {
3112 #if wxUSE_FONTMAP
3113         wxFontEncoding encoding(m_encoding);
3114 #endif
3115 
3116         if ( m_name )
3117         {
3118             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3119             if ( conv->IsOk() )
3120                 return conv;
3121 
3122             delete conv;
3123 
3124 #if wxUSE_FONTMAP
3125             encoding =
3126                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3127 #endif // wxUSE_FONTMAP
3128         }
3129 #if wxUSE_FONTMAP
3130         {
3131             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3132             if ( it != gs_nameCache.end() )
3133             {
3134                 if ( it->second.empty() )
3135                     return NULL;
3136 
3137                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3138                 if ( conv->IsOk() )
3139                     return conv;
3140 
3141                 delete conv;
3142             }
3143 
3144             const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
3145             // CS : in case this does not return valid names (eg for MacRoman)
3146             // encoding got a 'failure' entry in the cache all the same,
3147             // although it just has to be created using a different method, so
3148             // only store failed iconv creation attempts (or perhaps we
3149             // shoulnd't do this at all ?)
3150             if ( names[0] != NULL )
3151             {
3152                 for ( ; *names; ++names )
3153                 {
3154                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3155                     //             will need changes that will obsolete this
3156                     wxString name(*names);
3157                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3158                     if ( conv->IsOk() )
3159                     {
3160                         gs_nameCache[encoding] = *names;
3161                         return conv;
3162                     }
3163 
3164                     delete conv;
3165                 }
3166 
3167                 gs_nameCache[encoding] = wxT(""); // cache the failure
3168             }
3169         }
3170 #endif // wxUSE_FONTMAP
3171     }
3172 #endif // HAVE_ICONV
3173 
3174 #ifdef wxHAVE_WIN32_MB2WC
3175     {
3176 #if wxUSE_FONTMAP
3177         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3178                                       : new wxMBConv_win32(m_encoding);
3179 #else
3180         wxMBConv_win32* conv = new wxMBConv_win32(m_encoding);
3181 #endif
3182         if ( conv->IsOk() )
3183             return conv;
3184 
3185         delete conv;
3186     }
3187 #endif // wxHAVE_WIN32_MB2WC
3188 
3189 #ifdef __DARWIN__
3190     {
3191         // leave UTF16 and UTF32 to the built-ins of wx
3192         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3193             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3194         {
3195 #if wxUSE_FONTMAP
3196             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3197                                           : new wxMBConv_cf(m_encoding);
3198 #else
3199             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3200 #endif
3201 
3202             if ( conv->IsOk() )
3203                  return conv;
3204 
3205             delete conv;
3206         }
3207     }
3208 #endif // __DARWIN__
3209 
3210     // step (2)
3211     wxFontEncoding enc = m_encoding;
3212 #if wxUSE_FONTMAP
3213     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3214     {
3215         // use "false" to suppress interactive dialogs -- we can be called from
3216         // anywhere and popping up a dialog from here is the last thing we want to
3217         // do
3218         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3219     }
3220 #endif // wxUSE_FONTMAP
3221 
3222     switch ( enc )
3223     {
3224         case wxFONTENCODING_UTF7:
3225              return new wxMBConvUTF7;
3226 
3227         case wxFONTENCODING_UTF8:
3228              return new wxMBConvUTF8;
3229 
3230         case wxFONTENCODING_UTF16BE:
3231              return new wxMBConvUTF16BE;
3232 
3233         case wxFONTENCODING_UTF16LE:
3234              return new wxMBConvUTF16LE;
3235 
3236         case wxFONTENCODING_UTF32BE:
3237              return new wxMBConvUTF32BE;
3238 
3239         case wxFONTENCODING_UTF32LE:
3240              return new wxMBConvUTF32LE;
3241 
3242         default:
3243              // nothing to do but put here to suppress gcc warnings
3244              break;
3245     }
3246 
3247     // step (3)
3248 #if wxUSE_FONTMAP
3249     {
3250         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3251                                       : new wxMBConv_wxwin(m_encoding);
3252         if ( conv->IsOk() )
3253             return conv;
3254 
3255         delete conv;
3256     }
3257 
3258     wxLogTrace(TRACE_STRCONV,
3259                wxT("encoding \"%s\" is not supported by this system"),
3260                (m_name ? wxString(m_name)
3261                        : wxFontMapperBase::GetEncodingName(m_encoding)));
3262 #endif // wxUSE_FONTMAP
3263 
3264     return NULL;
3265 }
3266 
IsOk() const3267 bool wxCSConv::IsOk() const
3268 {
3269     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3270     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3271         return true; // always ok as we do it ourselves
3272 
3273     // m_convReal->IsOk() is called at its own creation, so we know it must
3274     // be ok if m_convReal is non-NULL
3275     return m_convReal != NULL;
3276 }
3277 
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const3278 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3279                          const char *src, size_t srcLen) const
3280 {
3281     if (m_convReal)
3282         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3283 
3284     // latin-1 (direct)
3285     if ( srcLen == wxNO_LEN )
3286         srcLen = strlen(src) + 1; // take trailing NUL too
3287 
3288     if ( dst )
3289     {
3290         if ( dstLen < srcLen )
3291             return wxCONV_FAILED;
3292 
3293         for ( size_t n = 0; n < srcLen; n++ )
3294             dst[n] = (unsigned char)(src[n]);
3295     }
3296 
3297     return srcLen;
3298 }
3299 
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const3300 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3301                            const wchar_t *src, size_t srcLen) const
3302 {
3303     if (m_convReal)
3304         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3305 
3306     // latin-1 (direct)
3307     if ( srcLen == wxNO_LEN )
3308         srcLen = wxWcslen(src) + 1;
3309 
3310     if ( dst )
3311     {
3312         if ( dstLen < srcLen )
3313             return wxCONV_FAILED;
3314 
3315         for ( size_t n = 0; n < srcLen; n++ )
3316         {
3317             if ( src[n] > 0xFF )
3318                 return wxCONV_FAILED;
3319 
3320             dst[n] = (char)src[n];
3321         }
3322 
3323     }
3324     else // still need to check the input validity
3325     {
3326         for ( size_t n = 0; n < srcLen; n++ )
3327         {
3328             if ( src[n] > 0xFF )
3329                 return wxCONV_FAILED;
3330         }
3331     }
3332 
3333     return srcLen;
3334 }
3335 
GetMBNulLen() const3336 size_t wxCSConv::GetMBNulLen() const
3337 {
3338     if ( m_convReal )
3339         return m_convReal->GetMBNulLen();
3340 
3341     // otherwise, we are ISO-8859-1
3342     return 1;
3343 }
3344 
#if wxUSE_UNICODE_UTF8
// Return true if the real converter says it is UTF-8; without a real
// converter we are ISO-8859-1 and so the answer is always false.
bool wxCSConv::IsUTF8() const
{
    return m_convReal && m_convReal->IsUTF8();
}
#endif
3355 
3356 
3357 #if wxUSE_UNICODE
3358 
wxSafeConvertMB2WX(const char * s)3359 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3360 {
3361     if ( !s )
3362         return wxWCharBuffer();
3363 
3364     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3365     if ( !wbuf )
3366         wbuf = wxMBConvUTF8().cMB2WX(s);
3367     if ( !wbuf )
3368         wbuf = wxConvISO8859_1.cMB2WX(s);
3369 
3370     return wbuf;
3371 }
3372 
wxSafeConvertWX2MB(const wchar_t * ws)3373 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3374 {
3375     if ( !ws )
3376         return wxCharBuffer();
3377 
3378     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3379     if ( !buf )
3380         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3381 
3382     return buf;
3383 }
3384 
3385 #endif // wxUSE_UNICODE
3386 
3387 // ----------------------------------------------------------------------------
3388 // globals
3389 // ----------------------------------------------------------------------------
3390 
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3397 
// These names are used as plain identifiers (and pasted into other
// identifiers) below, so any macros with these names must be undefined first.
// NOTE(review): presumably they are defined as macros in the header -- confirm.
#undef wxConvLibc
#undef wxConvUTF8
#undef wxConvUTF7
#undef wxConvLocal
#undef wxConvISO8859_1

// Define a global converter called "name". This expands to:
//
//  - the exported pointer name##Ptr of type klass*, initially NULL
//  - the exported function wxGet_##name##Ptr() returning the address of a
//    function-local static object of type impl_klass, constructed with
//    ctor_args
//  - a file-static pointer initialized by calling this function
//
// The macro must be followed by a semicolon where it is used.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as above for the common case when the class of the object is the same
// as the class used for the pointers to it.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3418 
#ifdef __INTELC__
    // disable warning "variable 'xxx' was declared but never referenced"
    #pragma warning(disable: 177)
#endif // Intel C++

// wxConvLibc is implemented by a different class depending on the platform
// (the middle branch is disabled by "#elif 0" and so is never compiled)
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#elif 0 // defined(__WXOSX__)
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc,  (wxFONTENCODING_UTF8));
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif

// NB: we can't use wxEMPTY_PARAMETER_VALUE as the final argument here because
//     it's passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so
//     still provokes an error message about "not enough macro parameters";
//     and we can't use "()" here as the name##Obj declaration would be parsed
//     as a function declaration then, so use a semicolon and live with an
//     extra empty statement (and hope that no compiler warns about this)
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);

WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// both of these pointers initially refer to the objects returned by the
// accessor functions defined above
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();

#ifdef __DARWIN__
// It is important to use this conversion object under Darwin as it ensures
// that Unicode strings are (re)composed correctly even though xnu kernel uses
// decomposed form internally (at least for the file names).
static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif

// the file name converter is the object above under Darwin and the same
// object as wxConvLibc elsewhere
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
                                    &wxConvMacUTF8DObj;
#else // !__DARWIN__
                                    wxGet_wxConvLibcPtr();
#endif // __DARWIN__/!__DARWIN__
3460