1 /////////////////////////////////////////////////////////////////////////////
2 // Name:        src/common/strconv.cpp
3 // Purpose:     Unicode conversion classes
4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 //              Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created:     29/01/98
8 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
9 //              (c) 2000-2003 Vadim Zeitlin
10 //              (c) 2004 Ryan Norton, Fredrik Roubert
11 // Licence:     wxWindows licence
12 /////////////////////////////////////////////////////////////////////////////
13 
14 // For compilers that support precompilation, includes "wx.h".
15 #include "wx/wxprec.h"
16 
17 #ifndef WX_PRECOMP
18     #include "wx/intl.h"
19     #include "wx/log.h"
20     #include "wx/utils.h"
21     #include "wx/hashmap.h"
22 #endif
23 
24 #include "wx/strconv.h"
25 
26 #include <errno.h>
27 
28 #include <ctype.h>
29 #include <string.h>
30 #include <stdlib.h>
31 
32 #if defined(__WIN32__)
33     #include "wx/msw/private.h"
34     #include "wx/msw/missing.h"
35     #define wxHAVE_WIN32_MB2WC
36 #endif
37 
38 #ifdef HAVE_ICONV
39     #include <iconv.h>
40     #include "wx/thread.h"
41 #endif
42 
43 #include "wx/encconv.h"
44 #include "wx/fontmap.h"
45 #include "wx/private/unicode.h"
46 
47 #ifdef __DARWIN__
48 #include "wx/osx/core/private/strconv_cf.h"
49 #endif //def __DARWIN__
50 
51 
52 #define TRACE_STRCONV wxT("strconv")
53 
54 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
55 // be 4 bytes
56 #if SIZEOF_WCHAR_T == 2
57     #define WC_UTF16
58 #endif
59 
60 
61 // ============================================================================
62 // implementation
63 // ============================================================================
64 
// Helper used by wxMBConv::ToWChar(): returns true if at least one of the n
// bytes starting at p is not NUL and false if all of them are NUL (which is
// trivially the case when n is 0, no byte is read then).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const stop = p + n; p != stop; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
73 
74 // ----------------------------------------------------------------------------
75 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
76 // ----------------------------------------------------------------------------
77 
encode_utf16(wxUint32 input,wxUint16 * output)78 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
79 {
80     if (wxUniChar::IsBMP(input))
81     {
82         if (output)
83             *output = (wxUint16) input;
84 
85         return 1;
86     }
87     else if (wxUniChar::IsSupplementary(input))
88     {
89         if (output)
90         {
91             *output++ = wxUniChar::HighSurrogate(input);
92             *output = wxUniChar::LowSurrogate(input);
93         }
94 
95         return 2;
96     }
97     else
98     {
99         return wxCONV_FAILED;
100     }
101 }
102 
103 // Returns the next UTF-32 character from the wchar_t buffer terminated by the
104 // "end" pointer (the caller must ensure that on input "*pSrc < end") and
105 // advances the pointer to the character after this one.
106 //
107 // If an invalid or incomplete character is found, *pSrc is set to NULL, the
108 // caller must check for this.
wxDecodeSurrogate(const wxChar16 ** pSrc,const wxChar16 * end)109 static wxUint32 wxDecodeSurrogate(const wxChar16 **pSrc, const wxChar16* end)
110 {
111     const wxChar16*& src = *pSrc;
112 
113     // Is this a BMP character?
114     const wxUint16 u = *src++;
115     if ((u < 0xd800) || (u > 0xdfff))
116     {
117         // Yes, just return it.
118         return u;
119     }
120 
121     // No, we have the first half of a surrogate, check if we also have the
122     // second half (notice that this check does nothing if end == NULL, as it
123     // is allowed to be, and this is correct).
124     if ( src == end )
125     {
126         // No, we don't because this is the end of input.
127         src = NULL;
128         return 0;
129     }
130 
131     const wxUint16 u2 = *src++;
132     if ( (u2 < 0xdc00) || (u2 > 0xdfff) )
133     {
134         // No, it's not in the low surrogate range.
135         src = NULL;
136         return 0;
137     }
138 
139     // Yes, decode it and return the corresponding Unicode character.
140     return ((u - 0xd7c0) << 10) + (u2 - 0xdc00);
141 }
142 
143 // ----------------------------------------------------------------------------
144 // wxMBConv
145 // ----------------------------------------------------------------------------
146 
// Default implementation of ToWChar() in terms of the legacy MB2WC().
//
// Parameters:
//  dst     output buffer or NULL to only compute the required length
//  dstLen  size of dst in wide characters, only checked if dst is not NULL
//  src     the multibyte input
//  srcLen  length of src in bytes or wxNO_LEN if src is NUL-terminated
//
// Returns the number of wide characters written (or which would be written
// if dst is NULL) or wxCONV_FAILED if the length of the terminating NUL can't
// be determined, if MB2WC() fails or if dst is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid to have to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it
    //
    // moreover, some conversion classes simply can't implement ToWChar()
    // directly, the primary example is wxConvLibc: mbstowcs() only handles
    // NUL-terminated strings

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string: it
            // holds the srcLen input bytes followed by nulLen NUL bytes
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        // notice that srcEnd doesn't include the terminator we may have just
        // appended to the copy
        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // the idea of this code is straightforward: it converts a NUL-terminated
    // chunk of the string during each iteration and updates the output buffer
    // with the result
    //
    // all the complications come from the fact that this function, for
    // historical reasons, must behave in 2 subtly different ways:
    //
    //  - when it's called for the entire NUL-terminated string (srcEnd ==
    //    NULL) there is exactly one chunk and its trailing NUL is counted in
    //    the return value right away
    //
    //  - when it's called with a fixed number of characters (srcEnd != NULL)
    //    the NUL terminating a chunk is only counted if it lies inside the
    //    input, which is only known after locating it, so it is accounted for
    //    at the end of the loop body
    for ( ;; )
    {
        // try to convert the current chunk: the first call only computes the
        // length, the real conversion is done below if we have the buffer
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        dstWritten += lenChunk;
        if ( !srcEnd )
            dstWritten++;

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // +1 is for trailing NUL
            if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
            if ( !srcEnd )
                dst++;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow (its trailing NUL was already counted above)
            break;
        }

        // advance the input pointer past the end of this chunk: notice that we
        // will always stop before srcEnd because we know that the chunk is
        // always properly NUL-terminated
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        // if the buffer ends before this NUL, we shouldn't count it in our
        // output so skip the code below
        if ( src == srcEnd )
            break;

        // do count this terminator as it's inside the buffer we convert
        dstWritten++;
        if ( dst )
            dst++;

        src += nulLen; // skip the terminator itself

        // nothing is left after the terminator, so there is no other chunk
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
278 
279 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const280 wxMBConv::FromWChar(char *dst, size_t dstLen,
281                     const wchar_t *src, size_t srcLen) const
282 {
283     // the number of chars [which would be] written to dst [if it were not NULL]
284     size_t dstWritten = 0;
285 
286     // if we don't know its length we have no choice but to assume that it is
287     // NUL-terminated (notice that it can still be NUL-terminated even if
288     // explicit length is given but it doesn't change our return value)
289     const bool isNulTerminated = srcLen == wxNO_LEN;
290 
291     // make a copy of the input string unless it is already properly
292     // NUL-terminated
293     wxWCharBuffer bufTmp;
294     if ( isNulTerminated )
295     {
296         srcLen = wxWcslen(src) + 1;
297     }
298     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
299     {
300         // make a copy in order to properly NUL-terminate the string
301         bufTmp = wxWCharBuffer(srcLen);
302         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
303         src = bufTmp;
304     }
305 
306     const size_t lenNul = GetMBNulLen();
307     for ( const wchar_t * const srcEnd = src + srcLen;
308           src < srcEnd;
309           src++ /* skip L'\0' too */ )
310     {
311         // try to convert the current chunk
312         size_t lenChunk = WC2MB(NULL, src, 0);
313         if ( lenChunk == wxCONV_FAILED )
314             return wxCONV_FAILED;
315 
316         dstWritten += lenChunk;
317 
318         const wchar_t * const
319             chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);
320 
321         // our return value accounts for the trailing NUL(s), unlike that of
322         // WC2MB(), however don't do it for the last NUL we artificially added
323         // ourselves above
324         if ( chunkEnd < srcEnd )
325             dstWritten += lenNul;
326 
327         if ( dst )
328         {
329             if ( dstWritten > dstLen )
330                 return wxCONV_FAILED;
331 
332             // if we know that there is enough space in the destination buffer
333             // (because we accounted for lenNul in dstWritten above), we can
334             // convert directly in place -- but otherwise we need another
335             // temporary buffer to ensure that we don't overwrite the output
336             wxCharBuffer dstBuf;
337             char *dstTmp;
338             if ( chunkEnd == srcEnd )
339             {
340                 dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
341                 dstTmp = dstBuf.data();
342             }
343             else
344             {
345                 dstTmp = dst;
346             }
347 
348             if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
349                 return wxCONV_FAILED;
350 
351             if ( dstTmp != dst )
352             {
353                 // copy everything up to but excluding the terminating NUL(s)
354                 // into the real output buffer
355                 memcpy(dst, dstTmp, lenChunk);
356 
357                 // micro-optimization: if dstTmp != dst it means that chunkEnd
358                 // == srcEnd and so we're done, no need to update anything below
359                 break;
360             }
361 
362             dst += lenChunk;
363             if ( chunkEnd < srcEnd )
364                 dst += lenNul;
365         }
366 
367         src = chunkEnd;
368     }
369 
370     return dstWritten;
371 }
372 
MB2WC(wchar_t * outBuff,const char * inBuff,size_t outLen) const373 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
374 {
375     size_t rc = ToWChar(outBuff, outLen, inBuff);
376     if ( rc != wxCONV_FAILED )
377     {
378         // ToWChar() returns the buffer length, i.e. including the trailing
379         // NUL, while this method doesn't take it into account
380         rc--;
381     }
382 
383     return rc;
384 }
385 
WC2MB(char * outBuff,const wchar_t * inBuff,size_t outLen) const386 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
387 {
388     size_t rc = FromWChar(outBuff, outLen, inBuff);
389     if ( rc != wxCONV_FAILED )
390     {
391         rc -= GetMBNulLen();
392     }
393 
394     return rc;
395 }
396 
397 wxWCharBuffer
cMB2WC(const char * inBuff,size_t inLen,size_t * outLen) const398 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
399 {
400     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
401     if ( dstLen != wxCONV_FAILED )
402     {
403         // notice that we allocate space for dstLen+1 wide characters here
404         // because we want the buffer to always be NUL-terminated, even if the
405         // input isn't (as otherwise the caller has no way to know its length)
406         wxWCharBuffer wbuf(dstLen);
407         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
408         {
409             if ( outLen )
410             {
411                 *outLen = dstLen;
412 
413                 // we also need to handle NUL-terminated input strings
414                 // specially: for them the output is the length of the string
415                 // excluding the trailing NUL, however if we're asked to
416                 // convert a specific number of characters we return the length
417                 // of the resulting output even if it's NUL-terminated
418                 if ( inLen == wxNO_LEN )
419                     (*outLen)--;
420             }
421 
422             return wbuf;
423         }
424     }
425 
426     if ( outLen )
427         *outLen = 0;
428 
429     return wxWCharBuffer();
430 }
431 
432 wxCharBuffer
cWC2MB(const wchar_t * inBuff,size_t inLen,size_t * outLen) const433 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
434 {
435     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
436     if ( dstLen != wxCONV_FAILED )
437     {
438         const size_t nulLen = GetMBNulLen();
439 
440         // as above, ensure that the buffer is always NUL-terminated, even if
441         // the input is not
442         wxCharBuffer buf(dstLen + nulLen - 1);
443         memset(buf.data() + dstLen, 0, nulLen);
444 
445         // Notice that return value of the call to FromWChar() here may be
446         // different from the one above as it could have overestimated the
447         // space needed, while what we get here is the exact length.
448         dstLen = FromWChar(buf.data(), dstLen, inBuff, inLen);
449         if ( dstLen != wxCONV_FAILED )
450         {
451             if ( outLen )
452             {
453                 *outLen = dstLen;
454 
455                 if ( inLen == wxNO_LEN )
456                 {
457                     // in this case both input and output are NUL-terminated
458                     // and we're not supposed to count NUL
459                     *outLen -= nulLen;
460                 }
461             }
462 
463             return buf;
464         }
465     }
466 
467     if ( outLen )
468         *outLen = 0;
469 
470     return wxCharBuffer();
471 }
472 
DoConvertMB2WC(const char * buf,size_t srcLen) const473 wxWCharBuffer wxMBConv::DoConvertMB2WC(const char* buf, size_t srcLen) const
474 {
475     // Notice that converting NULL pointer should work, i.e. return an empty
476     // buffer instead of crashing, so we need to check both the length and the
477     // pointer because length is wxNO_LEN if it's a raw pointer and doesn't
478     // come from wxScopedCharBuffer.
479     if ( srcLen && buf )
480     {
481         const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
482         if ( dstLen != wxCONV_FAILED )
483         {
484             wxWCharBuffer wbuf(dstLen);
485             wbuf.data()[dstLen] = L'\0';
486             if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
487             {
488                 // If the input string was NUL-terminated, we shouldn't include
489                 // the length of the trailing NUL into the length of the return
490                 // value.
491                 if ( srcLen == wxNO_LEN )
492                     wbuf.shrink(dstLen - 1);
493 
494                 return wbuf;
495             }
496         }
497     }
498 
499     return wxWCharBuffer();
500 }
501 
DoConvertWC2MB(const wchar_t * wbuf,size_t srcLen) const502 wxCharBuffer wxMBConv::DoConvertWC2MB(const wchar_t* wbuf, size_t srcLen) const
503 {
504     if ( srcLen && wbuf )
505     {
506         const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
507         if ( dstLen != wxCONV_FAILED )
508         {
509             wxCharBuffer buf(dstLen);
510             buf.data()[dstLen] = '\0';
511             if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
512             {
513                 // As above, in DoConvertMB2WC(), except that the length of the
514                 // trailing NUL is variable in this case.
515                 if ( srcLen == wxNO_LEN )
516                     buf.shrink(dstLen - GetMBNulLen());
517 
518                 return buf;
519             }
520         }
521     }
522 
523     return wxCharBuffer();
524 }
525 
526 // ----------------------------------------------------------------------------
527 // wxMBConvLibc
528 // ----------------------------------------------------------------------------
529 
MB2WC(wchar_t * buf,const char * psz,size_t n) const530 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
531 {
532     return wxMB2WC(buf, psz, n);
533 }
534 
WC2MB(char * buf,const wchar_t * psz,size_t n) const535 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
536 {
537     return wxWC2MB(buf, psz, n);
538 }
539 
540 // ----------------------------------------------------------------------------
541 // wxConvBrokenFileNames
542 // ----------------------------------------------------------------------------
543 
544 #ifdef __UNIX__
545 
wxConvBrokenFileNames(const wxString & charset)546 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
547 {
548     if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
549          wxStricmp(charset, wxT("UTF8")) == 0  )
550         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
551     else
552         m_conv = new wxCSConv(charset);
553 }
554 
555 #endif // __UNIX__
556 
557 // ----------------------------------------------------------------------------
558 // UTF-7
559 // ----------------------------------------------------------------------------
560 
561 // Implementation (C) 2004 Fredrik Roubert
562 //
563 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
564 
//
// BASE64 decoding table
//
// It is indexed by the byte value and contains the 6 bit value (0..63) of the
// bytes which are BASE64 characters ('A'-'Z', 'a'-'z', '0'-'9', '+' and '/')
// and 0xff for all the other ones.
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20..0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30..0x3f: '0'..'9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40..0x4f: 'A'..'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50..0x5f: 'P'..'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60..0x6f: 'a'..'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70..0x7f: 'p'..'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80..0xff: no 8 bit byte is a BASE64 character
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
603 
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const604 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
605                              const char *src, size_t srcLen) const
606 {
607     DecoderState stateOrig,
608                 *statePtr;
609     if ( srcLen == wxNO_LEN )
610     {
611         // convert the entire string, up to and including the trailing NUL
612         srcLen = strlen(src) + 1;
613 
614         // when working on the entire strings we don't update nor use the shift
615         // state from the previous call
616         statePtr = &stateOrig;
617     }
618     else // when working with partial strings we do use the shift state
619     {
620         statePtr = const_cast<DecoderState *>(&m_stateDecoder);
621 
622         // also save the old state to be able to rollback to it on error
623         stateOrig = m_stateDecoder;
624     }
625 
626     // but to simplify the code below we use this variable in both cases
627     DecoderState& state = *statePtr;
628 
629 
630     // number of characters [which would have been] written to dst [if it were
631     // not NULL]
632     size_t len = 0;
633 
634     const char * const srcEnd = src + srcLen;
635 
636     while ( (src < srcEnd) && (!dst || (len < dstLen)) )
637     {
638         const unsigned char cc = *src++;
639 
640         if ( state.IsShifted() )
641         {
642             const unsigned char dc = utf7unb64[cc];
643             if ( dc == 0xff )
644             {
645                 // end of encoded part, check that nothing was left: there can
646                 // be up to 4 bits of 0 padding but nothing else (we also need
647                 // to check isLSB as we count bits modulo 8 while a valid UTF-7
648                 // encoded sequence must contain an integral number of UTF-16
649                 // characters)
650                 if ( state.isLSB || state.bit > 4 ||
651                         (state.accum & ((1 << state.bit) - 1)) )
652                 {
653                     if ( !len )
654                         state = stateOrig;
655 
656                     return wxCONV_FAILED;
657                 }
658 
659                 state.ToDirect();
660 
661                 // re-parse this character normally below unless it's '-' which
662                 // is consumed by the decoder
663                 if ( cc == '-' )
664                     continue;
665             }
666             else // valid encoded character
667             {
668                 // mini base64 decoder: each character is 6 bits
669                 state.bit += 6;
670                 state.accum <<= 6;
671                 state.accum += dc;
672 
673                 if ( state.bit >= 8 )
674                 {
675                     // got the full byte, consume it
676                     state.bit -= 8;
677                     unsigned char b = (state.accum >> state.bit) & 0x00ff;
678 
679                     if ( state.isLSB )
680                     {
681                         // we've got the full word, output it
682                         if ( dst )
683                             *dst++ = (state.msb << 8) | b;
684                         len++;
685                         state.isLSB = false;
686                     }
687                     else // MSB
688                     {
689                         // just store it while we wait for LSB
690                         state.msb = b;
691                         state.isLSB = true;
692                     }
693                 }
694             }
695         }
696 
697         if ( state.IsDirect() )
698         {
699             // start of an encoded segment?
700             if ( cc == '+' )
701             {
702                 // Can't end with a plus sign.
703                 if ( src == srcEnd )
704                     return wxCONV_FAILED;
705 
706                 if ( *src == '-' )
707                 {
708                     // just the encoded plus sign, don't switch to shifted mode
709                     if ( dst )
710                         *dst++ = '+';
711                     len++;
712                     src++;
713                 }
714                 else if ( utf7unb64[(unsigned)*src] == 0xff )
715                 {
716                     // empty encoded chunks are not allowed
717                     if ( !len )
718                         state = stateOrig;
719 
720                     return wxCONV_FAILED;
721                 }
722                 else // base-64 encoded chunk follows
723                 {
724                     state.ToShifted();
725                 }
726             }
727             else // not '+'
728             {
729                 // only printable 7 bit ASCII characters (with the exception of
730                 // NUL, TAB, CR and LF) can be used directly
731                 if ( cc >= 0x7f || (cc < ' ' &&
732                       !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
733                     return wxCONV_FAILED;
734 
735                 if ( dst )
736                     *dst++ = cc;
737                 len++;
738             }
739         }
740     }
741 
742     if ( !len )
743     {
744         // as we didn't read any characters we should be called with the same
745         // data (followed by some more new data) again later so don't save our
746         // state
747         state = stateOrig;
748 
749         return wxCONV_FAILED;
750     }
751 
752     return len;
753 }
754 
//
// BASE64 encoding table
//
// Maps each 6 bit value (0..63) to the character representing it.
//
static const unsigned char utf7enb64[] =
{
    // 0..25: upper case letters
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',

    // 26..51: lower case letters
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',

    // 52..61: digits
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',

    // 62 and 63
    '+', '/'
};
769 
770 //
771 // UTF-7 encoding table
772 //
773 // 0 - Set D (directly encoded characters)
774 // 1 - Set O (optional direct characters)
775 // 2 - whitespace characters (optional)
776 // 3 - special characters
777 //
// the table is indexed by the 7 bit character code, one row per 16 codes
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,     // 0x00..0x0f
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     // 0x10..0x1f
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,     // 0x20..0x2f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,     // 0x30..0x3f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 0x40..0x4f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,     // 0x50..0x5f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 0x60..0x6f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3      // 0x70..0x7f
};

// Returns true if the character can be output as is in UTF-7, i.e. if it is
// a 7 bit character whose entry in the utf7encode table is 0.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t is a signed type on some platforms and a negative value would
    // pass the "< 0x80" test and then index the table out of bounds, so do
    // the range check on the value converted to an unsigned type: negative
    // values become huge and so are correctly rejected
    const unsigned long code = static_cast<unsigned long>(wc);

    return code < 0x80 && utf7encode[code] < 1;
}
794 
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const795 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
796                                const wchar_t *src, size_t srcLen) const
797 {
798     EncoderState stateOrig,
799                 *statePtr;
800     if ( srcLen == wxNO_LEN )
801     {
802         // we don't apply the stored state when operating on entire strings at
803         // once
804         statePtr = &stateOrig;
805 
806         srcLen = wxWcslen(src) + 1;
807     }
808     else // do use the mode we left the output in previously
809     {
810         stateOrig = m_stateEncoder;
811         statePtr = const_cast<EncoderState *>(&m_stateEncoder);
812     }
813 
814     EncoderState& state = *statePtr;
815 
816 
817     size_t len = 0;
818 
819     const wchar_t * const srcEnd = src + srcLen;
820     while ( src < srcEnd && (!dst || len < dstLen) )
821     {
822         wchar_t cc = *src++;
823         if ( wxIsUTF7Direct(cc) )
824         {
825             if ( state.IsShifted() )
826             {
827                 // pad with zeros the last encoded block if necessary
828                 if ( state.bit )
829                 {
830                     if ( dst )
831                         *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
832                     len++;
833                 }
834 
835                 state.ToDirect();
836 
837                 if ( dst )
838                     *dst++ = '-';
839                 len++;
840             }
841 
842             if ( dst )
843                 *dst++ = (char)cc;
844             len++;
845         }
846         else if ( cc == '+' && state.IsDirect() )
847         {
848             if ( dst )
849             {
850                 *dst++ = '+';
851                 *dst++ = '-';
852             }
853 
854             len += 2;
855         }
856 #ifndef WC_UTF16
857         else if (((wxUint32)cc) > 0xffff)
858         {
859             // no surrogate pair generation (yet?)
860             return wxCONV_FAILED;
861         }
862 #endif
863         else
864         {
865             if ( state.IsDirect() )
866             {
867                 state.ToShifted();
868 
869                 if ( dst )
870                     *dst++ = '+';
871                 len++;
872             }
873 
874             // BASE64 encode string
875             for ( ;; )
876             {
877                 for ( unsigned lsb = 0; lsb < 2; lsb++ )
878                 {
879                     state.accum <<= 8;
880                     state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
881 
882                     for (state.bit += 8; state.bit >= 6; )
883                     {
884                         state.bit -= 6;
885                         if ( dst )
886                             *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
887                         len++;
888                     }
889                 }
890 
891                 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
892                     break;
893 
894                 src++;
895             }
896         }
897     }
898 
899     // we need to restore the original encoder state if we were called just to
900     // calculate the amount of space needed as we will presumably be called
901     // again to really convert the data now
902     if ( !dst )
903         state = stateOrig;
904 
905     return len;
906 }
907 
908 // ----------------------------------------------------------------------------
909 // UTF-8
910 // ----------------------------------------------------------------------------
911 
912 static const wxUint32 utf8_max[]=
913     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
914 
915 // boundaries of the private use area we use to (temporarily) remap invalid
916 // characters invalid in a UTF-8 encoded string
917 const wxUint32 wxUnicodePUA = 0x100000;
918 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
919 
// Lookup table indexed by the first byte of a UTF-8 sequence and giving the
// total length of this sequence in bytes. A value of 0 means that the byte
// can't start a valid sequence at all.
extern const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: ASCII characters are encoded as themselves, using one byte
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: these values are invalid as the first byte
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0..CF: C0 and C1 are invalid too, C2..CF start two-byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // D0..DF: two-byte sequences
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..FF: F0..F4 start four-byte sequences, the rest is invalid again
    // (5- or 6-byte sequences and sequences for code points above U+10FFFF,
    // as restricted by RFC 3629)
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
954 
955 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const956 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
957                             const char *src, size_t srcLen) const
958 {
959     wchar_t *out = dstLen ? dst : NULL;
960     size_t written = 0;
961 
962     if ( srcLen == wxNO_LEN )
963         srcLen = strlen(src) + 1;
964 
965     for ( const char *p = src; ; p++ )
966     {
967         if ( (srcLen == wxNO_LEN ? !*p : !srcLen) )
968         {
969             // all done successfully, just add the trailing NULL if we are not
970             // using explicit length
971             if ( srcLen == wxNO_LEN )
972             {
973                 if ( out )
974                 {
975                     if ( !dstLen )
976                         break;
977 
978                     *out = L'\0';
979                 }
980 
981                 written++;
982             }
983 
984             return written;
985         }
986 
987         if ( out && !dstLen-- )
988             break;
989 
990         wxUint32 code;
991         unsigned char c = *p;
992 
993         if ( c < 0x80 )
994         {
995             if ( srcLen == 0 ) // the test works for wxNO_LEN too
996                 break;
997 
998             if ( srcLen != wxNO_LEN )
999                 srcLen--;
1000 
1001             code = c;
1002         }
1003         else
1004         {
1005             unsigned len = tableUtf8Lengths[c];
1006             if ( !len )
1007                 break;
1008 
1009             if ( srcLen < len ) // the test works for wxNO_LEN too
1010                 break;
1011 
1012             if ( srcLen != wxNO_LEN )
1013                 srcLen -= len;
1014 
1015             //   Char. number range   |        UTF-8 octet sequence
1016             //      (hexadecimal)     |              (binary)
1017             //  ----------------------+----------------------------------------
1018             //  0000 0000 - 0000 007F | 0xxxxxxx
1019             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1020             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1021             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1022             //
1023             //  Code point value is stored in bits marked with 'x',
1024             //  lowest-order bit of the value on the right side in the diagram
1025             //  above.                                         (from RFC 3629)
1026 
1027             // mask to extract lead byte's value ('x' bits above), by sequence
1028             // length:
1029             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1030 
1031             len--; // it's more convenient to work with 0-based length here
1032 
1033             code = c & leadValueMask[len];
1034 
1035             // all remaining bytes, if any, are handled in the same way
1036             // regardless of sequence's length:
1037             for ( ; len; --len )
1038             {
1039                 c = *++p;
1040                 if ( (c & 0xC0) != 0x80 )
1041                     return wxCONV_FAILED;
1042 
1043                 code <<= 6;
1044                 code |= c & 0x3F;
1045             }
1046         }
1047 
1048 #ifdef WC_UTF16
1049         // cast is ok because wchar_t == wxUint16 if WC_UTF16
1050         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1051         {
1052             if ( out )
1053                 out++;
1054             written++;
1055         }
1056 #else // !WC_UTF16
1057         if ( out )
1058             *out = code;
1059 #endif // WC_UTF16/!WC_UTF16
1060 
1061         if ( out )
1062             out++;
1063 
1064         written++;
1065     }
1066 
1067     return wxCONV_FAILED;
1068 }
1069 
1070 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1071 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1072                               const wchar_t *src, size_t srcLen) const
1073 {
1074     char *out = dstLen ? dst : NULL;
1075     size_t written = 0;
1076 
1077     const wchar_t* const end = srcLen == wxNO_LEN ? NULL : src + srcLen;
1078     for ( const wchar_t *wp = src; ; )
1079     {
1080         if ( end ? wp == end : !*wp )
1081         {
1082             // all done successfully, just add the trailing NULL if we are not
1083             // using explicit length
1084             if ( srcLen == wxNO_LEN )
1085             {
1086                 if ( out )
1087                 {
1088                     if ( !dstLen )
1089                         break;
1090 
1091                     *out = '\0';
1092                 }
1093 
1094                 written++;
1095             }
1096 
1097             return written;
1098         }
1099 
1100         wxUint32 code;
1101 #ifdef WC_UTF16
1102         code = wxDecodeSurrogate(&wp, end);
1103         if ( !wp )
1104             return wxCONV_FAILED;
1105 #else // wchar_t is UTF-32
1106         code = *wp++ & 0x7fffffff;
1107 #endif
1108 
1109         unsigned len;
1110         if ( code <= 0x7F )
1111         {
1112             len = 1;
1113             if ( out )
1114             {
1115                 if ( dstLen < len )
1116                     break;
1117 
1118                 out[0] = (char)code;
1119             }
1120         }
1121         else if ( code <= 0x07FF )
1122         {
1123             len = 2;
1124             if ( out )
1125             {
1126                 if ( dstLen < len )
1127                     break;
1128 
1129                 // NB: this line takes 6 least significant bits, encodes them as
1130                 // 10xxxxxx and discards them so that the next byte can be encoded:
1131                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1132                 out[0] = 0xC0 | code;
1133             }
1134         }
1135         else if ( code <= 0xFFFF )
1136         {
1137             len = 3;
1138             if ( out )
1139             {
1140                 if ( dstLen < len )
1141                     break;
1142 
1143                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1144                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1145                 out[0] = 0xE0 | code;
1146             }
1147         }
1148         else if ( code <= 0x10FFFF )
1149         {
1150             len = 4;
1151             if ( out )
1152             {
1153                 if ( dstLen < len )
1154                     break;
1155 
1156                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
1157                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1158                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1159                 out[0] = 0xF0 | code;
1160             }
1161         }
1162         else
1163         {
1164             wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1165             break;
1166         }
1167 
1168         if ( out )
1169         {
1170             out += len;
1171             dstLen -= len;
1172         }
1173 
1174         written += len;
1175     }
1176 
1177     // we only get here if an error occurs during decoding
1178     return wxCONV_FAILED;
1179 }
1180 
ToWChar(wchar_t * buf,size_t n,const char * psz,size_t srcLen) const1181 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1182                              const char *psz, size_t srcLen) const
1183 {
1184     if ( m_options == MAP_INVALID_UTF8_NOT )
1185         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1186 
1187     size_t len = 0;
1188 
1189     // The length can be either given explicitly or computed implicitly for the
1190     // NUL-terminated strings.
1191     const bool isNulTerminated = srcLen == wxNO_LEN;
1192     while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
1193     {
1194         const char *opsz = psz;
1195         unsigned char cc = *psz++, fc = cc;
1196         unsigned cnt;
1197         for (cnt = 0; fc & 0x80; cnt++)
1198             fc <<= 1;
1199 
1200         if (!cnt)
1201         {
1202             // plain ASCII char
1203             if (buf)
1204                 *buf++ = cc;
1205             len++;
1206 
1207             // escape the escape character for octal escapes
1208             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1209                     && cc == '\\' && (!buf || len < n))
1210             {
1211                 if (buf)
1212                     *buf++ = cc;
1213                 len++;
1214             }
1215         }
1216         else
1217         {
1218             bool invalid = false;
1219             cnt--;
1220             if (!cnt)
1221             {
1222                 // invalid UTF-8 sequence
1223                 invalid = true;
1224             }
1225             else
1226             {
1227                 unsigned ocnt = cnt - 1;
1228                 wxUint32 res = cc & (0x3f >> cnt);
1229                 while (cnt--)
1230                 {
1231                     if (!isNulTerminated && !srcLen)
1232                     {
1233                         // invalid UTF-8 sequence ending before the end of code
1234                         // point.
1235                         invalid = true;
1236                         break;
1237                     }
1238 
1239                     cc = *psz;
1240                     if ((cc & 0xC0) != 0x80)
1241                     {
1242                         // invalid UTF-8 sequence
1243                         invalid = true;
1244                         break;
1245                     }
1246 
1247                     psz++;
1248                     if (!isNulTerminated)
1249                         srcLen--;
1250                     res = (res << 6) | (cc & 0x3f);
1251                 }
1252 
1253                 if (invalid || res <= utf8_max[ocnt])
1254                 {
1255                     // illegal UTF-8 encoding
1256                     invalid = true;
1257                 }
1258                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1259                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1260                 {
1261                     // if one of our PUA characters turns up externally
1262                     // it must also be treated as an illegal sequence
1263                     // (a bit like you have to escape an escape character)
1264                     invalid = true;
1265                 }
1266                 else
1267                 {
1268 #ifdef WC_UTF16
1269                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1270                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1271                     if (pa == wxCONV_FAILED)
1272                     {
1273                         invalid = true;
1274                     }
1275                     else
1276                     {
1277                         if (buf)
1278                             buf += pa;
1279                         len += pa;
1280                     }
1281 #else // !WC_UTF16
1282                     if (buf)
1283                         *buf++ = (wchar_t)res;
1284                     len++;
1285 #endif // WC_UTF16/!WC_UTF16
1286                 }
1287             }
1288 
1289             if (invalid)
1290             {
1291                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1292                 {
1293                     while (opsz < psz && (!buf || len < n))
1294                     {
1295 #ifdef WC_UTF16
1296                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1297                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1298                         wxASSERT(pa != wxCONV_FAILED);
1299                         if (buf)
1300                             buf += pa;
1301                         opsz++;
1302                         len += pa;
1303 #else
1304                         if (buf)
1305                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1306                         opsz++;
1307                         len++;
1308 #endif
1309                     }
1310                 }
1311                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1312                 {
1313                     while (opsz < psz && (!buf || len < n))
1314                     {
1315                         if ( buf && len + 3 < n )
1316                         {
1317                             unsigned char on = *opsz;
1318                             *buf++ = L'\\';
1319                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1320                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1321                             *buf++ = (wchar_t)( L'0' + on % 010 );
1322                         }
1323 
1324                         opsz++;
1325                         len += 4;
1326                     }
1327                 }
1328                 else // MAP_INVALID_UTF8_NOT
1329                 {
1330                     return wxCONV_FAILED;
1331                 }
1332             }
1333         }
1334     }
1335 
1336     if ( isNulTerminated )
1337     {
1338         // Add the trailing NUL in this case if we have a large enough buffer.
1339         if ( buf && (len < n) )
1340             *buf = 0;
1341 
1342         // And count it in any case.
1343         len++;
1344     }
1345 
1346     return len;
1347 }
1348 
// Check whether the given wide character is an octal digit, i.e. is in the
// L'0'..L'7' range.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1353 
FromWChar(char * buf,size_t n,const wchar_t * psz,size_t srcLen) const1354 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1355                                const wchar_t *psz, size_t srcLen) const
1356 {
1357     if ( m_options == MAP_INVALID_UTF8_NOT )
1358         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1359 
1360     size_t len = 0;
1361 
1362     // The length can be either given explicitly or computed implicitly for the
1363     // NUL-terminated strings.
1364     const wchar_t* const end = srcLen == wxNO_LEN ? NULL : psz + srcLen;
1365     while ((end ? psz < end : *psz) && ((!buf) || (len < n)))
1366     {
1367         wxUint32 cc;
1368 
1369 #ifdef WC_UTF16
1370         cc = wxDecodeSurrogate(&psz, end);
1371         if ( !psz )
1372             return wxCONV_FAILED;
1373 #else
1374         cc = (*psz++) & 0x7fffffff;
1375 #endif
1376 
1377         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1378                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1379         {
1380             if (buf)
1381                 *buf++ = (char)(cc - wxUnicodePUA);
1382             len++;
1383         }
1384         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1385                     && cc == L'\\' && psz[0] == L'\\' )
1386         {
1387             if (buf)
1388                 *buf++ = (char)cc;
1389             psz++;
1390             len++;
1391         }
1392         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1393                     cc == L'\\' &&
1394                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1395         {
1396             if (buf)
1397             {
1398                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1399                                  (psz[1] - L'0') * 010 +
1400                                  (psz[2] - L'0'));
1401             }
1402 
1403             psz += 3;
1404             len++;
1405         }
1406         else
1407         {
1408             unsigned cnt;
1409             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1410             {
1411             }
1412 
1413             if (!cnt)
1414             {
1415                 // plain ASCII char
1416                 if (buf)
1417                     *buf++ = (char) cc;
1418                 len++;
1419             }
1420             else
1421             {
1422                 len += cnt + 1;
1423                 if (buf)
1424                 {
1425                     *buf++ = (char) ((~0x7fu >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1426                     while (cnt--)
1427                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1428                 }
1429             }
1430         }
1431     }
1432 
1433     if ( !end )
1434     {
1435         // Add the trailing NUL in this case if we have a large enough buffer.
1436         if ( buf && (len < n) )
1437             *buf = 0;
1438 
1439         // And count it in any case.
1440         len++;
1441     }
1442 
1443     return len;
1444 }
1445 
1446 // ============================================================================
1447 // UTF-16
1448 // ============================================================================
1449 
// "straight" is the converter for the UTF-16 variant selected when no byte
// swapping is needed and "swap" is the one for the other variant; which of
// the LE/BE classes plays which role depends on WORDS_BIGENDIAN
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
1457 
1458 /* static */
GetLength(const char * src,size_t srcLen)1459 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1460 {
1461     if ( srcLen == wxNO_LEN )
1462     {
1463         // count the number of bytes in input, including the trailing NULs
1464         const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1465         for ( srcLen = 1; *inBuff++; srcLen++ )
1466             ;
1467 
1468         srcLen *= BYTES_PER_CHAR;
1469     }
1470     else // we already have the length
1471     {
1472         // we can only convert an entire number of UTF-16 characters
1473         if ( srcLen % BYTES_PER_CHAR )
1474             return wxCONV_FAILED;
1475     }
1476 
1477     return srcLen;
1478 }
1479 
1480 // case when in-memory representation is UTF-16 too
1481 #ifdef WC_UTF16
1482 
1483 // ----------------------------------------------------------------------------
1484 // conversions without endianness change
1485 // ----------------------------------------------------------------------------
1486 
1487 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1488 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1489                                const char *src, size_t srcLen) const
1490 {
1491     // set up the scene for using memcpy() (which is presumably more efficient
1492     // than copying the bytes one by one)
1493     srcLen = GetLength(src, srcLen);
1494     if ( srcLen == wxNO_LEN )
1495         return wxCONV_FAILED;
1496 
1497     const size_t inLen = srcLen / BYTES_PER_CHAR;
1498     if ( dst )
1499     {
1500         if ( dstLen < inLen )
1501             return wxCONV_FAILED;
1502 
1503         memcpy(dst, src, srcLen);
1504     }
1505 
1506     return inLen;
1507 }
1508 
1509 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1510 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1511                                  const wchar_t *src, size_t srcLen) const
1512 {
1513     if ( srcLen == wxNO_LEN )
1514         srcLen = wxWcslen(src) + 1;
1515 
1516     srcLen *= BYTES_PER_CHAR;
1517 
1518     if ( dst )
1519     {
1520         if ( dstLen < srcLen )
1521             return wxCONV_FAILED;
1522 
1523         memcpy(dst, src, srcLen);
1524     }
1525 
1526     return srcLen;
1527 }
1528 
1529 // ----------------------------------------------------------------------------
1530 // endian-reversing conversions
1531 // ----------------------------------------------------------------------------
1532 
1533 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1534 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1535                            const char *src, size_t srcLen) const
1536 {
1537     srcLen = GetLength(src, srcLen);
1538     if ( srcLen == wxNO_LEN )
1539         return wxCONV_FAILED;
1540 
1541     srcLen /= BYTES_PER_CHAR;
1542 
1543     if ( dst )
1544     {
1545         if ( dstLen < srcLen )
1546             return wxCONV_FAILED;
1547 
1548         const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1549         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1550         {
1551             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1552         }
1553     }
1554 
1555     return srcLen;
1556 }
1557 
1558 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1559 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1560                              const wchar_t *src, size_t srcLen) const
1561 {
1562     if ( srcLen == wxNO_LEN )
1563         srcLen = wxWcslen(src) + 1;
1564 
1565     srcLen *= BYTES_PER_CHAR;
1566 
1567     if ( dst )
1568     {
1569         if ( dstLen < srcLen )
1570             return wxCONV_FAILED;
1571 
1572         wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1573         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1574         {
1575             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1576         }
1577     }
1578 
1579     return srcLen;
1580 }
1581 
1582 #else // !WC_UTF16: wchar_t is UTF-32
1583 
1584 // ----------------------------------------------------------------------------
1585 // conversions without endianness change
1586 // ----------------------------------------------------------------------------
1587 
1588 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1589 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1590                                const char *src, size_t srcLen) const
1591 {
1592     srcLen = GetLength(src, srcLen);
1593     if ( srcLen == wxNO_LEN )
1594         return wxCONV_FAILED;
1595 
1596     const size_t inLen = srcLen / BYTES_PER_CHAR;
1597     size_t outLen = 0;
1598     const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1599     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1600     {
1601         const wxUint32 ch = wxDecodeSurrogate(&inBuff, inEnd);
1602         if ( !inBuff )
1603             return wxCONV_FAILED;
1604 
1605         outLen++;
1606 
1607         if ( dst )
1608         {
1609             if ( outLen > dstLen )
1610                 return wxCONV_FAILED;
1611 
1612             *dst++ = ch;
1613         }
1614     }
1615 
1616 
1617     return outLen;
1618 }
1619 
1620 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1621 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1622                                  const wchar_t *src, size_t srcLen) const
1623 {
1624     if ( srcLen == wxNO_LEN )
1625         srcLen = wxWcslen(src) + 1;
1626 
1627     size_t outLen = 0;
1628     wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1629     for ( size_t n = 0; n < srcLen; n++ )
1630     {
1631         wxUint16 cc[2] = { 0 };
1632         const size_t numChars = encode_utf16(*src++, cc);
1633         if ( numChars == wxCONV_FAILED )
1634             return wxCONV_FAILED;
1635 
1636         outLen += numChars * BYTES_PER_CHAR;
1637         if ( outBuff )
1638         {
1639             if ( outLen > dstLen )
1640                 return wxCONV_FAILED;
1641 
1642             *outBuff++ = cc[0];
1643             if ( numChars == 2 )
1644             {
1645                 // second character of a surrogate
1646                 *outBuff++ = cc[1];
1647             }
1648         }
1649     }
1650 
1651     return outLen;
1652 }
1653 
1654 // ----------------------------------------------------------------------------
1655 // endian-reversing conversions
1656 // ----------------------------------------------------------------------------
1657 
1658 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1659 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1660                            const char *src, size_t srcLen) const
1661 {
1662     srcLen = GetLength(src, srcLen);
1663     if ( srcLen == wxNO_LEN )
1664         return wxCONV_FAILED;
1665 
1666     const size_t inLen = srcLen / BYTES_PER_CHAR;
1667     size_t outLen = 0;
1668     const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1669     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1670     {
1671         wxUint16 tmp[2];
1672         const wxUint16* tmpEnd = tmp;
1673 
1674         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1675         tmpEnd++;
1676 
1677         if ( inBuff + 1 < inEnd )
1678         {
1679             // Normal case, we have a next character to decode.
1680             tmp[1] = wxUINT16_SWAP_ALWAYS(inBuff[1]);
1681             tmpEnd++;
1682         }
1683 
1684         const wxUint16* p = tmp;
1685         const wxUint32 ch = wxDecodeSurrogate(&p, tmpEnd);
1686         if ( !p )
1687             return wxCONV_FAILED;
1688 
1689         // Move the real pointer by the same amount as "p" was updated by.
1690         inBuff += p - tmp;
1691 
1692         outLen++;
1693 
1694         if ( dst )
1695         {
1696             if ( outLen > dstLen )
1697                 return wxCONV_FAILED;
1698 
1699             *dst++ = ch;
1700         }
1701     }
1702 
1703 
1704     return outLen;
1705 }
1706 
1707 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1708 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1709                              const wchar_t *src, size_t srcLen) const
1710 {
1711     if ( srcLen == wxNO_LEN )
1712         srcLen = wxWcslen(src) + 1;
1713 
1714     size_t outLen = 0;
1715     wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1716     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1717     {
1718         wxUint16 cc[2] = { 0 };
1719         const size_t numChars = encode_utf16(*src, cc);
1720         if ( numChars == wxCONV_FAILED )
1721             return wxCONV_FAILED;
1722 
1723         outLen += numChars * BYTES_PER_CHAR;
1724         if ( outBuff )
1725         {
1726             if ( outLen > dstLen )
1727                 return wxCONV_FAILED;
1728 
1729             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1730             if ( numChars == 2 )
1731             {
1732                 // second character of a surrogate
1733                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1734             }
1735         }
1736     }
1737 
1738     return outLen;
1739 }
1740 
1741 #endif // WC_UTF16/!WC_UTF16
1742 
1743 
1744 // ============================================================================
1745 // UTF-32
1746 // ============================================================================
1747 
1748 #ifdef WORDS_BIGENDIAN
1749     #define wxMBConvUTF32straight  wxMBConvUTF32BE
1750     #define wxMBConvUTF32swap      wxMBConvUTF32LE
1751 #else
1752     #define wxMBConvUTF32swap      wxMBConvUTF32BE
1753     #define wxMBConvUTF32straight  wxMBConvUTF32LE
1754 #endif
1755 
1756 
1757 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1758 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1759 
1760 /* static */
GetLength(const char * src,size_t srcLen)1761 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1762 {
1763     if ( srcLen == wxNO_LEN )
1764     {
1765         // count the number of bytes in input, including the trailing NULs
1766         const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1767         for ( srcLen = 1; *inBuff++; srcLen++ )
1768             ;
1769 
1770         srcLen *= BYTES_PER_CHAR;
1771     }
1772     else // we already have the length
1773     {
1774         // we can only convert an entire number of UTF-32 characters
1775         if ( srcLen % BYTES_PER_CHAR )
1776             return wxCONV_FAILED;
1777     }
1778 
1779     return srcLen;
1780 }
1781 
1782 // case when in-memory representation is UTF-16
1783 #ifdef WC_UTF16
1784 
1785 // ----------------------------------------------------------------------------
1786 // conversions without endianness change
1787 // ----------------------------------------------------------------------------
1788 
1789 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1790 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1791                                const char *src, size_t srcLen) const
1792 {
1793     srcLen = GetLength(src, srcLen);
1794     if ( srcLen == wxNO_LEN )
1795         return wxCONV_FAILED;
1796 
1797     const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1798     const size_t inLen = srcLen / BYTES_PER_CHAR;
1799     size_t outLen = 0;
1800     for ( size_t n = 0; n < inLen; n++ )
1801     {
1802         wxUint16 cc[2] = { 0 };
1803         const size_t numChars = encode_utf16(*inBuff++, cc);
1804         if ( numChars == wxCONV_FAILED )
1805             return wxCONV_FAILED;
1806 
1807         outLen += numChars;
1808         if ( dst )
1809         {
1810             if ( outLen > dstLen )
1811                 return wxCONV_FAILED;
1812 
1813             *dst++ = cc[0];
1814             if ( numChars == 2 )
1815             {
1816                 // second character of a surrogate
1817                 *dst++ = cc[1];
1818             }
1819         }
1820     }
1821 
1822     return outLen;
1823 }
1824 
1825 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1826 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1827                                  const wchar_t *src, size_t srcLen) const
1828 {
1829     if ( srcLen == wxNO_LEN )
1830         srcLen = wxWcslen(src) + 1;
1831 
1832     wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1833     size_t outLen = 0;
1834     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1835     {
1836         const wxUint32 ch = wxDecodeSurrogate(&src, srcEnd);
1837         if ( !src )
1838             return wxCONV_FAILED;
1839 
1840         outLen += BYTES_PER_CHAR;
1841 
1842         if ( outBuff )
1843         {
1844             if ( outLen > dstLen )
1845                 return wxCONV_FAILED;
1846 
1847             *outBuff++ = ch;
1848         }
1849     }
1850 
1851     return outLen;
1852 }
1853 
1854 // ----------------------------------------------------------------------------
1855 // endian-reversing conversions
1856 // ----------------------------------------------------------------------------
1857 
1858 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1859 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1860                            const char *src, size_t srcLen) const
1861 {
1862     srcLen = GetLength(src, srcLen);
1863     if ( srcLen == wxNO_LEN )
1864         return wxCONV_FAILED;
1865 
1866     const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1867     const size_t inLen = srcLen / BYTES_PER_CHAR;
1868     size_t outLen = 0;
1869     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1870     {
1871         wxUint16 cc[2] = { 0 };
1872         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1873         if ( numChars == wxCONV_FAILED )
1874             return wxCONV_FAILED;
1875 
1876         outLen += numChars;
1877         if ( dst )
1878         {
1879             if ( outLen > dstLen )
1880                 return wxCONV_FAILED;
1881 
1882             *dst++ = cc[0];
1883             if ( numChars == 2 )
1884             {
1885                 // second character of a surrogate
1886                 *dst++ = cc[1];
1887             }
1888         }
1889     }
1890 
1891     return outLen;
1892 }
1893 
1894 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1895 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1896                              const wchar_t *src, size_t srcLen) const
1897 {
1898     if ( srcLen == wxNO_LEN )
1899         srcLen = wxWcslen(src) + 1;
1900 
1901     wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1902     size_t outLen = 0;
1903     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1904     {
1905         const wxUint32 ch = wxDecodeSurrogate(&src, srcEnd);
1906         if ( !src )
1907             return wxCONV_FAILED;
1908 
1909         outLen += BYTES_PER_CHAR;
1910 
1911         if ( outBuff )
1912         {
1913             if ( outLen > dstLen )
1914                 return wxCONV_FAILED;
1915 
1916             *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1917         }
1918     }
1919 
1920     return outLen;
1921 }
1922 
1923 #else // !WC_UTF16: wchar_t is UTF-32
1924 
1925 // ----------------------------------------------------------------------------
1926 // conversions without endianness change
1927 // ----------------------------------------------------------------------------
1928 
1929 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1930 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1931                                const char *src, size_t srcLen) const
1932 {
1933     // use memcpy() as it should be much faster than hand-written loop
1934     srcLen = GetLength(src, srcLen);
1935     if ( srcLen == wxNO_LEN )
1936         return wxCONV_FAILED;
1937 
1938     const size_t inLen = srcLen/BYTES_PER_CHAR;
1939     if ( dst )
1940     {
1941         if ( dstLen < inLen )
1942             return wxCONV_FAILED;
1943 
1944         memcpy(dst, src, srcLen);
1945     }
1946 
1947     return inLen;
1948 }
1949 
1950 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1951 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1952                                  const wchar_t *src, size_t srcLen) const
1953 {
1954     if ( srcLen == wxNO_LEN )
1955         srcLen = wxWcslen(src) + 1;
1956 
1957     srcLen *= BYTES_PER_CHAR;
1958 
1959     if ( dst )
1960     {
1961         if ( dstLen < srcLen )
1962             return wxCONV_FAILED;
1963 
1964         memcpy(dst, src, srcLen);
1965     }
1966 
1967     return srcLen;
1968 }
1969 
1970 // ----------------------------------------------------------------------------
1971 // endian-reversing conversions
1972 // ----------------------------------------------------------------------------
1973 
1974 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1975 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1976                            const char *src, size_t srcLen) const
1977 {
1978     srcLen = GetLength(src, srcLen);
1979     if ( srcLen == wxNO_LEN )
1980         return wxCONV_FAILED;
1981 
1982     srcLen /= BYTES_PER_CHAR;
1983 
1984     if ( dst )
1985     {
1986         if ( dstLen < srcLen )
1987             return wxCONV_FAILED;
1988 
1989         const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1990         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1991         {
1992             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1993         }
1994     }
1995 
1996     return srcLen;
1997 }
1998 
1999 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const2000 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2001                              const wchar_t *src, size_t srcLen) const
2002 {
2003     if ( srcLen == wxNO_LEN )
2004         srcLen = wxWcslen(src) + 1;
2005 
2006     srcLen *= BYTES_PER_CHAR;
2007 
2008     if ( dst )
2009     {
2010         if ( dstLen < srcLen )
2011             return wxCONV_FAILED;
2012 
2013         wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
2014         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
2015         {
2016             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
2017         }
2018     }
2019 
2020     return srcLen;
2021 }
2022 
2023 #endif // WC_UTF16/!WC_UTF16
2024 
2025 
2026 // ============================================================================
2027 // The classes doing conversion using the iconv_xxx() functions
2028 // ============================================================================
2029 
2030 #ifdef HAVE_ICONV
2031 
2032 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2033 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
2034 //     (unless there's yet another bug in glibc) the only case when iconv()
2035 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
2036 //     left in the input buffer -- when _real_ error occurs,
2037 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2038 //     iconv() failure.
2039 //     [This bug does not appear in glibc 2.2.]
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
// old glibc: a (size_t)-1 result only counts as failure if it is not the
// spurious E2BIG error reported with no bytes left in the input buffer
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
// everywhere else any (size_t)-1 result is a failure, bufLeft is not used
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// cast the address of the input pointer to the type used for the second
// iconv() argument (ICONV_CONST is not defined in this file)
#define ICONV_CHAR_CAST(x) const_cast<ICONV_CONST char**>(x)

// value the iconv_open() results are compared with to detect its failure
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP reverses the bytes of a single wchar_t and WC_ENC is the
// wxFontEncoding value matching the size of wchar_t on this platform
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
2061 
2062 // ----------------------------------------------------------------------------
2063 // wxMBConv_iconv: encapsulates an iconv character set
2064 // ----------------------------------------------------------------------------
2065 
2066 class wxMBConv_iconv : public wxMBConv
2067 {
2068 public:
2069     wxMBConv_iconv(const char *name);
2070     virtual ~wxMBConv_iconv();
2071 
2072     // implement base class virtual methods
2073     virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2074                            const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
2075     virtual size_t FromWChar(char *dst, size_t dstLen,
2076                              const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
2077     virtual size_t GetMBNulLen() const wxOVERRIDE;
2078 
2079     virtual bool IsUTF8() const wxOVERRIDE;
2080 
Clone() const2081     virtual wxMBConv *Clone() const wxOVERRIDE
2082     {
2083         wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
2084         p->m_minMBCharWidth = m_minMBCharWidth;
2085         return p;
2086     }
2087 
IsOk() const2088     bool IsOk() const
2089         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2090 
2091 protected:
2092     // the iconv handlers used to translate from multibyte
2093     // to wide char and in the other direction
2094     iconv_t m2w,
2095             w2m;
2096 
2097 #if wxUSE_THREADS
2098     // guards access to m2w and w2m objects
2099     wxMutex m_iconvMutex;
2100 #endif
2101 
2102 private:
2103     // the name (for iconv_open()) of a wide char charset -- if none is
2104     // available on this machine, it will remain NULL
2105     static wxString ms_wcCharsetName;
2106 
2107     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2108     // different endian-ness than the native one
2109     static bool ms_wcNeedsSwap;
2110 
2111 
2112     // name of the encoding handled by this conversion
2113     const char *m_name;
2114 
2115     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2116     // initially
2117     size_t m_minMBCharWidth;
2118 };
2119 
2120 // make the constructor available for unit testing
new_wxMBConv_iconv(const char * name)2121 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2122 {
2123     wxMBConv_iconv* result = new wxMBConv_iconv( name );
2124     if ( !result->IsOk() )
2125     {
2126         delete result;
2127         return 0;
2128     }
2129 
2130     return result;
2131 }
2132 
// name of the iconv charset used for wchar_t: it is empty until a
// wxMBConv_iconv constructor finds one which works
wxString wxMBConv_iconv::ms_wcCharsetName;
// set by the constructor when the wide characters produced by iconv don't
// use the native byte order
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2135 
wxMBConv_iconv(const char * name)2136 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2137               : m_name(wxStrdup(name))
2138 {
2139     m_minMBCharWidth = 0;
2140 
2141     // check for charset that represents wchar_t:
2142     if ( ms_wcCharsetName.empty() )
2143     {
2144         wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
2145 
2146 #if wxUSE_FONTMAP
2147         const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2148 #else // !wxUSE_FONTMAP
2149         static const wxChar *const names_static[] =
2150         {
2151 #if SIZEOF_WCHAR_T == 4
2152             wxT("UCS-4"),
2153 #elif SIZEOF_WCHAR_T == 2
2154             wxT("UCS-2"),
2155 #endif
2156             NULL
2157         };
2158         const wxChar *const *names = names_static;
2159 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2160 
2161         for ( ; *names && ms_wcCharsetName.empty(); ++names )
2162         {
2163             const wxString nameCS(*names);
2164 
2165             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2166             wxString nameXE(nameCS);
2167 
2168 #ifdef WORDS_BIGENDIAN
2169                 nameXE += wxT("BE");
2170 #else // little endian
2171                 nameXE += wxT("LE");
2172 #endif
2173 
2174             wxLogTrace(TRACE_STRCONV, wxT("  trying charset \"%s\""),
2175                        nameXE.c_str());
2176 
2177             m2w = iconv_open(nameXE.ToAscii(), name);
2178             if ( m2w == ICONV_T_INVALID )
2179             {
2180                 // try charset w/o bytesex info (e.g. "UCS4")
2181                 wxLogTrace(TRACE_STRCONV, wxT("  trying charset \"%s\""),
2182                            nameCS.c_str());
2183                 m2w = iconv_open(nameCS.ToAscii(), name);
2184 
2185                 // and check for bytesex ourselves:
2186                 if ( m2w != ICONV_T_INVALID )
2187                 {
2188                     char    buf[2], *bufPtr;
2189                     wchar_t wbuf[2];
2190                     size_t  insz, outsz;
2191                     size_t  res;
2192 
2193                     buf[0] = 'A';
2194                     buf[1] = 0;
2195                     wbuf[0] = 0;
2196                     insz = 2;
2197                     outsz = SIZEOF_WCHAR_T * 2;
2198                     char* wbufPtr = (char*)wbuf;
2199                     bufPtr = buf;
2200 
2201                     res = iconv(
2202                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2203                         &wbufPtr, &outsz);
2204 
2205                     if (ICONV_FAILED(res, insz))
2206                     {
2207                         wxLogLastError(wxT("iconv"));
2208                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2209                                    nameCS.c_str());
2210                     }
2211                     else // ok, can convert to this encoding, remember it
2212                     {
2213                         ms_wcCharsetName = nameCS;
2214                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2215                     }
2216                 }
2217             }
2218             else // use charset not requiring byte swapping
2219             {
2220                 ms_wcCharsetName = nameXE;
2221             }
2222         }
2223 
2224         wxLogTrace(TRACE_STRCONV,
2225                    wxT("iconv wchar_t charset is \"%s\"%s"),
2226                    ms_wcCharsetName.empty() ? wxString("<none>")
2227                                             : ms_wcCharsetName,
2228                    ms_wcNeedsSwap ? wxT(" (needs swap)")
2229                                   : wxT(""));
2230     }
2231     else // we already have ms_wcCharsetName
2232     {
2233         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2234     }
2235 
2236     if ( ms_wcCharsetName.empty() )
2237     {
2238         w2m = ICONV_T_INVALID;
2239     }
2240     else
2241     {
2242         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2243         if ( w2m == ICONV_T_INVALID )
2244         {
2245             wxLogTrace(TRACE_STRCONV,
2246                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2247                        ms_wcCharsetName.c_str(), name);
2248         }
2249     }
2250 }
2251 
~wxMBConv_iconv()2252 wxMBConv_iconv::~wxMBConv_iconv()
2253 {
2254     free(const_cast<char *>(m_name));
2255 
2256     if ( m2w != ICONV_T_INVALID )
2257         iconv_close(m2w);
2258     if ( w2m != ICONV_T_INVALID )
2259         iconv_close(w2m);
2260 }
2261 
2262 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const2263 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2264                         const char *src, size_t srcLen) const
2265 {
2266     if ( srcLen == wxNO_LEN )
2267     {
2268         // find the string length: notice that must be done differently for
2269         // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2270         // consecutive NULs
2271         const size_t nulLen = GetMBNulLen();
2272         switch ( nulLen )
2273         {
2274             default:
2275                 return wxCONV_FAILED;
2276 
2277             case 1:
2278                 srcLen = strlen(src); // arguably more optimized than our version
2279                 break;
2280 
2281             case 2:
2282             case 4:
2283                 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2284                 // but they also have to start at character boundary and not
2285                 // span two adjacent characters
2286                 const char *p;
2287                 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2288                     ;
2289                 srcLen = p - src;
2290                 break;
2291         }
2292 
2293         // when we're determining the length of the string ourselves we count
2294         // the terminating NUL(s) as part of it and always NUL-terminate the
2295         // output
2296         srcLen += nulLen;
2297     }
2298 
2299     // we express length in the number of (wide) characters but iconv always
2300     // counts buffer sizes it in bytes
2301     dstLen *= SIZEOF_WCHAR_T;
2302 
2303 #if wxUSE_THREADS
2304     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2305     //     Unfortunately there are a couple of global wxCSConv objects such as
2306     //     wxConvLocal that are used all over wx code, so we have to make sure
2307     //     the handle is used by at most one thread at the time. Otherwise
2308     //     only a few wx classes would be safe to use from non-main threads
2309     //     as MB<->WC conversion would fail "randomly".
2310     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2311 #endif // wxUSE_THREADS
2312 
2313     size_t res, cres;
2314     const char *pszPtr = src;
2315 
2316     if ( dst )
2317     {
2318         char* bufPtr = (char*)dst;
2319 
2320         // have destination buffer, convert there
2321         size_t dstLenOrig = dstLen;
2322         cres = iconv(m2w,
2323                      ICONV_CHAR_CAST(&pszPtr), &srcLen,
2324                      &bufPtr, &dstLen);
2325 
2326         // convert the number of bytes converted as returned by iconv to the
2327         // number of (wide) characters converted that we need
2328         res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2329 
2330         if (ms_wcNeedsSwap)
2331         {
2332             // convert to native endianness
2333             for ( unsigned i = 0; i < res; i++ )
2334                 dst[i] = WC_BSWAP(dst[i]);
2335         }
2336     }
2337     else // no destination buffer
2338     {
2339         // convert using temp buffer to calculate the size of the buffer needed
2340         wchar_t tbuf[256];
2341         res = 0;
2342 
2343         do
2344         {
2345             char* bufPtr = (char*)tbuf;
2346             dstLen = 8 * SIZEOF_WCHAR_T;
2347 
2348             cres = iconv(m2w,
2349                          ICONV_CHAR_CAST(&pszPtr), &srcLen,
2350                          &bufPtr, &dstLen );
2351 
2352             res += 8 - (dstLen / SIZEOF_WCHAR_T);
2353         }
2354         while ((cres == (size_t)-1) && (errno == E2BIG));
2355     }
2356 
2357     if (ICONV_FAILED(cres, srcLen))
2358     {
2359         //VS: it is ok if iconv fails, hence trace only
2360         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsgStr(wxSysErrorCode()));
2361         return wxCONV_FAILED;
2362     }
2363 
2364     return res;
2365 }
2366 
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const2367 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2368                                  const wchar_t *src, size_t srcLen) const
2369 {
2370 #if wxUSE_THREADS
2371     // NB: explained in MB2WC
2372     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2373 #endif
2374 
2375     if ( srcLen == wxNO_LEN )
2376         srcLen = wxWcslen(src) + 1;
2377 
2378     size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2379     size_t outbuflen = dstLen;
2380     size_t res, cres;
2381 
2382     wchar_t *tmpbuf = 0;
2383 
2384     if (ms_wcNeedsSwap)
2385     {
2386         // need to copy to temp buffer to switch endianness
2387         // (doing WC_BSWAP twice on the original buffer won't work, as it
2388         //  could be in read-only memory, or be accessed in some other thread)
2389         tmpbuf = (wchar_t *)malloc(inbuflen);
2390         for ( size_t i = 0; i < srcLen; i++ )
2391             tmpbuf[i] = WC_BSWAP(src[i]);
2392 
2393         src = tmpbuf;
2394     }
2395 
2396     const char* inbuf = reinterpret_cast<const char*>(src);
2397     if ( dst )
2398     {
2399         // have destination buffer, convert there
2400         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2401 
2402         res = dstLen - outbuflen;
2403     }
2404     else // no destination buffer
2405     {
2406         // convert using temp buffer to calculate the size of the buffer needed
2407         char tbuf[256];
2408         res = 0;
2409         do
2410         {
2411             dst = tbuf;
2412             outbuflen = WXSIZEOF(tbuf);
2413 
2414             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2415 
2416             res += WXSIZEOF(tbuf) - outbuflen;
2417         }
2418         while ((cres == (size_t)-1) && (errno == E2BIG));
2419     }
2420 
2421     if (ms_wcNeedsSwap)
2422     {
2423         free(tmpbuf);
2424     }
2425 
2426     if (ICONV_FAILED(cres, inbuflen))
2427     {
2428         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsgStr(wxSysErrorCode()));
2429         return wxCONV_FAILED;
2430     }
2431 
2432     return res;
2433 }
2434 
GetMBNulLen() const2435 size_t wxMBConv_iconv::GetMBNulLen() const
2436 {
2437     if ( m_minMBCharWidth == 0 )
2438     {
2439         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2440 
2441 #if wxUSE_THREADS
2442         // NB: explained in MB2WC
2443         wxMutexLocker lock(self->m_iconvMutex);
2444 #endif
2445 
2446         const wchar_t *wnul = L"";
2447         char buf[8]; // should be enough for NUL in any encoding
2448         size_t inLen = sizeof(wchar_t),
2449                outLen = WXSIZEOF(buf);
2450         const char* inBuff = reinterpret_cast<const char*>(wnul);
2451         char *outBuff = buf;
2452         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2453         {
2454             self->m_minMBCharWidth = (size_t)-1;
2455         }
2456         else // ok
2457         {
2458             self->m_minMBCharWidth = outBuff - buf;
2459         }
2460     }
2461 
2462     return m_minMBCharWidth;
2463 }
2464 
IsUTF8() const2465 bool wxMBConv_iconv::IsUTF8() const
2466 {
2467     return wxStricmp(m_name, "UTF-8") == 0 ||
2468            wxStricmp(m_name, "UTF8") == 0;
2469 }
2470 
2471 #endif // HAVE_ICONV
2472 
2473 
2474 // ============================================================================
2475 // Win32 conversion classes
2476 // ============================================================================
2477 
2478 #ifdef wxHAVE_WIN32_MB2WC
2479 
// from utils.cpp: these functions are used by wxMBConv_win32 constructors to
// obtain the code page to use from a charset name or a wxFontEncoding value
#if wxUSE_FONTMAP
extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
#endif
extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2485 
// Conversion class using the Win32 MultiByteToWideChar() and
// WideCharToMultiByte() functions for the code page chosen when the object is
// created.
class wxMBConv_win32 : public wxMBConv
{
public:
    // default ctor uses the CP_ACP code page
    wxMBConv_win32()
    {
        m_CodePage = CP_ACP;
        m_minMBCharWidth = 0;
    }

    // copy ctor: copies the code page and the cached NUL length, used by
    // Clone()
    wxMBConv_win32(const wxMBConv_win32& conv)
        : wxMBConv()
    {
        m_CodePage = conv.m_CodePage;
        m_minMBCharWidth = conv.m_minMBCharWidth;
    }

#if wxUSE_FONTMAP
    // use the code page returned by wxCharsetToCodepage() for this charset
    // name, call IsOk() to check if it is usable
    wxMBConv_win32(const char* name)
    {
        m_CodePage = wxCharsetToCodepage(name);
        m_minMBCharWidth = 0;
    }
#endif // wxUSE_FONTMAP

    // use the code page returned by wxEncodingToCodepage() for this encoding,
    // call IsOk() to check if it is usable
    wxMBConv_win32(wxFontEncoding encoding)
    {
        m_CodePage = wxEncodingToCodepage(encoding);
        m_minMBCharWidth = 0;
    }

    // Convert the NUL-terminated string psz to wide characters, storing at
    // most n of them in buf if it is not NULL.
    //
    // Returns the length of the result not including the terminating NUL or
    // wxCONV_FAILED if the conversion failed.
    virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const wxOVERRIDE
    {
        // note that we have to use MB_ERR_INVALID_CHARS flag as without it
        // the behaviour is not compatible with the Unix version (using iconv)
        // and breaks the library itself, e.g. wxTextInputStream::NextChar()
        // wouldn't work if reading an incomplete MB char didn't result in an
        // error
        //
        // Moreover, MB_ERR_INVALID_CHARS is not supported for UTF-8 under XP
        // and for UTF-7 under any Windows version, so we always use our own
        // conversions in this case.
        if ( m_CodePage == CP_UTF8 )
        {
            return wxMBConvUTF8().MB2WC(buf, psz, n);
        }

        if ( m_CodePage == CP_UTF7 )
        {
            return wxMBConvUTF7().MB2WC(buf, psz, n);
        }

        const size_t len = ::MultiByteToWideChar
                             (
                                m_CodePage,     // code page
                                MB_ERR_INVALID_CHARS,  // flags: fail on error
                                psz,            // input string
                                -1,             // its length (NUL-terminated)
                                buf,            // output string
                                buf ? n : 0     // size of output buffer
                             );
        if ( !len )
            return wxCONV_FAILED;

        // note that it returns count of written chars for buf != NULL and size
        // of the needed buffer for buf == NULL so in either case the length of
        // the string (which never includes the terminating NUL) is one less
        return len - 1;
    }

    // Convert the NUL-terminated wide string pwz to multibyte, storing at
    // most n bytes in buf if it is not NULL.
    //
    // Returns the length of the result not including the terminating NUL or
    // wxCONV_FAILED if the conversion failed or was found to be lossy.
    virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const wxOVERRIDE
    {
        /*
            We need to use WC_NO_BEST_FIT_CHARS to prevent
            WideCharToMultiByte() from replacing characters unrepresentable in
            the target code page with bad quality approximations such as
            turning "1/2" symbol (U+00BD) into "1" for the code pages which
            don't have the fraction symbol.

            Unfortunately this flag can't be used with CJK encodings nor
            UTF-7/8 and so if the code page is one of those, we need to resort
            to a round trip to verify that no replacements have been done.
         */
        BOOL usedDef wxDUMMY_INITIALIZE(false);
        BOOL *pUsedDef;
        int flags;
        if ( m_CodePage < 50000 )
        {
            // it's our lucky day
            flags = WC_NO_BEST_FIT_CHARS;
            pUsedDef = &usedDef;
        }
        else // old system or unsupported encoding
        {
            // no flags and no "used default char" output for these code pages
            flags = 0;
            pUsedDef = NULL;
        }

        const size_t len = ::WideCharToMultiByte
                             (
                                m_CodePage,     // code page
                                flags,          // either none or no best fit
                                pwz,            // input string
                                -1,             // it is (wide) NUL-terminated
                                buf,            // output buffer
                                buf ? n : 0,    // and its size
                                NULL,           // default "replacement" char
                                pUsedDef        // [out] was it used?
                             );

        if ( !len )
        {
            // function totally failed
            return wxCONV_FAILED;
        }

        // we did something, check if we really succeeded
        if ( flags )
        {
            // check if the conversion failed, i.e. if any replacements
            // were done
            if ( usedDef )
                return wxCONV_FAILED;
        }
        else // we must resort to double tripping...
        {
            // first we need to ensure that we really have the MB data: this is
            // not the case if we're called with NULL buffer, in which case we
            // need to do the conversion yet again
            wxCharBuffer bufDef;
            if ( !buf )
            {
                bufDef = wxCharBuffer(len);
                buf = bufDef.data();
                if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
                                            buf, len, NULL, NULL) )
                    return wxCONV_FAILED;
            }

            // convert the result back to wide characters and compare it with
            // the original string
            if ( !n )
                n = wcslen(pwz);
            wxWCharBuffer wcBuf(n);
            if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
                    wcscmp(wcBuf, pwz) != 0 )
            {
                // we didn't obtain the same thing we started from, hence
                // the conversion was lossy and we consider that it failed
                return wxCONV_FAILED;
            }
        }

        // as in MB2WC(), the returned length never includes the terminating
        // NUL which is counted in len, hence "len - 1"
        return len - 1;
    }

    // Return the number of bytes used for NUL in this code page, which can be
    // 1, 2 or 4, or (size_t)-1 if it couldn't be determined. The value is
    // computed during the first call only and cached in m_minMBCharWidth.
    virtual size_t GetMBNulLen() const wxOVERRIDE
    {
        if ( m_minMBCharWidth == 0 )
        {
            // ask for the size of the buffer needed for converting just NUL
            int len = ::WideCharToMultiByte
                        (
                            m_CodePage,     // code page
                            0,              // no flags
                            L"",            // input string
                            1,              // translate just the NUL
                            NULL,           // output buffer
                            0,              // and its size
                            NULL,           // no replacement char
                            NULL            // [out] don't care if it was used
                        );

            wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
            switch ( len )
            {
                default:
                    wxLogDebug(wxT("Unexpected NUL length %d"), len);
                    self->m_minMBCharWidth = (size_t)-1;
                    break;

                case 0:
                    // the function failed, remember it without logging
                    self->m_minMBCharWidth = (size_t)-1;
                    break;

                case 1:
                case 2:
                case 4:
                    self->m_minMBCharWidth = len;
                    break;
            }
        }

        return m_minMBCharWidth;
    }

    virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConv_win32(*this); }

    // the object is usable unless its code page is -1
    // NOTE(review): presumably -1 is what the code page lookup functions
    // return for unknown charsets/encodings -- confirm in utils.cpp
    bool IsOk() const { return m_CodePage != -1; }

private:
    // the code page we're working with
    long m_CodePage;

    // cached result of GetMBNulLen(), set to 0 initially meaning
    // "unknown"
    size_t m_minMBCharWidth;
};
2691 
2692 #endif // wxHAVE_WIN32_MB2WC
2693 
2694 
2695 // ============================================================================
2696 // wxEncodingConverter based conversion classes
2697 // ============================================================================
2698 
2699 #if wxUSE_FONTMAP
2700 
2701 class wxMBConv_wxwin : public wxMBConv
2702 {
2703 private:
Init()2704     void Init()
2705     {
2706         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2707         // The wxMBConv_cf class does a better job.
2708         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2709                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2710                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2711     }
2712 
2713 public:
2714     // temporarily just use wxEncodingConverter stuff,
2715     // so that it works while a better implementation is built
wxMBConv_wxwin(const char * name)2716     wxMBConv_wxwin(const char* name)
2717     {
2718         if (name)
2719             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2720         else
2721             m_enc = wxFONTENCODING_SYSTEM;
2722 
2723         Init();
2724     }
2725 
wxMBConv_wxwin(wxFontEncoding enc)2726     wxMBConv_wxwin(wxFontEncoding enc)
2727     {
2728         m_enc = enc;
2729 
2730         Init();
2731     }
2732 
MB2WC(wchar_t * buf,const char * psz,size_t WXUNUSED (n)) const2733     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const wxOVERRIDE
2734     {
2735         size_t inbuf = strlen(psz);
2736         if (buf)
2737         {
2738             if (!m2w.Convert(psz, buf))
2739                 return wxCONV_FAILED;
2740         }
2741         return inbuf;
2742     }
2743 
WC2MB(char * buf,const wchar_t * psz,size_t WXUNUSED (n)) const2744     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const wxOVERRIDE
2745     {
2746         const size_t inbuf = wxWcslen(psz);
2747         if (buf)
2748         {
2749             if (!w2m.Convert(psz, buf))
2750                 return wxCONV_FAILED;
2751         }
2752 
2753         return inbuf;
2754     }
2755 
GetMBNulLen() const2756     virtual size_t GetMBNulLen() const wxOVERRIDE
2757     {
2758         switch ( m_enc )
2759         {
2760             case wxFONTENCODING_UTF16BE:
2761             case wxFONTENCODING_UTF16LE:
2762                 return 2;
2763 
2764             case wxFONTENCODING_UTF32BE:
2765             case wxFONTENCODING_UTF32LE:
2766                 return 4;
2767 
2768             default:
2769                 return 1;
2770         }
2771     }
2772 
Clone() const2773     virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConv_wxwin(m_enc); }
2774 
IsOk() const2775     bool IsOk() const { return m_ok; }
2776 
2777 public:
2778     wxFontEncoding m_enc;
2779     wxEncodingConverter m2w, w2m;
2780 
2781 private:
2782     // were we initialized successfully?
2783     bool m_ok;
2784 
2785     wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
2786 };
2787 
2788 // make the constructors available for unit testing
new_wxMBConv_wxwin(const char * name)2789 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2790 {
2791     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2792     if ( !result->IsOk() )
2793     {
2794         delete result;
2795         return 0;
2796     }
2797 
2798     return result;
2799 }
2800 
2801 #endif // wxUSE_FONTMAP
2802 
2803 // ============================================================================
2804 // wxCSConv implementation
2805 // ============================================================================
2806 
Init()2807 void wxCSConv::Init()
2808 {
2809     m_name = NULL;
2810     m_convReal =  NULL;
2811 }
2812 
SetEncoding(wxFontEncoding encoding)2813 void wxCSConv::SetEncoding(wxFontEncoding encoding)
2814 {
2815     switch ( encoding )
2816     {
2817         case wxFONTENCODING_MAX:
2818         case wxFONTENCODING_SYSTEM:
2819             if ( m_name )
2820             {
2821                 // It's ok to not have encoding value if we have a name for it.
2822                 m_encoding = wxFONTENCODING_SYSTEM;
2823             }
2824             else // No name neither.
2825             {
2826                 // Fall back to the system default encoding in this case (not
2827                 // sure how much sense does this make but this is how the old
2828                 // code used to behave).
2829 #if wxUSE_INTL
2830                 m_encoding = wxLocale::GetSystemEncoding();
2831                 if ( m_encoding == wxFONTENCODING_SYSTEM )
2832 #endif // wxUSE_INTL
2833                     m_encoding = wxFONTENCODING_ISO8859_1;
2834             }
2835             break;
2836 
2837         case wxFONTENCODING_DEFAULT:
2838             // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2839             m_encoding = wxFONTENCODING_ISO8859_1;
2840             break;
2841 
2842         default:
2843             // Just use the provided encoding.
2844             m_encoding = encoding;
2845     }
2846 }
2847 
wxCSConv(const wxString & charset)2848 wxCSConv::wxCSConv(const wxString& charset)
2849 {
2850     Init();
2851 
2852     if ( !charset.empty() )
2853     {
2854         SetName(charset.ToAscii());
2855     }
2856 
2857 #if wxUSE_FONTMAP
2858     SetEncoding(wxFontMapperBase::GetEncodingFromName(charset));
2859 #else
2860     SetEncoding(wxFONTENCODING_SYSTEM);
2861 #endif
2862 
2863     m_convReal = DoCreate();
2864 }
2865 
wxCSConv(wxFontEncoding encoding)2866 wxCSConv::wxCSConv(wxFontEncoding encoding)
2867 {
2868     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2869     {
2870         wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
2871 
2872         encoding = wxFONTENCODING_SYSTEM;
2873     }
2874 
2875     Init();
2876 
2877     SetEncoding(encoding);
2878 
2879     m_convReal = DoCreate();
2880 }
2881 
~wxCSConv()2882 wxCSConv::~wxCSConv()
2883 {
2884     Clear();
2885 }
2886 
wxCSConv::wxCSConv(const wxCSConv& conv)
        : wxMBConv()
{
    // Copy the name and the encoding of the other object and create our own
    // real converter from them instead of sharing or cloning the one used by
    // the source object.
    Init();

    SetName(conv.m_name);           // must be done before SetEncoding()
    SetEncoding(conv.m_encoding);

    m_convReal = DoCreate();
}
2897 
wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
{
    // Replace the name, encoding and real converter of this object with
    // those corresponding to the other one and return this object itself.
    //
    // Self-assignment must be handled specially: Clear() frees our own name
    // and resets it to NULL, so when "conv" is this very object there would
    // be nothing left to copy from by the time SetName() is called and the
    // charset name would be silently lost. There is nothing to do in this
    // case anyhow.
    if ( this != &conv )
    {
        Clear();

        SetName(conv.m_name);
        SetEncoding(conv.m_encoding);

        m_convReal = DoCreate();
    }

    return *this;
}
2909 
Clear()2910 void wxCSConv::Clear()
2911 {
2912     free(m_name);
2913     m_name = NULL;
2914 
2915     wxDELETE(m_convReal);
2916 }
2917 
SetName(const char * charset)2918 void wxCSConv::SetName(const char *charset)
2919 {
2920     if ( charset )
2921         m_name = wxStrdup(charset);
2922 }
2923 
#if wxUSE_FONTMAP

// Hash map type associating a charset name (wxString) with wxFontEncoding.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// Cache used by wxCSConv::DoCreate(): for each encoding it stores either the
// charset name for which an iconv-based converter was successfully created
// or an empty string if none of the names of this encoding worked.
static wxEncodingNameCache gs_nameCache;
#endif
2931 
DoCreate() const2932 wxMBConv *wxCSConv::DoCreate() const
2933 {
2934 #if wxUSE_FONTMAP
2935     wxLogTrace(TRACE_STRCONV,
2936                wxT("creating conversion for %s"),
2937                (m_name ? m_name
2938                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2939 #endif // wxUSE_FONTMAP
2940 
2941     // check for the special case of ASCII or ISO8859-1 charset: as we have
2942     // special knowledge of it anyhow, we don't need to create a special
2943     // conversion object
2944     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2945     {
2946         // don't convert at all
2947         return NULL;
2948     }
2949 
2950     // we trust OS to do conversion better than we can so try external
2951     // conversion methods first
2952     //
2953     // the full order is:
2954     //      1. OS conversion (iconv() under Unix or Win32 API)
2955     //      2. hard coded conversions for UTF
2956     //      3. wxEncodingConverter as fall back
2957 
2958     // step (1)
2959 #ifdef HAVE_ICONV
2960 #if !wxUSE_FONTMAP
2961     if ( m_name )
2962 #endif // !wxUSE_FONTMAP
2963     {
2964 #if wxUSE_FONTMAP
2965         wxFontEncoding encoding(m_encoding);
2966 #endif
2967 
2968         if ( m_name )
2969         {
2970             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2971             if ( conv->IsOk() )
2972                 return conv;
2973 
2974             delete conv;
2975 
2976 #if wxUSE_FONTMAP
2977             encoding =
2978                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2979 #endif // wxUSE_FONTMAP
2980         }
2981 #if wxUSE_FONTMAP
2982         {
2983             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2984             if ( it != gs_nameCache.end() )
2985             {
2986                 if ( it->second.empty() )
2987                     return NULL;
2988 
2989                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2990                 if ( conv->IsOk() )
2991                     return conv;
2992 
2993                 delete conv;
2994             }
2995 
2996             const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
2997             // CS : in case this does not return valid names (eg for MacRoman)
2998             // encoding got a 'failure' entry in the cache all the same,
2999             // although it just has to be created using a different method, so
3000             // only store failed iconv creation attempts (or perhaps we
3001             // shoulnd't do this at all ?)
3002             if ( names[0] != NULL )
3003             {
3004                 for ( ; *names; ++names )
3005                 {
3006                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3007                     //             will need changes that will obsolete this
3008                     wxString name(*names);
3009                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3010                     if ( conv->IsOk() )
3011                     {
3012                         gs_nameCache[encoding] = *names;
3013                         return conv;
3014                     }
3015 
3016                     delete conv;
3017                 }
3018 
3019                 gs_nameCache[encoding] = wxT(""); // cache the failure
3020             }
3021         }
3022 #endif // wxUSE_FONTMAP
3023     }
3024 #endif // HAVE_ICONV
3025 
3026 #ifdef wxHAVE_WIN32_MB2WC
3027     {
3028 #if wxUSE_FONTMAP
3029         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3030                                       : new wxMBConv_win32(m_encoding);
3031 #else
3032         wxMBConv_win32* conv = new wxMBConv_win32(m_encoding);
3033 #endif
3034         if ( conv->IsOk() )
3035             return conv;
3036 
3037         delete conv;
3038     }
3039 #endif // wxHAVE_WIN32_MB2WC
3040 
3041 #ifdef __DARWIN__
3042     {
3043         // leave UTF16 and UTF32 to the built-ins of wx
3044         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3045             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3046         {
3047 #if wxUSE_FONTMAP
3048             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3049                                           : new wxMBConv_cf(m_encoding);
3050 #else
3051             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3052 #endif
3053 
3054             if ( conv->IsOk() )
3055                  return conv;
3056 
3057             delete conv;
3058         }
3059     }
3060 #endif // __DARWIN__
3061 
3062     // step (2)
3063     wxFontEncoding enc = m_encoding;
3064 #if wxUSE_FONTMAP
3065     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3066     {
3067         // use "false" to suppress interactive dialogs -- we can be called from
3068         // anywhere and popping up a dialog from here is the last thing we want to
3069         // do
3070         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3071     }
3072 #endif // wxUSE_FONTMAP
3073 
3074     switch ( enc )
3075     {
3076         case wxFONTENCODING_UTF7:
3077              return new wxMBConvUTF7;
3078 
3079         case wxFONTENCODING_UTF8:
3080              return new wxMBConvUTF8;
3081 
3082         case wxFONTENCODING_UTF16BE:
3083              return new wxMBConvUTF16BE;
3084 
3085         case wxFONTENCODING_UTF16LE:
3086              return new wxMBConvUTF16LE;
3087 
3088         case wxFONTENCODING_UTF32BE:
3089              return new wxMBConvUTF32BE;
3090 
3091         case wxFONTENCODING_UTF32LE:
3092              return new wxMBConvUTF32LE;
3093 
3094         default:
3095              // nothing to do but put here to suppress gcc warnings
3096              break;
3097     }
3098 
3099     // step (3)
3100 #if wxUSE_FONTMAP
3101     {
3102         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3103                                       : new wxMBConv_wxwin(m_encoding);
3104         if ( conv->IsOk() )
3105             return conv;
3106 
3107         delete conv;
3108     }
3109 
3110     wxLogTrace(TRACE_STRCONV,
3111                wxT("encoding \"%s\" is not supported by this system"),
3112                (m_name ? wxString(m_name)
3113                        : wxFontMapperBase::GetEncodingName(m_encoding)));
3114 #endif // wxUSE_FONTMAP
3115 
3116     return NULL;
3117 }
3118 
IsOk() const3119 bool wxCSConv::IsOk() const
3120 {
3121     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3122     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3123         return true; // always ok as we do it ourselves
3124 
3125     // m_convReal->IsOk() is called at its own creation, so we know it must
3126     // be ok if m_convReal is non-NULL
3127     return m_convReal != NULL;
3128 }
3129 
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const3130 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3131                          const char *src, size_t srcLen) const
3132 {
3133     if (m_convReal)
3134         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3135 
3136     // latin-1 (direct)
3137     if ( srcLen == wxNO_LEN )
3138         srcLen = strlen(src) + 1; // take trailing NUL too
3139 
3140     if ( dst )
3141     {
3142         if ( dstLen < srcLen )
3143             return wxCONV_FAILED;
3144 
3145         for ( size_t n = 0; n < srcLen; n++ )
3146             dst[n] = (unsigned char)(src[n]);
3147     }
3148 
3149     return srcLen;
3150 }
3151 
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const3152 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3153                            const wchar_t *src, size_t srcLen) const
3154 {
3155     if (m_convReal)
3156         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3157 
3158     // latin-1 (direct)
3159     if ( srcLen == wxNO_LEN )
3160         srcLen = wxWcslen(src) + 1;
3161 
3162     if ( dst )
3163     {
3164         if ( dstLen < srcLen )
3165             return wxCONV_FAILED;
3166 
3167         for ( size_t n = 0; n < srcLen; n++ )
3168         {
3169             if ( src[n] > 0xFF )
3170                 return wxCONV_FAILED;
3171 
3172             dst[n] = (char)src[n];
3173         }
3174 
3175     }
3176     else // still need to check the input validity
3177     {
3178         for ( size_t n = 0; n < srcLen; n++ )
3179         {
3180             if ( src[n] > 0xFF )
3181                 return wxCONV_FAILED;
3182         }
3183     }
3184 
3185     return srcLen;
3186 }
3187 
GetMBNulLen() const3188 size_t wxCSConv::GetMBNulLen() const
3189 {
3190     if ( m_convReal )
3191         return m_convReal->GetMBNulLen();
3192 
3193     // otherwise, we are ISO-8859-1
3194     return 1;
3195 }
3196 
IsUTF8() const3197 bool wxCSConv::IsUTF8() const
3198 {
3199     if ( m_convReal )
3200         return m_convReal->IsUTF8();
3201 
3202     // otherwise, we are ISO-8859-1
3203     return false;
3204 }
3205 
3206 
3207 // ============================================================================
3208 // wxWhateverWorksConv
3209 // ============================================================================
3210 
3211 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const3212 wxWhateverWorksConv::ToWChar(wchar_t *dst, size_t dstLen,
3213                              const char *src, size_t srcLen) const
3214 {
3215     size_t rc = wxConvUTF8.ToWChar(dst, dstLen, src, srcLen);
3216     if ( rc != wxCONV_FAILED )
3217         return rc;
3218 
3219     rc = wxConvLibc.ToWChar(dst, dstLen, src, srcLen);
3220     if ( rc != wxCONV_FAILED )
3221         return rc;
3222 
3223     rc = wxConvISO8859_1.ToWChar(dst, dstLen, src, srcLen);
3224 
3225     return rc;
3226 }
3227 
3228 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const3229 wxWhateverWorksConv::FromWChar(char *dst, size_t dstLen,
3230                                const wchar_t *src, size_t srcLen) const
3231 {
3232     size_t rc = wxConvLibc.FromWChar(dst, dstLen, src, srcLen);
3233     if ( rc != wxCONV_FAILED )
3234         return rc;
3235 
3236     rc = wxConvUTF8.FromWChar(dst, dstLen, src, srcLen);
3237 
3238     return rc;
3239 }
3240 
3241 // ----------------------------------------------------------------------------
3242 // globals
3243 // ----------------------------------------------------------------------------
3244 
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3251 
// The macros below need the bare names of the converters to build the
// identifiers derived from them.
// NOTE(review): presumably these names are defined as macros wrapping the
// accessor functions in the public header -- confirm there.
#undef wxConvLibc
#undef wxConvUTF8
#undef wxConvUTF7
#undef wxConvWhateverWorks
#undef wxConvLocal
#undef wxConvISO8859_1

// Define everything needed for the global converter with the given name:
//
//  - the name##Ptr global pointer, initially NULL;
//  - the wxGet_##name##Ptr() accessor returning the address of a function
//    static object of type impl_klass constructed using ctor_args;
//  - a static variable initialized by calling this accessor.
//
// klass is the type of the pointers, impl_klass the type of the object
// actually created. The macro must be followed by a semicolon.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Simplified version for the common case when the type of the object is the
// same as the type of the pointers to it.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3273 
#ifdef __INTELC__
    // disable warning "variable 'xxx' was declared but never referenced"
    #pragma warning(disable: 177)
#endif // Intel C++

// wxConvLibc is implemented by wxMBConv_win32 under Windows and by
// wxMBConvLibc elsewhere (the wxMBConv_cf branch is disabled by "#elif 0").
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#elif 0 // defined(__WXOSX__)
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc,  (wxFONTENCODING_UTF8));
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif

// NB: we can't use wxEMPTY_PARAMETER_VALUE as the final argument here because
//     it's passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so
//     still provokes an error message about "not enough macro parameters";
//     and we can't use "()" here as the name##Obj declaration would be parsed
//     as a function declaration then, so use a semicolon and live with an
//     extra empty statement (and hope that no compiler warns about this)
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
WX_DEFINE_GLOBAL_CONV(wxWhateverWorksConv, wxConvWhateverWorks, ;);

// these two are wxCSConv objects created for the given encodings
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// wxConvCurrent initially points to the libc converter and wxConvUI to the
// local one
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();

#ifdef __DARWIN__
// It is important to use this conversion object under Darwin as it ensures
// that Unicode strings are (re)composed correctly even though xnu kernel uses
// decomposed form internally (at least for the file names).
static wxMBConvD_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif

// the converter used for the file names: the object defined above under
// Darwin and wxConvWhateverWorks elsewhere
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
                                    &wxConvMacUTF8DObj;
#else // !__DARWIN__
                                    wxGet_wxConvWhateverWorksPtr();
#endif // __DARWIN__/!__DARWIN__
3316