1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
9 // (c) 2000-2003 Vadim Zeitlin
10 // (c) 2004 Ryan Norton, Fredrik Roubert
11 // Licence: wxWindows licence
12 /////////////////////////////////////////////////////////////////////////////
13
14 // For compilers that support precompilation, includes "wx.h".
15 #include "wx/wxprec.h"
16
17 #ifdef __BORLANDC__
18 #pragma hdrstop
19 #endif //__BORLANDC__
20
21 #ifndef WX_PRECOMP
22 #include "wx/intl.h"
23 #include "wx/log.h"
24 #include "wx/utils.h"
25 #include "wx/hashmap.h"
26 #endif
27
28 #include "wx/strconv.h"
29
30 #ifndef __WXWINCE__
31 #include <errno.h>
32 #endif
33
34 #include <ctype.h>
35 #include <string.h>
36 #include <stdlib.h>
37
38 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
39 #include "wx/msw/private.h"
40 #include "wx/msw/missing.h"
41 #define wxHAVE_WIN32_MB2WC
42 #endif
43
44 #ifdef HAVE_ICONV
45 #include <iconv.h>
46 #include "wx/thread.h"
47 #endif
48
49 #include "wx/encconv.h"
50 #include "wx/fontmap.h"
51
52 #ifdef __DARWIN__
53 #include "wx/osx/core/private/strconv_cf.h"
54 #endif //def __DARWIN__
55
56
57 #define TRACE_STRCONV wxT("strconv")
58
59 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
60 // be 4 bytes
61 #if SIZEOF_WCHAR_T == 2
62 #define WC_UTF16
63 #endif
64
65
66 // ============================================================================
67 // implementation
68 // ============================================================================
69
// Helper used by wxMBConv::ToWChar(): returns true if at least one of the n
// bytes starting at p is not NUL, and false if all of them are NUL (which is
// trivially the case when n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( size_t i = 0; i < n; ++i )
    {
        if ( p[i] != '\0' )
            return true;
    }

    return false;
}
78
79 // ----------------------------------------------------------------------------
80 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
81 // ----------------------------------------------------------------------------
82
encode_utf16(wxUint32 input,wxUint16 * output)83 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
84 {
85 if (input <= 0xffff)
86 {
87 if (output)
88 *output = (wxUint16) input;
89
90 return 1;
91 }
92 else if (input >= 0x110000)
93 {
94 return wxCONV_FAILED;
95 }
96 else
97 {
98 if (output)
99 {
100 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
101 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
102 }
103
104 return 2;
105 }
106 }
107
108 // Returns the next UTF-32 character from the wchar_t buffer terminated by the
109 // "end" pointer (the caller must ensure that on input "*pSrc < end") and
110 // advances the pointer to the character after this one.
111 //
112 // If an invalid or incomplete character is found, *pSrc is set to NULL, the
113 // caller must check for this.
wxDecodeSurrogate(const wxChar16 ** pSrc,const wxChar16 * end)114 static wxUint32 wxDecodeSurrogate(const wxChar16 **pSrc, const wxChar16* end)
115 {
116 const wxChar16*& src = *pSrc;
117
118 // Is this a BMP character?
119 const wxUint16 u = *src++;
120 if ((u < 0xd800) || (u > 0xdfff))
121 {
122 // Yes, just return it.
123 return u;
124 }
125
126 // No, we have the first half of a surrogate, check if we also have the
127 // second half (notice that this check does nothing if end == NULL, as it
128 // is allowed to be, and this is correct).
129 if ( src == end )
130 {
131 // No, we don't because this is the end of input.
132 src = NULL;
133 return 0;
134 }
135
136 const wxUint16 u2 = *src++;
137 if ( (u2 < 0xdc00) || (u2 > 0xdfff) )
138 {
139 // No, it's not in the low surrogate range.
140 src = NULL;
141 return 0;
142 }
143
144 // Yes, decode it and return the corresponding Unicode character.
145 return ((u - 0xd7c0) << 10) + (u2 - 0xdc00);
146 }
147
148 // ----------------------------------------------------------------------------
149 // wxMBConv
150 // ----------------------------------------------------------------------------
151
152 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const153 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
154 const char *src, size_t srcLen) const
155 {
156 // although new conversion classes are supposed to implement this function
157 // directly, the existing ones only implement the old MB2WC() and so, to
158 // avoid to have to rewrite all conversion classes at once, we provide a
159 // default (but not efficient) implementation of this one in terms of the
160 // old function by copying the input to ensure that it's NUL-terminated and
161 // then using MB2WC() to convert it
162 //
163 // moreover, some conversion classes simply can't implement ToWChar()
164 // directly, the primary example is wxConvLibc: mbstowcs() only handles
165 // NUL-terminated strings
166
167 // the number of chars [which would be] written to dst [if it were not NULL]
168 size_t dstWritten = 0;
169
170 // the number of NULs terminating this string
171 size_t nulLen = 0; // not really needed, but just to avoid warnings
172
173 // if we were not given the input size we just have to assume that the
174 // string is properly terminated as we have no way of knowing how long it
175 // is anyhow, but if we do have the size check whether there are enough
176 // NULs at the end
177 wxCharBuffer bufTmp;
178 const char *srcEnd;
179 if ( srcLen != wxNO_LEN )
180 {
181 // we need to know how to find the end of this string
182 nulLen = GetMBNulLen();
183 if ( nulLen == wxCONV_FAILED )
184 return wxCONV_FAILED;
185
186 // if there are enough NULs we can avoid the copy
187 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
188 {
189 // make a copy in order to properly NUL-terminate the string
190 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
191 char * const p = bufTmp.data();
192 memcpy(p, src, srcLen);
193 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
194 *s = '\0';
195
196 src = bufTmp;
197 }
198
199 srcEnd = src + srcLen;
200 }
201 else // quit after the first loop iteration
202 {
203 srcEnd = NULL;
204 }
205
206 // the idea of this code is straightforward: it converts a NUL-terminated
207 // chunk of the string during each iteration and updates the output buffer
208 // with the result
209 //
210 // all the complication come from the fact that this function, for
211 // historical reasons, must behave in 2 subtly different ways when it's
212 // called with a fixed number of characters and when it's called for the
213 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
214 // must count all characters we convert, NUL or not; but in the latter we
215 // do not count the trailing NUL -- but still count all the NULs inside the
216 // string
217 //
218 // so for the (simple) former case we just always count the trailing NUL,
219 // but for the latter we need to wait until we see if there is going to be
220 // another loop iteration and only count it then
221 for ( ;; )
222 {
223 // try to convert the current chunk
224 size_t lenChunk = MB2WC(NULL, src, 0);
225 if ( lenChunk == wxCONV_FAILED )
226 return wxCONV_FAILED;
227
228 dstWritten += lenChunk;
229 if ( !srcEnd )
230 dstWritten++;
231
232 if ( dst )
233 {
234 if ( dstWritten > dstLen )
235 return wxCONV_FAILED;
236
237 // +1 is for trailing NUL
238 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
239 return wxCONV_FAILED;
240
241 dst += lenChunk;
242 if ( !srcEnd )
243 dst++;
244 }
245
246 if ( !srcEnd )
247 {
248 // we convert just one chunk in this case as this is the entire
249 // string anyhow (and we don't count the trailing NUL in this case)
250 break;
251 }
252
253 // advance the input pointer past the end of this chunk: notice that we
254 // will always stop before srcEnd because we know that the chunk is
255 // always properly NUL-terminated
256 while ( NotAllNULs(src, nulLen) )
257 {
258 // notice that we must skip over multiple bytes here as we suppose
259 // that if NUL takes 2 or 4 bytes, then all the other characters do
260 // too and so if advanced by a single byte we might erroneously
261 // detect sequences of NUL bytes in the middle of the input
262 src += nulLen;
263 }
264
265 // if the buffer ends before this NUL, we shouldn't count it in our
266 // output so skip the code below
267 if ( src == srcEnd )
268 break;
269
270 // do count this terminator as it's inside the buffer we convert
271 dstWritten++;
272 if ( dst )
273 dst++;
274
275 src += nulLen; // skip the terminator itself
276
277 if ( src >= srcEnd )
278 break;
279 }
280
281 return dstWritten;
282 }
283
284 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const285 wxMBConv::FromWChar(char *dst, size_t dstLen,
286 const wchar_t *src, size_t srcLen) const
287 {
288 // the number of chars [which would be] written to dst [if it were not NULL]
289 size_t dstWritten = 0;
290
291 // if we don't know its length we have no choice but to assume that it is
292 // NUL-terminated (notice that it can still be NUL-terminated even if
293 // explicit length is given but it doesn't change our return value)
294 const bool isNulTerminated = srcLen == wxNO_LEN;
295
296 // make a copy of the input string unless it is already properly
297 // NUL-terminated
298 wxWCharBuffer bufTmp;
299 if ( isNulTerminated )
300 {
301 srcLen = wxWcslen(src) + 1;
302 }
303 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
304 {
305 // make a copy in order to properly NUL-terminate the string
306 bufTmp = wxWCharBuffer(srcLen);
307 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
308 src = bufTmp;
309 }
310
311 const size_t lenNul = GetMBNulLen();
312 for ( const wchar_t * const srcEnd = src + srcLen;
313 src < srcEnd;
314 src++ /* skip L'\0' too */ )
315 {
316 // try to convert the current chunk
317 size_t lenChunk = WC2MB(NULL, src, 0);
318 if ( lenChunk == wxCONV_FAILED )
319 return wxCONV_FAILED;
320
321 dstWritten += lenChunk;
322
323 const wchar_t * const
324 chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);
325
326 // our return value accounts for the trailing NUL(s), unlike that of
327 // WC2MB(), however don't do it for the last NUL we artificially added
328 // ourselves above
329 if ( chunkEnd < srcEnd )
330 dstWritten += lenNul;
331
332 if ( dst )
333 {
334 if ( dstWritten > dstLen )
335 return wxCONV_FAILED;
336
337 // if we know that there is enough space in the destination buffer
338 // (because we accounted for lenNul in dstWritten above), we can
339 // convert directly in place -- but otherwise we need another
340 // temporary buffer to ensure that we don't overwrite the output
341 wxCharBuffer dstBuf;
342 char *dstTmp;
343 if ( chunkEnd == srcEnd )
344 {
345 dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
346 dstTmp = dstBuf.data();
347 }
348 else
349 {
350 dstTmp = dst;
351 }
352
353 if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
354 return wxCONV_FAILED;
355
356 if ( dstTmp != dst )
357 {
358 // copy everything up to but excluding the terminating NUL(s)
359 // into the real output buffer
360 memcpy(dst, dstTmp, lenChunk);
361
362 // micro-optimization: if dstTmp != dst it means that chunkEnd
363 // == srcEnd and so we're done, no need to update anything below
364 break;
365 }
366
367 dst += lenChunk;
368 if ( chunkEnd < srcEnd )
369 dst += lenNul;
370 }
371
372 src = chunkEnd;
373 }
374
375 return dstWritten;
376 }
377
MB2WC(wchar_t * outBuff,const char * inBuff,size_t outLen) const378 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
379 {
380 size_t rc = ToWChar(outBuff, outLen, inBuff);
381 if ( rc != wxCONV_FAILED )
382 {
383 // ToWChar() returns the buffer length, i.e. including the trailing
384 // NUL, while this method doesn't take it into account
385 rc--;
386 }
387
388 return rc;
389 }
390
WC2MB(char * outBuff,const wchar_t * inBuff,size_t outLen) const391 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
392 {
393 size_t rc = FromWChar(outBuff, outLen, inBuff);
394 if ( rc != wxCONV_FAILED )
395 {
396 rc -= GetMBNulLen();
397 }
398
399 return rc;
400 }
401
~wxMBConv()402 wxMBConv::~wxMBConv()
403 {
404 // nothing to do here (necessary for Darwin linking probably)
405 }
406
cMB2WC(const char * psz) const407 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
408 {
409 if ( psz )
410 {
411 // calculate the length of the buffer needed first
412 const size_t nLen = ToWChar(NULL, 0, psz);
413 if ( nLen != wxCONV_FAILED )
414 {
415 // now do the actual conversion
416 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
417
418 // +1 for the trailing NULL
419 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
420 return buf;
421 }
422 }
423
424 return wxWCharBuffer();
425 }
426
cWC2MB(const wchar_t * pwz) const427 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
428 {
429 if ( pwz )
430 {
431 const size_t nLen = FromWChar(NULL, 0, pwz);
432 if ( nLen != wxCONV_FAILED )
433 {
434 wxCharBuffer buf(nLen - 1);
435 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
436 return buf;
437 }
438 }
439
440 return wxCharBuffer();
441 }
442
443 const wxWCharBuffer
cMB2WC(const char * inBuff,size_t inLen,size_t * outLen) const444 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
445 {
446 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
447 if ( dstLen != wxCONV_FAILED )
448 {
449 // notice that we allocate space for dstLen+1 wide characters here
450 // because we want the buffer to always be NUL-terminated, even if the
451 // input isn't (as otherwise the caller has no way to know its length)
452 wxWCharBuffer wbuf(dstLen);
453 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
454 {
455 if ( outLen )
456 {
457 *outLen = dstLen;
458
459 // we also need to handle NUL-terminated input strings
460 // specially: for them the output is the length of the string
461 // excluding the trailing NUL, however if we're asked to
462 // convert a specific number of characters we return the length
463 // of the resulting output even if it's NUL-terminated
464 if ( inLen == wxNO_LEN )
465 (*outLen)--;
466 }
467
468 return wbuf;
469 }
470 }
471
472 if ( outLen )
473 *outLen = 0;
474
475 return wxWCharBuffer();
476 }
477
478 const wxCharBuffer
cWC2MB(const wchar_t * inBuff,size_t inLen,size_t * outLen) const479 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
480 {
481 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
482 if ( dstLen != wxCONV_FAILED )
483 {
484 const size_t nulLen = GetMBNulLen();
485
486 // as above, ensure that the buffer is always NUL-terminated, even if
487 // the input is not
488 wxCharBuffer buf(dstLen + nulLen - 1);
489 memset(buf.data() + dstLen, 0, nulLen);
490 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
491 {
492 if ( outLen )
493 {
494 *outLen = dstLen;
495
496 if ( inLen == wxNO_LEN )
497 {
498 // in this case both input and output are NUL-terminated
499 // and we're not supposed to count NUL
500 *outLen -= nulLen;
501 }
502 }
503
504 return buf;
505 }
506 }
507
508 if ( outLen )
509 *outLen = 0;
510
511 return wxCharBuffer();
512 }
513
cMB2WC(const wxScopedCharBuffer & buf) const514 const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
515 {
516 const size_t srcLen = buf.length();
517 if ( srcLen )
518 {
519 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
520 if ( dstLen != wxCONV_FAILED )
521 {
522 wxWCharBuffer wbuf(dstLen);
523 wbuf.data()[dstLen] = L'\0';
524 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
525 return wbuf;
526 }
527 }
528
529 return wxScopedWCharBuffer::CreateNonOwned(L"", 0);
530 }
531
cWC2MB(const wxScopedWCharBuffer & wbuf) const532 const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
533 {
534 const size_t srcLen = wbuf.length();
535 if ( srcLen )
536 {
537 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
538 if ( dstLen != wxCONV_FAILED )
539 {
540 wxCharBuffer buf(dstLen);
541 buf.data()[dstLen] = '\0';
542 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
543 return buf;
544 }
545 }
546
547 return wxScopedCharBuffer::CreateNonOwned("", 0);
548 }
549
550 // ----------------------------------------------------------------------------
551 // wxMBConvLibc
552 // ----------------------------------------------------------------------------
553
MB2WC(wchar_t * buf,const char * psz,size_t n) const554 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
555 {
556 return wxMB2WC(buf, psz, n);
557 }
558
WC2MB(char * buf,const wchar_t * psz,size_t n) const559 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
560 {
561 return wxWC2MB(buf, psz, n);
562 }
563
564 // ----------------------------------------------------------------------------
565 // wxConvBrokenFileNames
566 // ----------------------------------------------------------------------------
567
568 #ifdef __UNIX__
569
wxConvBrokenFileNames(const wxString & charset)570 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
571 {
572 if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
573 wxStricmp(charset, wxT("UTF8")) == 0 )
574 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
575 else
576 m_conv = new wxCSConv(charset);
577 }
578
579 #endif // __UNIX__
580
581 // ----------------------------------------------------------------------------
582 // UTF-7
583 // ----------------------------------------------------------------------------
584
585 // Implementation (C) 2004 Fredrik Roubert
586 //
587 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
588
//
// BASE64 decoding table: maps each byte value to the 6 bit value it encodes
// ('A'-'Z' => 0x00-0x19, 'a'-'z' => 0x1a-0x33, '0'-'9' => 0x34-0x3d,
// '+' => 0x3e, '/' => 0x3f) or to 0xff if the byte is not a BASE64 character
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20..0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30..0x3f: digits
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40..0x4f: 'A'..'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50..0x5f: 'P'..'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60..0x6f: 'a'..'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70..0x7f: 'p'..'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80..0xff: never valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
627
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const628 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
629 const char *src, size_t srcLen) const
630 {
631 DecoderState stateOrig,
632 *statePtr;
633 if ( srcLen == wxNO_LEN )
634 {
635 // convert the entire string, up to and including the trailing NUL
636 srcLen = strlen(src) + 1;
637
638 // when working on the entire strings we don't update nor use the shift
639 // state from the previous call
640 statePtr = &stateOrig;
641 }
642 else // when working with partial strings we do use the shift state
643 {
644 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
645
646 // also save the old state to be able to rollback to it on error
647 stateOrig = m_stateDecoder;
648 }
649
650 // but to simplify the code below we use this variable in both cases
651 DecoderState& state = *statePtr;
652
653
654 // number of characters [which would have been] written to dst [if it were
655 // not NULL]
656 size_t len = 0;
657
658 const char * const srcEnd = src + srcLen;
659
660 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
661 {
662 const unsigned char cc = *src++;
663
664 if ( state.IsShifted() )
665 {
666 const unsigned char dc = utf7unb64[cc];
667 if ( dc == 0xff )
668 {
669 // end of encoded part, check that nothing was left: there can
670 // be up to 4 bits of 0 padding but nothing else (we also need
671 // to check isLSB as we count bits modulo 8 while a valid UTF-7
672 // encoded sequence must contain an integral number of UTF-16
673 // characters)
674 if ( state.isLSB || state.bit > 4 ||
675 (state.accum & ((1 << state.bit) - 1)) )
676 {
677 if ( !len )
678 state = stateOrig;
679
680 return wxCONV_FAILED;
681 }
682
683 state.ToDirect();
684
685 // re-parse this character normally below unless it's '-' which
686 // is consumed by the decoder
687 if ( cc == '-' )
688 continue;
689 }
690 else // valid encoded character
691 {
692 // mini base64 decoder: each character is 6 bits
693 state.bit += 6;
694 state.accum <<= 6;
695 state.accum += dc;
696
697 if ( state.bit >= 8 )
698 {
699 // got the full byte, consume it
700 state.bit -= 8;
701 unsigned char b = (state.accum >> state.bit) & 0x00ff;
702
703 if ( state.isLSB )
704 {
705 // we've got the full word, output it
706 if ( dst )
707 *dst++ = (state.msb << 8) | b;
708 len++;
709 state.isLSB = false;
710 }
711 else // MSB
712 {
713 // just store it while we wait for LSB
714 state.msb = b;
715 state.isLSB = true;
716 }
717 }
718 }
719 }
720
721 if ( state.IsDirect() )
722 {
723 // start of an encoded segment?
724 if ( cc == '+' )
725 {
726 // Can't end with a plus sign.
727 if ( src == srcEnd )
728 return wxCONV_FAILED;
729
730 if ( *src == '-' )
731 {
732 // just the encoded plus sign, don't switch to shifted mode
733 if ( dst )
734 *dst++ = '+';
735 len++;
736 src++;
737 }
738 else if ( utf7unb64[(unsigned)*src] == 0xff )
739 {
740 // empty encoded chunks are not allowed
741 if ( !len )
742 state = stateOrig;
743
744 return wxCONV_FAILED;
745 }
746 else // base-64 encoded chunk follows
747 {
748 state.ToShifted();
749 }
750 }
751 else // not '+'
752 {
753 // only printable 7 bit ASCII characters (with the exception of
754 // NUL, TAB, CR and LF) can be used directly
755 if ( cc >= 0x7f || (cc < ' ' &&
756 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
757 return wxCONV_FAILED;
758
759 if ( dst )
760 *dst++ = cc;
761 len++;
762 }
763 }
764 }
765
766 if ( !len )
767 {
768 // as we didn't read any characters we should be called with the same
769 // data (followed by some more new data) again later so don't save our
770 // state
771 state = stateOrig;
772
773 return wxCONV_FAILED;
774 }
775
776 return len;
777 }
778
//
// BASE64 encoding table: the character used to encode each 6 bit value
//
static const unsigned char utf7enb64[] =
{
    // 0..15
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    // 16..31
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    // 32..47
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    // 48..63
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
793
//
// UTF-7 encoding table, indexed by the (7 bit) character code
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// Returns true if the character can be written as is in UTF-7 output, i.e.
// if it is one of the characters of the set D in the table above.
//
// The character is converted to an unsigned value before being compared with
// the table size: wchar_t may be a signed type and a negative value would
// otherwise pass the range check and be used as a negative table index.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    const unsigned long code = static_cast<unsigned long>(wc);

    return code < 0x80 && utf7encode[code] < 1;
}
818
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const819 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
820 const wchar_t *src, size_t srcLen) const
821 {
822 EncoderState stateOrig,
823 *statePtr;
824 if ( srcLen == wxNO_LEN )
825 {
826 // we don't apply the stored state when operating on entire strings at
827 // once
828 statePtr = &stateOrig;
829
830 srcLen = wxWcslen(src) + 1;
831 }
832 else // do use the mode we left the output in previously
833 {
834 stateOrig = m_stateEncoder;
835 statePtr = const_cast<EncoderState *>(&m_stateEncoder);
836 }
837
838 EncoderState& state = *statePtr;
839
840
841 size_t len = 0;
842
843 const wchar_t * const srcEnd = src + srcLen;
844 while ( src < srcEnd && (!dst || len < dstLen) )
845 {
846 wchar_t cc = *src++;
847 if ( wxIsUTF7Direct(cc) )
848 {
849 if ( state.IsShifted() )
850 {
851 // pad with zeros the last encoded block if necessary
852 if ( state.bit )
853 {
854 if ( dst )
855 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
856 len++;
857 }
858
859 state.ToDirect();
860
861 if ( dst )
862 *dst++ = '-';
863 len++;
864 }
865
866 if ( dst )
867 *dst++ = (char)cc;
868 len++;
869 }
870 else if ( cc == '+' && state.IsDirect() )
871 {
872 if ( dst )
873 {
874 *dst++ = '+';
875 *dst++ = '-';
876 }
877
878 len += 2;
879 }
880 #ifndef WC_UTF16
881 else if (((wxUint32)cc) > 0xffff)
882 {
883 // no surrogate pair generation (yet?)
884 return wxCONV_FAILED;
885 }
886 #endif
887 else
888 {
889 if ( state.IsDirect() )
890 {
891 state.ToShifted();
892
893 if ( dst )
894 *dst++ = '+';
895 len++;
896 }
897
898 // BASE64 encode string
899 for ( ;; )
900 {
901 for ( unsigned lsb = 0; lsb < 2; lsb++ )
902 {
903 state.accum <<= 8;
904 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
905
906 for (state.bit += 8; state.bit >= 6; )
907 {
908 state.bit -= 6;
909 if ( dst )
910 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
911 len++;
912 }
913 }
914
915 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
916 break;
917
918 src++;
919 }
920 }
921 }
922
923 // we need to restore the original encoder state if we were called just to
924 // calculate the amount of space needed as we will presumably be called
925 // again to really convert the data now
926 if ( !dst )
927 state = stateOrig;
928
929 return len;
930 }
931
932 // ----------------------------------------------------------------------------
933 // UTF-8
934 // ----------------------------------------------------------------------------
935
936 static const wxUint32 utf8_max[]=
937 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
938
939 // boundaries of the private use area we use to (temporarily) remap invalid
940 // characters invalid in a UTF-8 encoded string
941 const wxUint32 wxUnicodePUA = 0x100000;
942 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
943
// Length of a UTF-8 sequence as a function of its first byte, 0 meaning that
// the byte can't start a sequence:
//  - 00..7F: single byte sequences (ASCII)
//  - 80..BF: invalid as the first byte
//  - C0, C1: invalid
//  - C2..DF: two byte sequences
//  - E0..EF: three byte sequences
//  - F0..F4: four byte sequences
//  - F5..FF: invalid (5 or 6 byte sequences and sequences for code points
//            above U+10FFFF, as restricted by RFC 3629)
const unsigned char tableUtf8Lengths[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C0..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F0..FF
};
978
979 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const980 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
981 const char *src, size_t srcLen) const
982 {
983 wchar_t *out = dstLen ? dst : NULL;
984 size_t written = 0;
985
986 if ( srcLen == wxNO_LEN )
987 srcLen = strlen(src) + 1;
988
989 for ( const char *p = src; ; p++ )
990 {
991 if ( (srcLen == wxNO_LEN ? !*p : !srcLen) )
992 {
993 // all done successfully, just add the trailing NULL if we are not
994 // using explicit length
995 if ( srcLen == wxNO_LEN )
996 {
997 if ( out )
998 {
999 if ( !dstLen )
1000 break;
1001
1002 *out = L'\0';
1003 }
1004
1005 written++;
1006 }
1007
1008 return written;
1009 }
1010
1011 if ( out && !dstLen-- )
1012 break;
1013
1014 wxUint32 code;
1015 unsigned char c = *p;
1016
1017 if ( c < 0x80 )
1018 {
1019 if ( srcLen == 0 ) // the test works for wxNO_LEN too
1020 break;
1021
1022 if ( srcLen != wxNO_LEN )
1023 srcLen--;
1024
1025 code = c;
1026 }
1027 else
1028 {
1029 unsigned len = tableUtf8Lengths[c];
1030 if ( !len )
1031 break;
1032
1033 if ( srcLen < len ) // the test works for wxNO_LEN too
1034 break;
1035
1036 if ( srcLen != wxNO_LEN )
1037 srcLen -= len;
1038
1039 // Char. number range | UTF-8 octet sequence
1040 // (hexadecimal) | (binary)
1041 // ----------------------+----------------------------------------
1042 // 0000 0000 - 0000 007F | 0xxxxxxx
1043 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1044 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1045 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1046 //
1047 // Code point value is stored in bits marked with 'x',
1048 // lowest-order bit of the value on the right side in the diagram
1049 // above. (from RFC 3629)
1050
1051 // mask to extract lead byte's value ('x' bits above), by sequence
1052 // length:
1053 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1054
1055 len--; // it's more convenient to work with 0-based length here
1056
1057 code = c & leadValueMask[len];
1058
1059 // all remaining bytes, if any, are handled in the same way
1060 // regardless of sequence's length:
1061 for ( ; len; --len )
1062 {
1063 c = *++p;
1064 if ( (c & 0xC0) != 0x80 )
1065 return wxCONV_FAILED;
1066
1067 code <<= 6;
1068 code |= c & 0x3F;
1069 }
1070 }
1071
1072 #ifdef WC_UTF16
1073 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1074 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1075 {
1076 if ( out )
1077 out++;
1078 written++;
1079 }
1080 #else // !WC_UTF16
1081 if ( out )
1082 *out = code;
1083 #endif // WC_UTF16/!WC_UTF16
1084
1085 if ( out )
1086 out++;
1087
1088 written++;
1089 }
1090
1091 return wxCONV_FAILED;
1092 }
1093
1094 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1095 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1096 const wchar_t *src, size_t srcLen) const
1097 {
1098 char *out = dstLen ? dst : NULL;
1099 size_t written = 0;
1100
1101 const wchar_t* const end = srcLen == wxNO_LEN ? NULL : src + srcLen;
1102 for ( const wchar_t *wp = src; ; )
1103 {
1104 if ( end ? wp == end : !*wp )
1105 {
1106 // all done successfully, just add the trailing NULL if we are not
1107 // using explicit length
1108 if ( srcLen == wxNO_LEN )
1109 {
1110 if ( out )
1111 {
1112 if ( !dstLen )
1113 break;
1114
1115 *out = '\0';
1116 }
1117
1118 written++;
1119 }
1120
1121 return written;
1122 }
1123
1124 wxUint32 code;
1125 #ifdef WC_UTF16
1126 code = wxDecodeSurrogate(&wp, end);
1127 if ( !wp )
1128 return wxCONV_FAILED;
1129 #else // wchar_t is UTF-32
1130 code = *wp++ & 0x7fffffff;
1131 #endif
1132
1133 unsigned len;
1134 if ( code <= 0x7F )
1135 {
1136 len = 1;
1137 if ( out )
1138 {
1139 if ( dstLen < len )
1140 break;
1141
1142 out[0] = (char)code;
1143 }
1144 }
1145 else if ( code <= 0x07FF )
1146 {
1147 len = 2;
1148 if ( out )
1149 {
1150 if ( dstLen < len )
1151 break;
1152
1153 // NB: this line takes 6 least significant bits, encodes them as
1154 // 10xxxxxx and discards them so that the next byte can be encoded:
1155 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1156 out[0] = 0xC0 | code;
1157 }
1158 }
1159 else if ( code < 0xFFFF )
1160 {
1161 len = 3;
1162 if ( out )
1163 {
1164 if ( dstLen < len )
1165 break;
1166
1167 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1168 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1169 out[0] = 0xE0 | code;
1170 }
1171 }
1172 else if ( code <= 0x10FFFF )
1173 {
1174 len = 4;
1175 if ( out )
1176 {
1177 if ( dstLen < len )
1178 break;
1179
1180 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1181 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1182 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1183 out[0] = 0xF0 | code;
1184 }
1185 }
1186 else
1187 {
1188 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1189 break;
1190 }
1191
1192 if ( out )
1193 {
1194 out += len;
1195 dstLen -= len;
1196 }
1197
1198 written += len;
1199 }
1200
1201 // we only get here if an error occurs during decoding
1202 return wxCONV_FAILED;
1203 }
1204
ToWChar(wchar_t * buf,size_t n,const char * psz,size_t srcLen) const1205 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1206 const char *psz, size_t srcLen) const
1207 {
1208 if ( m_options == MAP_INVALID_UTF8_NOT )
1209 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1210
1211 size_t len = 0;
1212
1213 // The length can be either given explicitly or computed implicitly for the
1214 // NUL-terminated strings.
1215 const bool isNulTerminated = srcLen == wxNO_LEN;
1216 while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
1217 {
1218 const char *opsz = psz;
1219 bool invalid = false;
1220 unsigned char cc = *psz++, fc = cc;
1221 unsigned cnt;
1222 for (cnt = 0; fc & 0x80; cnt++)
1223 fc <<= 1;
1224
1225 if (!cnt)
1226 {
1227 // plain ASCII char
1228 if (buf)
1229 *buf++ = cc;
1230 len++;
1231
1232 // escape the escape character for octal escapes
1233 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1234 && cc == '\\' && (!buf || len < n))
1235 {
1236 if (buf)
1237 *buf++ = cc;
1238 len++;
1239 }
1240 }
1241 else
1242 {
1243 cnt--;
1244 if (!cnt)
1245 {
1246 // invalid UTF-8 sequence
1247 invalid = true;
1248 }
1249 else
1250 {
1251 unsigned ocnt = cnt - 1;
1252 wxUint32 res = cc & (0x3f >> cnt);
1253 while (cnt--)
1254 {
1255 if (!isNulTerminated && !srcLen)
1256 {
1257 // invalid UTF-8 sequence ending before the end of code
1258 // point.
1259 invalid = true;
1260 break;
1261 }
1262
1263 cc = *psz;
1264 if ((cc & 0xC0) != 0x80)
1265 {
1266 // invalid UTF-8 sequence
1267 invalid = true;
1268 break;
1269 }
1270
1271 psz++;
1272 if (!isNulTerminated)
1273 srcLen--;
1274 res = (res << 6) | (cc & 0x3f);
1275 }
1276
1277 if (invalid || res <= utf8_max[ocnt])
1278 {
1279 // illegal UTF-8 encoding
1280 invalid = true;
1281 }
1282 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1283 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1284 {
1285 // if one of our PUA characters turns up externally
1286 // it must also be treated as an illegal sequence
1287 // (a bit like you have to escape an escape character)
1288 invalid = true;
1289 }
1290 else
1291 {
1292 #ifdef WC_UTF16
1293 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1294 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1295 if (pa == wxCONV_FAILED)
1296 {
1297 invalid = true;
1298 }
1299 else
1300 {
1301 if (buf)
1302 buf += pa;
1303 len += pa;
1304 }
1305 #else // !WC_UTF16
1306 if (buf)
1307 *buf++ = (wchar_t)res;
1308 len++;
1309 #endif // WC_UTF16/!WC_UTF16
1310 }
1311 }
1312
1313 if (invalid)
1314 {
1315 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1316 {
1317 while (opsz < psz && (!buf || len < n))
1318 {
1319 #ifdef WC_UTF16
1320 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1321 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1322 wxASSERT(pa != wxCONV_FAILED);
1323 if (buf)
1324 buf += pa;
1325 opsz++;
1326 len += pa;
1327 #else
1328 if (buf)
1329 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1330 opsz++;
1331 len++;
1332 #endif
1333 }
1334 }
1335 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1336 {
1337 while (opsz < psz && (!buf || len < n))
1338 {
1339 if ( buf && len + 3 < n )
1340 {
1341 unsigned char on = *opsz;
1342 *buf++ = L'\\';
1343 *buf++ = (wchar_t)( L'0' + on / 0100 );
1344 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1345 *buf++ = (wchar_t)( L'0' + on % 010 );
1346 }
1347
1348 opsz++;
1349 len += 4;
1350 }
1351 }
1352 else // MAP_INVALID_UTF8_NOT
1353 {
1354 return wxCONV_FAILED;
1355 }
1356 }
1357 }
1358 }
1359
1360 if ( isNulTerminated )
1361 {
1362 // Add the trailing NUL in this case if we have a large enough buffer.
1363 if ( buf && (len < n) )
1364 *buf = 0;
1365
1366 // And count it in any case.
1367 len++;
1368 }
1369
1370 return len;
1371 }
1372
// Return true if the given wide character is an octal digit, i.e. is in
// L'0'..L'7' range.
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
1377
FromWChar(char * buf,size_t n,const wchar_t * psz,size_t srcLen) const1378 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1379 const wchar_t *psz, size_t srcLen) const
1380 {
1381 if ( m_options == MAP_INVALID_UTF8_NOT )
1382 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1383
1384 size_t len = 0;
1385
1386 // The length can be either given explicitly or computed implicitly for the
1387 // NUL-terminated strings.
1388 const wchar_t* const end = srcLen == wxNO_LEN ? NULL : psz + srcLen;
1389 while ((end ? psz < end : *psz) && ((!buf) || (len < n)))
1390 {
1391 wxUint32 cc;
1392
1393 #ifdef WC_UTF16
1394 cc = wxDecodeSurrogate(&psz, end);
1395 if ( !psz )
1396 return wxCONV_FAILED;
1397 #else
1398 cc = (*psz++) & 0x7fffffff;
1399 #endif
1400
1401 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1402 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1403 {
1404 if (buf)
1405 *buf++ = (char)(cc - wxUnicodePUA);
1406 len++;
1407 }
1408 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1409 && cc == L'\\' && psz[0] == L'\\' )
1410 {
1411 if (buf)
1412 *buf++ = (char)cc;
1413 psz++;
1414 len++;
1415 }
1416 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1417 cc == L'\\' &&
1418 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1419 {
1420 if (buf)
1421 {
1422 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1423 (psz[1] - L'0') * 010 +
1424 (psz[2] - L'0'));
1425 }
1426
1427 psz += 3;
1428 len++;
1429 }
1430 else
1431 {
1432 unsigned cnt;
1433 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1434 {
1435 }
1436
1437 if (!cnt)
1438 {
1439 // plain ASCII char
1440 if (buf)
1441 *buf++ = (char) cc;
1442 len++;
1443 }
1444 else
1445 {
1446 len += cnt + 1;
1447 if (buf)
1448 {
1449 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1450 while (cnt--)
1451 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1452 }
1453 }
1454 }
1455 }
1456
1457 if ( !end )
1458 {
1459 // Add the trailing NUL in this case if we have a large enough buffer.
1460 if ( buf && (len < n) )
1461 *buf = 0;
1462
1463 // And count it in any case.
1464 len++;
1465 }
1466
1467 return len;
1468 }
1469
1470 // ============================================================================
1471 // UTF-16
1472 // ============================================================================
1473
// "straight" is the converter for UTF-16 in the byte order of this platform
// and "swap" is the one for the opposite byte order
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // big endian
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif
1481
1482 /* static */
GetLength(const char * src,size_t srcLen)1483 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1484 {
1485 if ( srcLen == wxNO_LEN )
1486 {
1487 // count the number of bytes in input, including the trailing NULs
1488 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1489 for ( srcLen = 1; *inBuff++; srcLen++ )
1490 ;
1491
1492 srcLen *= BYTES_PER_CHAR;
1493 }
1494 else // we already have the length
1495 {
1496 // we can only convert an entire number of UTF-16 characters
1497 if ( srcLen % BYTES_PER_CHAR )
1498 return wxCONV_FAILED;
1499 }
1500
1501 return srcLen;
1502 }
1503
1504 // case when in-memory representation is UTF-16 too
1505 #ifdef WC_UTF16
1506
1507 // ----------------------------------------------------------------------------
1508 // conversions without endianness change
1509 // ----------------------------------------------------------------------------
1510
1511 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1512 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1513 const char *src, size_t srcLen) const
1514 {
1515 // set up the scene for using memcpy() (which is presumably more efficient
1516 // than copying the bytes one by one)
1517 srcLen = GetLength(src, srcLen);
1518 if ( srcLen == wxNO_LEN )
1519 return wxCONV_FAILED;
1520
1521 const size_t inLen = srcLen / BYTES_PER_CHAR;
1522 if ( dst )
1523 {
1524 if ( dstLen < inLen )
1525 return wxCONV_FAILED;
1526
1527 memcpy(dst, src, srcLen);
1528 }
1529
1530 return inLen;
1531 }
1532
1533 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1534 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1535 const wchar_t *src, size_t srcLen) const
1536 {
1537 if ( srcLen == wxNO_LEN )
1538 srcLen = wxWcslen(src) + 1;
1539
1540 srcLen *= BYTES_PER_CHAR;
1541
1542 if ( dst )
1543 {
1544 if ( dstLen < srcLen )
1545 return wxCONV_FAILED;
1546
1547 memcpy(dst, src, srcLen);
1548 }
1549
1550 return srcLen;
1551 }
1552
1553 // ----------------------------------------------------------------------------
1554 // endian-reversing conversions
1555 // ----------------------------------------------------------------------------
1556
1557 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1558 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1559 const char *src, size_t srcLen) const
1560 {
1561 srcLen = GetLength(src, srcLen);
1562 if ( srcLen == wxNO_LEN )
1563 return wxCONV_FAILED;
1564
1565 srcLen /= BYTES_PER_CHAR;
1566
1567 if ( dst )
1568 {
1569 if ( dstLen < srcLen )
1570 return wxCONV_FAILED;
1571
1572 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1573 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1574 {
1575 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1576 }
1577 }
1578
1579 return srcLen;
1580 }
1581
1582 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1583 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1584 const wchar_t *src, size_t srcLen) const
1585 {
1586 if ( srcLen == wxNO_LEN )
1587 srcLen = wxWcslen(src) + 1;
1588
1589 srcLen *= BYTES_PER_CHAR;
1590
1591 if ( dst )
1592 {
1593 if ( dstLen < srcLen )
1594 return wxCONV_FAILED;
1595
1596 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1597 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1598 {
1599 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1600 }
1601 }
1602
1603 return srcLen;
1604 }
1605
1606 #else // !WC_UTF16: wchar_t is UTF-32
1607
1608 // ----------------------------------------------------------------------------
1609 // conversions without endianness change
1610 // ----------------------------------------------------------------------------
1611
1612 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1613 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1614 const char *src, size_t srcLen) const
1615 {
1616 srcLen = GetLength(src, srcLen);
1617 if ( srcLen == wxNO_LEN )
1618 return wxCONV_FAILED;
1619
1620 const size_t inLen = srcLen / BYTES_PER_CHAR;
1621 size_t outLen = 0;
1622 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1623 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1624 {
1625 const wxUint32 ch = wxDecodeSurrogate(&inBuff, inEnd);
1626 if ( !inBuff )
1627 return wxCONV_FAILED;
1628
1629 outLen++;
1630
1631 if ( dst )
1632 {
1633 if ( outLen > dstLen )
1634 return wxCONV_FAILED;
1635
1636 *dst++ = ch;
1637 }
1638 }
1639
1640
1641 return outLen;
1642 }
1643
1644 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1645 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1646 const wchar_t *src, size_t srcLen) const
1647 {
1648 if ( srcLen == wxNO_LEN )
1649 srcLen = wxWcslen(src) + 1;
1650
1651 size_t outLen = 0;
1652 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1653 for ( size_t n = 0; n < srcLen; n++ )
1654 {
1655 wxUint16 cc[2] = { 0 };
1656 const size_t numChars = encode_utf16(*src++, cc);
1657 if ( numChars == wxCONV_FAILED )
1658 return wxCONV_FAILED;
1659
1660 outLen += numChars * BYTES_PER_CHAR;
1661 if ( outBuff )
1662 {
1663 if ( outLen > dstLen )
1664 return wxCONV_FAILED;
1665
1666 *outBuff++ = cc[0];
1667 if ( numChars == 2 )
1668 {
1669 // second character of a surrogate
1670 *outBuff++ = cc[1];
1671 }
1672 }
1673 }
1674
1675 return outLen;
1676 }
1677
1678 // ----------------------------------------------------------------------------
1679 // endian-reversing conversions
1680 // ----------------------------------------------------------------------------
1681
1682 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1683 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1684 const char *src, size_t srcLen) const
1685 {
1686 srcLen = GetLength(src, srcLen);
1687 if ( srcLen == wxNO_LEN )
1688 return wxCONV_FAILED;
1689
1690 const size_t inLen = srcLen / BYTES_PER_CHAR;
1691 size_t outLen = 0;
1692 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1693 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1694 {
1695 wxUint16 tmp[2];
1696 const wxUint16* tmpEnd = tmp;
1697
1698 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1699 tmpEnd++;
1700
1701 if ( inBuff + 1 < inEnd )
1702 {
1703 // Normal case, we have a next character to decode.
1704 tmp[1] = wxUINT16_SWAP_ALWAYS(inBuff[1]);
1705 tmpEnd++;
1706 }
1707
1708 const wxUint16* p = tmp;
1709 const wxUint32 ch = wxDecodeSurrogate(&p, tmpEnd);
1710 if ( !p )
1711 return wxCONV_FAILED;
1712
1713 // Move the real pointer by the same amount as "p" was updated by.
1714 inBuff += p - tmp;
1715
1716 outLen++;
1717
1718 if ( dst )
1719 {
1720 if ( outLen > dstLen )
1721 return wxCONV_FAILED;
1722
1723 *dst++ = ch;
1724 }
1725 }
1726
1727
1728 return outLen;
1729 }
1730
1731 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1732 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1733 const wchar_t *src, size_t srcLen) const
1734 {
1735 if ( srcLen == wxNO_LEN )
1736 srcLen = wxWcslen(src) + 1;
1737
1738 size_t outLen = 0;
1739 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1740 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1741 {
1742 wxUint16 cc[2] = { 0 };
1743 const size_t numChars = encode_utf16(*src, cc);
1744 if ( numChars == wxCONV_FAILED )
1745 return wxCONV_FAILED;
1746
1747 outLen += numChars * BYTES_PER_CHAR;
1748 if ( outBuff )
1749 {
1750 if ( outLen > dstLen )
1751 return wxCONV_FAILED;
1752
1753 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1754 if ( numChars == 2 )
1755 {
1756 // second character of a surrogate
1757 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1758 }
1759 }
1760 }
1761
1762 return outLen;
1763 }
1764
1765 #endif // WC_UTF16/!WC_UTF16
1766
1767
1768 // ============================================================================
1769 // UTF-32
1770 // ============================================================================
1771
// "straight" is the converter for UTF-32 in the byte order of this platform
// and "swap" is the one for the opposite byte order
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight wxMBConvUTF32LE
    #define wxMBConvUTF32swap     wxMBConvUTF32BE
#else // big endian
    #define wxMBConvUTF32straight wxMBConvUTF32BE
    #define wxMBConvUTF32swap     wxMBConvUTF32LE
#endif
1779
1780
1781 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1782 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1783
1784 /* static */
GetLength(const char * src,size_t srcLen)1785 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1786 {
1787 if ( srcLen == wxNO_LEN )
1788 {
1789 // count the number of bytes in input, including the trailing NULs
1790 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1791 for ( srcLen = 1; *inBuff++; srcLen++ )
1792 ;
1793
1794 srcLen *= BYTES_PER_CHAR;
1795 }
1796 else // we already have the length
1797 {
1798 // we can only convert an entire number of UTF-32 characters
1799 if ( srcLen % BYTES_PER_CHAR )
1800 return wxCONV_FAILED;
1801 }
1802
1803 return srcLen;
1804 }
1805
1806 // case when in-memory representation is UTF-16
1807 #ifdef WC_UTF16
1808
1809 // ----------------------------------------------------------------------------
1810 // conversions without endianness change
1811 // ----------------------------------------------------------------------------
1812
1813 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1814 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1815 const char *src, size_t srcLen) const
1816 {
1817 srcLen = GetLength(src, srcLen);
1818 if ( srcLen == wxNO_LEN )
1819 return wxCONV_FAILED;
1820
1821 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1822 const size_t inLen = srcLen / BYTES_PER_CHAR;
1823 size_t outLen = 0;
1824 for ( size_t n = 0; n < inLen; n++ )
1825 {
1826 wxUint16 cc[2] = { 0 };
1827 const size_t numChars = encode_utf16(*inBuff++, cc);
1828 if ( numChars == wxCONV_FAILED )
1829 return wxCONV_FAILED;
1830
1831 outLen += numChars;
1832 if ( dst )
1833 {
1834 if ( outLen > dstLen )
1835 return wxCONV_FAILED;
1836
1837 *dst++ = cc[0];
1838 if ( numChars == 2 )
1839 {
1840 // second character of a surrogate
1841 *dst++ = cc[1];
1842 }
1843 }
1844 }
1845
1846 return outLen;
1847 }
1848
1849 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1850 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1851 const wchar_t *src, size_t srcLen) const
1852 {
1853 if ( srcLen == wxNO_LEN )
1854 srcLen = wxWcslen(src) + 1;
1855
1856 if ( !dst )
1857 {
1858 // optimization: return maximal space which could be needed for this
1859 // string instead of the exact amount which could be less if there are
1860 // any surrogates in the input
1861 //
1862 // we consider that surrogates are rare enough to make it worthwhile to
1863 // avoid running the loop below at the cost of slightly extra memory
1864 // consumption
1865 return srcLen * BYTES_PER_CHAR;
1866 }
1867
1868 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1869 size_t outLen = 0;
1870 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1871 {
1872 const wxUint32 ch = wxDecodeSurrogate(&src, srcEnd);
1873 if ( !src )
1874 return wxCONV_FAILED;
1875
1876 outLen += BYTES_PER_CHAR;
1877
1878 if ( outLen > dstLen )
1879 return wxCONV_FAILED;
1880
1881 *outBuff++ = ch;
1882 }
1883
1884 return outLen;
1885 }
1886
1887 // ----------------------------------------------------------------------------
1888 // endian-reversing conversions
1889 // ----------------------------------------------------------------------------
1890
1891 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1892 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1893 const char *src, size_t srcLen) const
1894 {
1895 srcLen = GetLength(src, srcLen);
1896 if ( srcLen == wxNO_LEN )
1897 return wxCONV_FAILED;
1898
1899 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1900 const size_t inLen = srcLen / BYTES_PER_CHAR;
1901 size_t outLen = 0;
1902 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1903 {
1904 wxUint16 cc[2] = { 0 };
1905 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1906 if ( numChars == wxCONV_FAILED )
1907 return wxCONV_FAILED;
1908
1909 outLen += numChars;
1910 if ( dst )
1911 {
1912 if ( outLen > dstLen )
1913 return wxCONV_FAILED;
1914
1915 *dst++ = cc[0];
1916 if ( numChars == 2 )
1917 {
1918 // second character of a surrogate
1919 *dst++ = cc[1];
1920 }
1921 }
1922 }
1923
1924 return outLen;
1925 }
1926
1927 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1928 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1929 const wchar_t *src, size_t srcLen) const
1930 {
1931 if ( srcLen == wxNO_LEN )
1932 srcLen = wxWcslen(src) + 1;
1933
1934 if ( !dst )
1935 {
1936 // optimization: return maximal space which could be needed for this
1937 // string instead of the exact amount which could be less if there are
1938 // any surrogates in the input
1939 //
1940 // we consider that surrogates are rare enough to make it worthwhile to
1941 // avoid running the loop below at the cost of slightly extra memory
1942 // consumption
1943 return srcLen*BYTES_PER_CHAR;
1944 }
1945
1946 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1947 size_t outLen = 0;
1948 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1949 {
1950 const wxUint32 ch = wxDecodeSurrogate(&src, srcEnd);
1951 if ( !src )
1952 return wxCONV_FAILED;
1953
1954 outLen += BYTES_PER_CHAR;
1955
1956 if ( outLen > dstLen )
1957 return wxCONV_FAILED;
1958
1959 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1960 }
1961
1962 return outLen;
1963 }
1964
1965 #else // !WC_UTF16: wchar_t is UTF-32
1966
1967 // ----------------------------------------------------------------------------
1968 // conversions without endianness change
1969 // ----------------------------------------------------------------------------
1970
1971 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const1972 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1973 const char *src, size_t srcLen) const
1974 {
1975 // use memcpy() as it should be much faster than hand-written loop
1976 srcLen = GetLength(src, srcLen);
1977 if ( srcLen == wxNO_LEN )
1978 return wxCONV_FAILED;
1979
1980 const size_t inLen = srcLen/BYTES_PER_CHAR;
1981 if ( dst )
1982 {
1983 if ( dstLen < inLen )
1984 return wxCONV_FAILED;
1985
1986 memcpy(dst, src, srcLen);
1987 }
1988
1989 return inLen;
1990 }
1991
1992 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const1993 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1994 const wchar_t *src, size_t srcLen) const
1995 {
1996 if ( srcLen == wxNO_LEN )
1997 srcLen = wxWcslen(src) + 1;
1998
1999 srcLen *= BYTES_PER_CHAR;
2000
2001 if ( dst )
2002 {
2003 if ( dstLen < srcLen )
2004 return wxCONV_FAILED;
2005
2006 memcpy(dst, src, srcLen);
2007 }
2008
2009 return srcLen;
2010 }
2011
2012 // ----------------------------------------------------------------------------
2013 // endian-reversing conversions
2014 // ----------------------------------------------------------------------------
2015
2016 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const2017 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2018 const char *src, size_t srcLen) const
2019 {
2020 srcLen = GetLength(src, srcLen);
2021 if ( srcLen == wxNO_LEN )
2022 return wxCONV_FAILED;
2023
2024 srcLen /= BYTES_PER_CHAR;
2025
2026 if ( dst )
2027 {
2028 if ( dstLen < srcLen )
2029 return wxCONV_FAILED;
2030
2031 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
2032 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
2033 {
2034 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
2035 }
2036 }
2037
2038 return srcLen;
2039 }
2040
2041 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const2042 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2043 const wchar_t *src, size_t srcLen) const
2044 {
2045 if ( srcLen == wxNO_LEN )
2046 srcLen = wxWcslen(src) + 1;
2047
2048 srcLen *= BYTES_PER_CHAR;
2049
2050 if ( dst )
2051 {
2052 if ( dstLen < srcLen )
2053 return wxCONV_FAILED;
2054
2055 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
2056 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
2057 {
2058 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
2059 }
2060 }
2061
2062 return srcLen;
2063 }
2064
2065 #endif // WC_UTF16/!WC_UTF16
2066
2067
2068 // ============================================================================
2069 // The classes doing conversion using the iconv_xxx() functions
2070 // ============================================================================
2071
2072 #ifdef HAVE_ICONV
2073
2074 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2075 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
2076 // (unless there's yet another bug in glibc) the only case when iconv()
2077 // returns with (size_t)-1 (which means error) and says there are 0 bytes
2078 // left in the input buffer -- when _real_ error occurs,
2079 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2080 // iconv() failure.
2081 // [This bug does not appear in glibc 2.2.]
2082 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2083 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2084 (errno != E2BIG || bufLeft != 0))
2085 #else
2086 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2087 #endif
2088
2089 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
2090
2091 #define ICONV_T_INVALID ((iconv_t)-1)
2092
2093 #if SIZEOF_WCHAR_T == 4
2094 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2095 #define WC_ENC wxFONTENCODING_UTF32
2096 #elif SIZEOF_WCHAR_T == 2
2097 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2098 #define WC_ENC wxFONTENCODING_UTF16
2099 #else // sizeof(wchar_t) != 2 nor 4
2100 // does this ever happen?
2101 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2102 #endif
2103
2104 // ----------------------------------------------------------------------------
2105 // wxMBConv_iconv: encapsulates an iconv character set
2106 // ----------------------------------------------------------------------------
2107
2108 class wxMBConv_iconv : public wxMBConv
2109 {
2110 public:
2111 wxMBConv_iconv(const char *name);
2112 virtual ~wxMBConv_iconv();
2113
2114 // implement base class virtual methods
2115 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2116 const char *src, size_t srcLen = wxNO_LEN) const;
2117 virtual size_t FromWChar(char *dst, size_t dstLen,
2118 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
2119 virtual size_t GetMBNulLen() const;
2120
2121 #if wxUSE_UNICODE_UTF8
2122 virtual bool IsUTF8() const;
2123 #endif
2124
Clone() const2125 virtual wxMBConv *Clone() const
2126 {
2127 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
2128 p->m_minMBCharWidth = m_minMBCharWidth;
2129 return p;
2130 }
2131
IsOk() const2132 bool IsOk() const
2133 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2134
2135 protected:
2136 // the iconv handlers used to translate from multibyte
2137 // to wide char and in the other direction
2138 iconv_t m2w,
2139 w2m;
2140
2141 #if wxUSE_THREADS
2142 // guards access to m2w and w2m objects
2143 wxMutex m_iconvMutex;
2144 #endif
2145
2146 private:
2147 // the name (for iconv_open()) of a wide char charset -- if none is
2148 // available on this machine, it will remain NULL
2149 static wxString ms_wcCharsetName;
2150
2151 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2152 // different endian-ness than the native one
2153 static bool ms_wcNeedsSwap;
2154
2155
2156 // name of the encoding handled by this conversion
2157 const char *m_name;
2158
2159 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2160 // initially
2161 size_t m_minMBCharWidth;
2162 };
2163
2164 // make the constructor available for unit testing
new_wxMBConv_iconv(const char * name)2165 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2166 {
2167 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2168 if ( !result->IsOk() )
2169 {
2170 delete result;
2171 return 0;
2172 }
2173
2174 return result;
2175 }
2176
2177 wxString wxMBConv_iconv::ms_wcCharsetName;
2178 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2179
wxMBConv_iconv(const char * name)2180 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2181 : m_name(wxStrdup(name))
2182 {
2183 m_minMBCharWidth = 0;
2184
2185 // check for charset that represents wchar_t:
2186 if ( ms_wcCharsetName.empty() )
2187 {
2188 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
2189
2190 #if wxUSE_FONTMAP
2191 const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2192 #else // !wxUSE_FONTMAP
2193 static const wxChar *const names_static[] =
2194 {
2195 #if SIZEOF_WCHAR_T == 4
2196 wxT("UCS-4"),
2197 #elif SIZEOF_WCHAR_T == 2
2198 wxT("UCS-2"),
2199 #endif
2200 NULL
2201 };
2202 const wxChar *const *names = names_static;
2203 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2204
2205 for ( ; *names && ms_wcCharsetName.empty(); ++names )
2206 {
2207 const wxString nameCS(*names);
2208
2209 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2210 wxString nameXE(nameCS);
2211
2212 #ifdef WORDS_BIGENDIAN
2213 nameXE += wxT("BE");
2214 #else // little endian
2215 nameXE += wxT("LE");
2216 #endif
2217
2218 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2219 nameXE.c_str());
2220
2221 m2w = iconv_open(nameXE.ToAscii(), name);
2222 if ( m2w == ICONV_T_INVALID )
2223 {
2224 // try charset w/o bytesex info (e.g. "UCS4")
2225 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2226 nameCS.c_str());
2227 m2w = iconv_open(nameCS.ToAscii(), name);
2228
2229 // and check for bytesex ourselves:
2230 if ( m2w != ICONV_T_INVALID )
2231 {
2232 char buf[2], *bufPtr;
2233 wchar_t wbuf[2];
2234 size_t insz, outsz;
2235 size_t res;
2236
2237 buf[0] = 'A';
2238 buf[1] = 0;
2239 wbuf[0] = 0;
2240 insz = 2;
2241 outsz = SIZEOF_WCHAR_T * 2;
2242 char* wbufPtr = (char*)wbuf;
2243 bufPtr = buf;
2244
2245 res = iconv(
2246 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2247 &wbufPtr, &outsz);
2248
2249 if (ICONV_FAILED(res, insz))
2250 {
2251 wxLogLastError(wxT("iconv"));
2252 wxLogError(_("Conversion to charset '%s' doesn't work."),
2253 nameCS.c_str());
2254 }
2255 else // ok, can convert to this encoding, remember it
2256 {
2257 ms_wcCharsetName = nameCS;
2258 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2259 }
2260 }
2261 }
2262 else // use charset not requiring byte swapping
2263 {
2264 ms_wcCharsetName = nameXE;
2265 }
2266 }
2267
2268 wxLogTrace(TRACE_STRCONV,
2269 wxT("iconv wchar_t charset is \"%s\"%s"),
2270 ms_wcCharsetName.empty() ? wxString("<none>")
2271 : ms_wcCharsetName,
2272 ms_wcNeedsSwap ? wxT(" (needs swap)")
2273 : wxT(""));
2274 }
2275 else // we already have ms_wcCharsetName
2276 {
2277 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2278 }
2279
2280 if ( ms_wcCharsetName.empty() )
2281 {
2282 w2m = ICONV_T_INVALID;
2283 }
2284 else
2285 {
2286 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2287 if ( w2m == ICONV_T_INVALID )
2288 {
2289 wxLogTrace(TRACE_STRCONV,
2290 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2291 ms_wcCharsetName.c_str(), name);
2292 }
2293 }
2294 }
2295
~wxMBConv_iconv()2296 wxMBConv_iconv::~wxMBConv_iconv()
2297 {
2298 free(const_cast<char *>(m_name));
2299
2300 if ( m2w != ICONV_T_INVALID )
2301 iconv_close(m2w);
2302 if ( w2m != ICONV_T_INVALID )
2303 iconv_close(w2m);
2304 }
2305
2306 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const2307 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2308 const char *src, size_t srcLen) const
2309 {
2310 if ( srcLen == wxNO_LEN )
2311 {
2312 // find the string length: notice that must be done differently for
2313 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2314 // consecutive NULs
2315 const size_t nulLen = GetMBNulLen();
2316 switch ( nulLen )
2317 {
2318 default:
2319 return wxCONV_FAILED;
2320
2321 case 1:
2322 srcLen = strlen(src); // arguably more optimized than our version
2323 break;
2324
2325 case 2:
2326 case 4:
2327 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2328 // but they also have to start at character boundary and not
2329 // span two adjacent characters
2330 const char *p;
2331 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2332 ;
2333 srcLen = p - src;
2334 break;
2335 }
2336
2337 // when we're determining the length of the string ourselves we count
2338 // the terminating NUL(s) as part of it and always NUL-terminate the
2339 // output
2340 srcLen += nulLen;
2341 }
2342
2343 // we express length in the number of (wide) characters but iconv always
2344 // counts buffer sizes it in bytes
2345 dstLen *= SIZEOF_WCHAR_T;
2346
2347 #if wxUSE_THREADS
2348 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2349 // Unfortunately there are a couple of global wxCSConv objects such as
2350 // wxConvLocal that are used all over wx code, so we have to make sure
2351 // the handle is used by at most one thread at the time. Otherwise
2352 // only a few wx classes would be safe to use from non-main threads
2353 // as MB<->WC conversion would fail "randomly".
2354 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2355 #endif // wxUSE_THREADS
2356
2357 size_t res, cres;
2358 const char *pszPtr = src;
2359
2360 if ( dst )
2361 {
2362 char* bufPtr = (char*)dst;
2363
2364 // have destination buffer, convert there
2365 size_t dstLenOrig = dstLen;
2366 cres = iconv(m2w,
2367 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2368 &bufPtr, &dstLen);
2369
2370 // convert the number of bytes converted as returned by iconv to the
2371 // number of (wide) characters converted that we need
2372 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2373
2374 if (ms_wcNeedsSwap)
2375 {
2376 // convert to native endianness
2377 for ( unsigned i = 0; i < res; i++ )
2378 dst[i] = WC_BSWAP(dst[i]);
2379 }
2380 }
2381 else // no destination buffer
2382 {
2383 // convert using temp buffer to calculate the size of the buffer needed
2384 wchar_t tbuf[256];
2385 res = 0;
2386
2387 do
2388 {
2389 char* bufPtr = (char*)tbuf;
2390 dstLen = 8 * SIZEOF_WCHAR_T;
2391
2392 cres = iconv(m2w,
2393 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2394 &bufPtr, &dstLen );
2395
2396 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2397 }
2398 while ((cres == (size_t)-1) && (errno == E2BIG));
2399 }
2400
2401 if (ICONV_FAILED(cres, srcLen))
2402 {
2403 //VS: it is ok if iconv fails, hence trace only
2404 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2405 return wxCONV_FAILED;
2406 }
2407
2408 return res;
2409 }
2410
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const2411 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2412 const wchar_t *src, size_t srcLen) const
2413 {
2414 #if wxUSE_THREADS
2415 // NB: explained in MB2WC
2416 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2417 #endif
2418
2419 if ( srcLen == wxNO_LEN )
2420 srcLen = wxWcslen(src) + 1;
2421
2422 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2423 size_t outbuflen = dstLen;
2424 size_t res, cres;
2425
2426 wchar_t *tmpbuf = 0;
2427
2428 if (ms_wcNeedsSwap)
2429 {
2430 // need to copy to temp buffer to switch endianness
2431 // (doing WC_BSWAP twice on the original buffer won't work, as it
2432 // could be in read-only memory, or be accessed in some other thread)
2433 tmpbuf = (wchar_t *)malloc(inbuflen);
2434 for ( size_t i = 0; i < srcLen; i++ )
2435 tmpbuf[i] = WC_BSWAP(src[i]);
2436
2437 src = tmpbuf;
2438 }
2439
2440 char* inbuf = (char*)src;
2441 if ( dst )
2442 {
2443 // have destination buffer, convert there
2444 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2445
2446 res = dstLen - outbuflen;
2447 }
2448 else // no destination buffer
2449 {
2450 // convert using temp buffer to calculate the size of the buffer needed
2451 char tbuf[256];
2452 res = 0;
2453 do
2454 {
2455 dst = tbuf;
2456 outbuflen = WXSIZEOF(tbuf);
2457
2458 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2459
2460 res += WXSIZEOF(tbuf) - outbuflen;
2461 }
2462 while ((cres == (size_t)-1) && (errno == E2BIG));
2463 }
2464
2465 if (ms_wcNeedsSwap)
2466 {
2467 free(tmpbuf);
2468 }
2469
2470 if (ICONV_FAILED(cres, inbuflen))
2471 {
2472 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2473 return wxCONV_FAILED;
2474 }
2475
2476 return res;
2477 }
2478
GetMBNulLen() const2479 size_t wxMBConv_iconv::GetMBNulLen() const
2480 {
2481 if ( m_minMBCharWidth == 0 )
2482 {
2483 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2484
2485 #if wxUSE_THREADS
2486 // NB: explained in MB2WC
2487 wxMutexLocker lock(self->m_iconvMutex);
2488 #endif
2489
2490 const wchar_t *wnul = L"";
2491 char buf[8]; // should be enough for NUL in any encoding
2492 size_t inLen = sizeof(wchar_t),
2493 outLen = WXSIZEOF(buf);
2494 char *inBuff = (char *)wnul;
2495 char *outBuff = buf;
2496 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2497 {
2498 self->m_minMBCharWidth = (size_t)-1;
2499 }
2500 else // ok
2501 {
2502 self->m_minMBCharWidth = outBuff - buf;
2503 }
2504 }
2505
2506 return m_minMBCharWidth;
2507 }
2508
2509 #if wxUSE_UNICODE_UTF8
IsUTF8() const2510 bool wxMBConv_iconv::IsUTF8() const
2511 {
2512 return wxStricmp(m_name, "UTF-8") == 0 ||
2513 wxStricmp(m_name, "UTF8") == 0;
2514 }
2515 #endif
2516
2517 #endif // HAVE_ICONV
2518
2519
2520 // ============================================================================
2521 // Win32 conversion classes
2522 // ============================================================================
2523
2524 #ifdef wxHAVE_WIN32_MB2WC
2525
2526 // from utils.cpp
2527 #if wxUSE_FONTMAP
2528 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2529 #endif
2530 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2531
2532 class wxMBConv_win32 : public wxMBConv
2533 {
2534 public:
wxMBConv_win32()2535 wxMBConv_win32()
2536 {
2537 m_CodePage = CP_ACP;
2538 m_minMBCharWidth = 0;
2539 }
2540
wxMBConv_win32(const wxMBConv_win32 & conv)2541 wxMBConv_win32(const wxMBConv_win32& conv)
2542 : wxMBConv()
2543 {
2544 m_CodePage = conv.m_CodePage;
2545 m_minMBCharWidth = conv.m_minMBCharWidth;
2546 }
2547
2548 #if wxUSE_FONTMAP
wxMBConv_win32(const char * name)2549 wxMBConv_win32(const char* name)
2550 {
2551 m_CodePage = wxCharsetToCodepage(name);
2552 m_minMBCharWidth = 0;
2553 }
2554 #endif // wxUSE_FONTMAP
2555
wxMBConv_win32(wxFontEncoding encoding)2556 wxMBConv_win32(wxFontEncoding encoding)
2557 {
2558 m_CodePage = wxEncodingToCodepage(encoding);
2559 m_minMBCharWidth = 0;
2560 }
2561
MB2WC(wchar_t * buf,const char * psz,size_t n) const2562 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2563 {
2564 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2565 // the behaviour is not compatible with the Unix version (using iconv)
2566 // and break the library itself, e.g. wxTextInputStream::NextChar()
2567 // wouldn't work if reading an incomplete MB char didn't result in an
2568 // error
2569 //
2570 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2571 // Win XP or newer and it is not supported for UTF-[78] so we always
2572 // use our own conversions in this case. See
2573 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2574 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2575 if ( m_CodePage == CP_UTF8 )
2576 {
2577 return wxMBConvUTF8().MB2WC(buf, psz, n);
2578 }
2579
2580 if ( m_CodePage == CP_UTF7 )
2581 {
2582 return wxMBConvUTF7().MB2WC(buf, psz, n);
2583 }
2584
2585 int flags = 0;
2586 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2587 IsAtLeastWin2kSP4() )
2588 {
2589 flags = MB_ERR_INVALID_CHARS;
2590 }
2591
2592 const size_t len = ::MultiByteToWideChar
2593 (
2594 m_CodePage, // code page
2595 flags, // flags: fall on error
2596 psz, // input string
2597 -1, // its length (NUL-terminated)
2598 buf, // output string
2599 buf ? n : 0 // size of output buffer
2600 );
2601 if ( !len )
2602 {
2603 // function totally failed
2604 return wxCONV_FAILED;
2605 }
2606
2607 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2608 // check if we succeeded, by doing a double trip:
2609 if ( !flags && buf )
2610 {
2611 const size_t mbLen = strlen(psz);
2612 wxCharBuffer mbBuf(mbLen);
2613 if ( ::WideCharToMultiByte
2614 (
2615 m_CodePage,
2616 0,
2617 buf,
2618 -1,
2619 mbBuf.data(),
2620 mbLen + 1, // size in bytes, not length
2621 NULL,
2622 NULL
2623 ) == 0 ||
2624 strcmp(mbBuf, psz) != 0 )
2625 {
2626 // we didn't obtain the same thing we started from, hence
2627 // the conversion was lossy and we consider that it failed
2628 return wxCONV_FAILED;
2629 }
2630 }
2631
2632 // note that it returns count of written chars for buf != NULL and size
2633 // of the needed buffer for buf == NULL so in either case the length of
2634 // the string (which never includes the terminating NUL) is one less
2635 return len - 1;
2636 }
2637
WC2MB(char * buf,const wchar_t * pwz,size_t n) const2638 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2639 {
2640 /*
2641 we have a problem here: by default, WideCharToMultiByte() may
2642 replace characters unrepresentable in the target code page with bad
2643 quality approximations such as turning "1/2" symbol (U+00BD) into
2644 "1" for the code pages which don't have it and we, obviously, want
2645 to avoid this at any price
2646
2647 the trouble is that this function does it _silently_, i.e. it won't
2648 even tell us whether it did or not... Win98/2000 and higher provide
2649 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2650 we have to resort to a round trip, i.e. check that converting back
2651 results in the same string -- this is, of course, expensive but
2652 otherwise we simply can't be sure to not garble the data.
2653 */
2654
2655 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2656 // it doesn't work with CJK encodings (which we test for rather roughly
2657 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2658 // supporting it
2659 BOOL usedDef wxDUMMY_INITIALIZE(false);
2660 BOOL *pUsedDef;
2661 int flags;
2662 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2663 {
2664 // it's our lucky day
2665 flags = WC_NO_BEST_FIT_CHARS;
2666 pUsedDef = &usedDef;
2667 }
2668 else // old system or unsupported encoding
2669 {
2670 flags = 0;
2671 pUsedDef = NULL;
2672 }
2673
2674 const size_t len = ::WideCharToMultiByte
2675 (
2676 m_CodePage, // code page
2677 flags, // either none or no best fit
2678 pwz, // input string
2679 -1, // it is (wide) NUL-terminated
2680 buf, // output buffer
2681 buf ? n : 0, // and its size
2682 NULL, // default "replacement" char
2683 pUsedDef // [out] was it used?
2684 );
2685
2686 if ( !len )
2687 {
2688 // function totally failed
2689 return wxCONV_FAILED;
2690 }
2691
2692 // we did something, check if we really succeeded
2693 if ( flags )
2694 {
2695 // check if the conversion failed, i.e. if any replacements
2696 // were done
2697 if ( usedDef )
2698 return wxCONV_FAILED;
2699 }
2700 else // we must resort to double tripping...
2701 {
2702 // first we need to ensure that we really have the MB data: this is
2703 // not the case if we're called with NULL buffer, in which case we
2704 // need to do the conversion yet again
2705 wxCharBuffer bufDef;
2706 if ( !buf )
2707 {
2708 bufDef = wxCharBuffer(len);
2709 buf = bufDef.data();
2710 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2711 buf, len, NULL, NULL) )
2712 return wxCONV_FAILED;
2713 }
2714
2715 if ( !n )
2716 n = wcslen(pwz);
2717 wxWCharBuffer wcBuf(n);
2718 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2719 wcscmp(wcBuf, pwz) != 0 )
2720 {
2721 // we didn't obtain the same thing we started from, hence
2722 // the conversion was lossy and we consider that it failed
2723 return wxCONV_FAILED;
2724 }
2725 }
2726
2727 // see the comment above for the reason of "len - 1"
2728 return len - 1;
2729 }
2730
GetMBNulLen() const2731 virtual size_t GetMBNulLen() const
2732 {
2733 if ( m_minMBCharWidth == 0 )
2734 {
2735 int len = ::WideCharToMultiByte
2736 (
2737 m_CodePage, // code page
2738 0, // no flags
2739 L"", // input string
2740 1, // translate just the NUL
2741 NULL, // output buffer
2742 0, // and its size
2743 NULL, // no replacement char
2744 NULL // [out] don't care if it was used
2745 );
2746
2747 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2748 switch ( len )
2749 {
2750 default:
2751 wxLogDebug(wxT("Unexpected NUL length %d"), len);
2752 self->m_minMBCharWidth = (size_t)-1;
2753 break;
2754
2755 case 0:
2756 self->m_minMBCharWidth = (size_t)-1;
2757 break;
2758
2759 case 1:
2760 case 2:
2761 case 4:
2762 self->m_minMBCharWidth = len;
2763 break;
2764 }
2765 }
2766
2767 return m_minMBCharWidth;
2768 }
2769
Clone() const2770 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2771
IsOk() const2772 bool IsOk() const { return m_CodePage != -1; }
2773
2774 private:
CanUseNoBestFit()2775 static bool CanUseNoBestFit()
2776 {
2777 static int s_isWin98Or2k = -1;
2778
2779 if ( s_isWin98Or2k == -1 )
2780 {
2781 int verMaj, verMin;
2782 switch ( wxGetOsVersion(&verMaj, &verMin) )
2783 {
2784 case wxOS_WINDOWS_9X:
2785 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2786 break;
2787
2788 case wxOS_WINDOWS_NT:
2789 s_isWin98Or2k = verMaj >= 5;
2790 break;
2791
2792 default:
2793 // unknown: be conservative by default
2794 s_isWin98Or2k = 0;
2795 break;
2796 }
2797
2798 wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
2799 }
2800
2801 return s_isWin98Or2k == 1;
2802 }
2803
IsAtLeastWin2kSP4()2804 static bool IsAtLeastWin2kSP4()
2805 {
2806 #ifdef __WXWINCE__
2807 return false;
2808 #else
2809 static int s_isAtLeastWin2kSP4 = -1;
2810
2811 if ( s_isAtLeastWin2kSP4 == -1 )
2812 {
2813 OSVERSIONINFOEX ver;
2814
2815 memset(&ver, 0, sizeof(ver));
2816 ver.dwOSVersionInfoSize = sizeof(ver);
2817 GetVersionEx((OSVERSIONINFO*)&ver);
2818
2819 s_isAtLeastWin2kSP4 =
2820 ((ver.dwMajorVersion > 5) || // Vista+
2821 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2822 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2823 ver.wServicePackMajor >= 4)) // 2000 SP4+
2824 ? 1 : 0;
2825 }
2826
2827 return s_isAtLeastWin2kSP4 == 1;
2828 #endif
2829 }
2830
2831
2832 // the code page we're working with
2833 long m_CodePage;
2834
2835 // cached result of GetMBNulLen(), set to 0 initially meaning
2836 // "unknown"
2837 size_t m_minMBCharWidth;
2838 };
2839
2840 #endif // wxHAVE_WIN32_MB2WC
2841
2842
2843 // ============================================================================
2844 // wxEncodingConverter based conversion classes
2845 // ============================================================================
2846
2847 #if wxUSE_FONTMAP
2848
2849 class wxMBConv_wxwin : public wxMBConv
2850 {
2851 private:
Init()2852 void Init()
2853 {
2854 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2855 // The wxMBConv_cf class does a better job.
2856 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2857 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2858 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2859 }
2860
2861 public:
2862 // temporarily just use wxEncodingConverter stuff,
2863 // so that it works while a better implementation is built
wxMBConv_wxwin(const char * name)2864 wxMBConv_wxwin(const char* name)
2865 {
2866 if (name)
2867 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2868 else
2869 m_enc = wxFONTENCODING_SYSTEM;
2870
2871 Init();
2872 }
2873
wxMBConv_wxwin(wxFontEncoding enc)2874 wxMBConv_wxwin(wxFontEncoding enc)
2875 {
2876 m_enc = enc;
2877
2878 Init();
2879 }
2880
MB2WC(wchar_t * buf,const char * psz,size_t WXUNUSED (n)) const2881 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2882 {
2883 size_t inbuf = strlen(psz);
2884 if (buf)
2885 {
2886 if (!m2w.Convert(psz, buf))
2887 return wxCONV_FAILED;
2888 }
2889 return inbuf;
2890 }
2891
WC2MB(char * buf,const wchar_t * psz,size_t WXUNUSED (n)) const2892 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2893 {
2894 const size_t inbuf = wxWcslen(psz);
2895 if (buf)
2896 {
2897 if (!w2m.Convert(psz, buf))
2898 return wxCONV_FAILED;
2899 }
2900
2901 return inbuf;
2902 }
2903
GetMBNulLen() const2904 virtual size_t GetMBNulLen() const
2905 {
2906 switch ( m_enc )
2907 {
2908 case wxFONTENCODING_UTF16BE:
2909 case wxFONTENCODING_UTF16LE:
2910 return 2;
2911
2912 case wxFONTENCODING_UTF32BE:
2913 case wxFONTENCODING_UTF32LE:
2914 return 4;
2915
2916 default:
2917 return 1;
2918 }
2919 }
2920
Clone() const2921 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2922
IsOk() const2923 bool IsOk() const { return m_ok; }
2924
2925 public:
2926 wxFontEncoding m_enc;
2927 wxEncodingConverter m2w, w2m;
2928
2929 private:
2930 // were we initialized successfully?
2931 bool m_ok;
2932
2933 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
2934 };
2935
2936 // make the constructors available for unit testing
new_wxMBConv_wxwin(const char * name)2937 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2938 {
2939 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2940 if ( !result->IsOk() )
2941 {
2942 delete result;
2943 return 0;
2944 }
2945
2946 return result;
2947 }
2948
2949 #endif // wxUSE_FONTMAP
2950
2951 // ============================================================================
2952 // wxCSConv implementation
2953 // ============================================================================
2954
Init()2955 void wxCSConv::Init()
2956 {
2957 m_name = NULL;
2958 m_convReal = NULL;
2959 }
2960
SetEncoding(wxFontEncoding encoding)2961 void wxCSConv::SetEncoding(wxFontEncoding encoding)
2962 {
2963 switch ( encoding )
2964 {
2965 case wxFONTENCODING_MAX:
2966 case wxFONTENCODING_SYSTEM:
2967 if ( m_name )
2968 {
2969 // It's ok to not have encoding value if we have a name for it.
2970 m_encoding = wxFONTENCODING_SYSTEM;
2971 }
2972 else // No name neither.
2973 {
2974 // Fall back to the system default encoding in this case (not
2975 // sure how much sense does this make but this is how the old
2976 // code used to behave).
2977 #if wxUSE_INTL
2978 m_encoding = wxLocale::GetSystemEncoding();
2979 if ( m_encoding == wxFONTENCODING_SYSTEM )
2980 #endif // wxUSE_INTL
2981 m_encoding = wxFONTENCODING_ISO8859_1;
2982 }
2983 break;
2984
2985 case wxFONTENCODING_DEFAULT:
2986 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2987 m_encoding = wxFONTENCODING_ISO8859_1;
2988 break;
2989
2990 default:
2991 // Just use the provided encoding.
2992 m_encoding = encoding;
2993 }
2994 }
2995
wxCSConv(const wxString & charset)2996 wxCSConv::wxCSConv(const wxString& charset)
2997 {
2998 Init();
2999
3000 if ( !charset.empty() )
3001 {
3002 SetName(charset.ToAscii());
3003 }
3004
3005 #if wxUSE_FONTMAP
3006 SetEncoding(wxFontMapperBase::GetEncodingFromName(charset));
3007 #else
3008 SetEncoding(wxFONTENCODING_SYSTEM);
3009 #endif
3010
3011 m_convReal = DoCreate();
3012 }
3013
wxCSConv(wxFontEncoding encoding)3014 wxCSConv::wxCSConv(wxFontEncoding encoding)
3015 {
3016 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3017 {
3018 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
3019
3020 encoding = wxFONTENCODING_SYSTEM;
3021 }
3022
3023 Init();
3024
3025 SetEncoding(encoding);
3026
3027 m_convReal = DoCreate();
3028 }
3029
~wxCSConv()3030 wxCSConv::~wxCSConv()
3031 {
3032 Clear();
3033 }
3034
// Copy ctor: takes over the name and the encoding of the other object but
// creates a conversion object of its own.
wxCSConv::wxCSConv(const wxCSConv& conv)
        : wxMBConv()
{
    Init();

    // the name must be set first as SetEncoding() checks for it
    SetName(conv.m_name);
    SetEncoding(conv.m_encoding);

    m_convReal = DoCreate();
}
3045
// Assignment: takes over the name and the encoding of the other object and
// recreates the conversion object for them.
//
// Returns a reference to this object.
wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
{
    // Clear() frees and resets m_name, so when assigning an object to itself
    // the name would be copied only after it had already been lost: there is
    // nothing to do in this case anyhow
    if ( this != &conv )
    {
        Clear();

        SetName(conv.m_name);
        SetEncoding(conv.m_encoding);

        m_convReal = DoCreate();
    }

    return *this;
}
3057
Clear()3058 void wxCSConv::Clear()
3059 {
3060 free(m_name);
3061 m_name = NULL;
3062
3063 wxDELETE(m_convReal);
3064 }
3065
SetName(const char * charset)3066 void wxCSConv::SetName(const char *charset)
3067 {
3068 if ( charset )
3069 m_name = wxStrdup(charset);
3070 }
3071
3072 #if wxUSE_FONTMAP
3073
// Hash map type keyed by wxFontEncoding with wxString values.
3074 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3075 wxEncodingNameCache );
3076
// Cache used by wxCSConv::DoCreate(): for each encoding it stores the name
// that worked with wxMBConv_iconv or an empty string if none of them did.
3077 static wxEncodingNameCache gs_nameCache;
3078 #endif
3079
DoCreate() const3080 wxMBConv *wxCSConv::DoCreate() const
3081 {
3082 #if wxUSE_FONTMAP
3083 wxLogTrace(TRACE_STRCONV,
3084 wxT("creating conversion for %s"),
3085 (m_name ? m_name
3086 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
3087 #endif // wxUSE_FONTMAP
3088
3089 // check for the special case of ASCII or ISO8859-1 charset: as we have
3090 // special knowledge of it anyhow, we don't need to create a special
3091 // conversion object
3092 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3093 {
3094 // don't convert at all
3095 return NULL;
3096 }
3097
3098 // we trust OS to do conversion better than we can so try external
3099 // conversion methods first
3100 //
3101 // the full order is:
3102 // 1. OS conversion (iconv() under Unix or Win32 API)
3103 // 2. hard coded conversions for UTF
3104 // 3. wxEncodingConverter as fall back
3105
3106 // step (1)
3107 #ifdef HAVE_ICONV
3108 #if !wxUSE_FONTMAP
3109 if ( m_name )
3110 #endif // !wxUSE_FONTMAP
3111 {
3112 #if wxUSE_FONTMAP
3113 wxFontEncoding encoding(m_encoding);
3114 #endif
3115
3116 if ( m_name )
3117 {
3118 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3119 if ( conv->IsOk() )
3120 return conv;
3121
3122 delete conv;
3123
3124 #if wxUSE_FONTMAP
3125 encoding =
3126 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3127 #endif // wxUSE_FONTMAP
3128 }
3129 #if wxUSE_FONTMAP
3130 {
3131 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3132 if ( it != gs_nameCache.end() )
3133 {
3134 if ( it->second.empty() )
3135 return NULL;
3136
3137 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3138 if ( conv->IsOk() )
3139 return conv;
3140
3141 delete conv;
3142 }
3143
3144 const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
3145 // CS : in case this does not return valid names (eg for MacRoman)
3146 // encoding got a 'failure' entry in the cache all the same,
3147 // although it just has to be created using a different method, so
3148 // only store failed iconv creation attempts (or perhaps we
3149 // shoulnd't do this at all ?)
3150 if ( names[0] != NULL )
3151 {
3152 for ( ; *names; ++names )
3153 {
3154 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3155 // will need changes that will obsolete this
3156 wxString name(*names);
3157 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3158 if ( conv->IsOk() )
3159 {
3160 gs_nameCache[encoding] = *names;
3161 return conv;
3162 }
3163
3164 delete conv;
3165 }
3166
3167 gs_nameCache[encoding] = wxT(""); // cache the failure
3168 }
3169 }
3170 #endif // wxUSE_FONTMAP
3171 }
3172 #endif // HAVE_ICONV
3173
3174 #ifdef wxHAVE_WIN32_MB2WC
3175 {
3176 #if wxUSE_FONTMAP
3177 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3178 : new wxMBConv_win32(m_encoding);
3179 #else
3180 wxMBConv_win32* conv = new wxMBConv_win32(m_encoding);
3181 #endif
3182 if ( conv->IsOk() )
3183 return conv;
3184
3185 delete conv;
3186 }
3187 #endif // wxHAVE_WIN32_MB2WC
3188
3189 #ifdef __DARWIN__
3190 {
3191 // leave UTF16 and UTF32 to the built-ins of wx
3192 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3193 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3194 {
3195 #if wxUSE_FONTMAP
3196 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3197 : new wxMBConv_cf(m_encoding);
3198 #else
3199 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3200 #endif
3201
3202 if ( conv->IsOk() )
3203 return conv;
3204
3205 delete conv;
3206 }
3207 }
3208 #endif // __DARWIN__
3209
3210 // step (2)
3211 wxFontEncoding enc = m_encoding;
3212 #if wxUSE_FONTMAP
3213 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3214 {
3215 // use "false" to suppress interactive dialogs -- we can be called from
3216 // anywhere and popping up a dialog from here is the last thing we want to
3217 // do
3218 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3219 }
3220 #endif // wxUSE_FONTMAP
3221
3222 switch ( enc )
3223 {
3224 case wxFONTENCODING_UTF7:
3225 return new wxMBConvUTF7;
3226
3227 case wxFONTENCODING_UTF8:
3228 return new wxMBConvUTF8;
3229
3230 case wxFONTENCODING_UTF16BE:
3231 return new wxMBConvUTF16BE;
3232
3233 case wxFONTENCODING_UTF16LE:
3234 return new wxMBConvUTF16LE;
3235
3236 case wxFONTENCODING_UTF32BE:
3237 return new wxMBConvUTF32BE;
3238
3239 case wxFONTENCODING_UTF32LE:
3240 return new wxMBConvUTF32LE;
3241
3242 default:
3243 // nothing to do but put here to suppress gcc warnings
3244 break;
3245 }
3246
3247 // step (3)
3248 #if wxUSE_FONTMAP
3249 {
3250 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3251 : new wxMBConv_wxwin(m_encoding);
3252 if ( conv->IsOk() )
3253 return conv;
3254
3255 delete conv;
3256 }
3257
3258 wxLogTrace(TRACE_STRCONV,
3259 wxT("encoding \"%s\" is not supported by this system"),
3260 (m_name ? wxString(m_name)
3261 : wxFontMapperBase::GetEncodingName(m_encoding)));
3262 #endif // wxUSE_FONTMAP
3263
3264 return NULL;
3265 }
3266
IsOk() const3267 bool wxCSConv::IsOk() const
3268 {
3269 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3270 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3271 return true; // always ok as we do it ourselves
3272
3273 // m_convReal->IsOk() is called at its own creation, so we know it must
3274 // be ok if m_convReal is non-NULL
3275 return m_convReal != NULL;
3276 }
3277
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const3278 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3279 const char *src, size_t srcLen) const
3280 {
3281 if (m_convReal)
3282 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3283
3284 // latin-1 (direct)
3285 if ( srcLen == wxNO_LEN )
3286 srcLen = strlen(src) + 1; // take trailing NUL too
3287
3288 if ( dst )
3289 {
3290 if ( dstLen < srcLen )
3291 return wxCONV_FAILED;
3292
3293 for ( size_t n = 0; n < srcLen; n++ )
3294 dst[n] = (unsigned char)(src[n]);
3295 }
3296
3297 return srcLen;
3298 }
3299
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const3300 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3301 const wchar_t *src, size_t srcLen) const
3302 {
3303 if (m_convReal)
3304 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3305
3306 // latin-1 (direct)
3307 if ( srcLen == wxNO_LEN )
3308 srcLen = wxWcslen(src) + 1;
3309
3310 if ( dst )
3311 {
3312 if ( dstLen < srcLen )
3313 return wxCONV_FAILED;
3314
3315 for ( size_t n = 0; n < srcLen; n++ )
3316 {
3317 if ( src[n] > 0xFF )
3318 return wxCONV_FAILED;
3319
3320 dst[n] = (char)src[n];
3321 }
3322
3323 }
3324 else // still need to check the input validity
3325 {
3326 for ( size_t n = 0; n < srcLen; n++ )
3327 {
3328 if ( src[n] > 0xFF )
3329 return wxCONV_FAILED;
3330 }
3331 }
3332
3333 return srcLen;
3334 }
3335
GetMBNulLen() const3336 size_t wxCSConv::GetMBNulLen() const
3337 {
3338 if ( m_convReal )
3339 return m_convReal->GetMBNulLen();
3340
3341 // otherwise, we are ISO-8859-1
3342 return 1;
3343 }
3344
3345 #if wxUSE_UNICODE_UTF8
IsUTF8() const3346 bool wxCSConv::IsUTF8() const
3347 {
3348 if ( m_convReal )
3349 return m_convReal->IsUTF8();
3350
3351 // otherwise, we are ISO-8859-1
3352 return false;
3353 }
3354 #endif
3355
3356
3357 #if wxUSE_UNICODE
3358
wxSafeConvertMB2WX(const char * s)3359 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3360 {
3361 if ( !s )
3362 return wxWCharBuffer();
3363
3364 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3365 if ( !wbuf )
3366 wbuf = wxMBConvUTF8().cMB2WX(s);
3367 if ( !wbuf )
3368 wbuf = wxConvISO8859_1.cMB2WX(s);
3369
3370 return wbuf;
3371 }
3372
wxSafeConvertWX2MB(const wchar_t * ws)3373 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3374 {
3375 if ( !ws )
3376 return wxCharBuffer();
3377
3378 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3379 if ( !buf )
3380 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3381
3382 return buf;
3383 }
3384
3385 #endif // wxUSE_UNICODE
3386
3387 // ----------------------------------------------------------------------------
3388 // globals
3389 // ----------------------------------------------------------------------------
3390
3391 // NB: The reason why we create converter objects in this convoluted way,
3392 // using a factory function instead of global variable, is that they
3393 // may be used at static initialization time (some of them are used by
3394 // wxString ctors and there may be a global wxString object). In other
3395 // words, possibly _before_ the converter global object would be
3396 // initialized.
3397
3398 #undef wxConvLibc
3399 #undef wxConvUTF8
3400 #undef wxConvUTF7
3401 #undef wxConvLocal
3402 #undef wxConvISO8859_1
3403
// Defines everything needed for one global converter called "name":
//  - a pointer variable name##Ptr of type klass*, initially NULL;
//  - an accessor function wxGet_##name##Ptr() returning the address of a
//    function-local static impl_klass object constructed with ctor_args;
//  - a file-static variable gs_##name##instance initialized by calling this
//    accessor.
3404 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3405 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3406 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3407 { \
3408 static impl_klass name##Obj ctor_args; \
3409 return &name##Obj; \
3410 } \
3411 /* this ensures that all global converter objects are created */ \
3412 /* by the time static initialization is done, i.e. before any */ \
3413 /* thread is launched: */ \
3414 static klass* gs_##name##instance = wxGet_##name##Ptr()
3415
// Same as WX_DEFINE_GLOBAL_CONV2 for the case when the type of the object
// created is the same as the type of the pointers to it.
3416 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3417 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3418
3419 #ifdef __INTELC__
3420 // disable warning "variable 'xxx' was declared but never referenced"
3421 #pragma warning(disable: 177)
3422 #endif // Intel C++
3423
// wxConvLibc is a wxMBConv_win32 object under __WINDOWS__ and a wxMBConvLibc
// one elsewhere (the wxMBConv_cf branch is disabled by "#elif 0").
3424 #ifdef __WINDOWS__
3425 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3426 #elif 0 // defined(__WXOSX__)
3427 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
3428 #else
3429 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3430 #endif
3431
3432 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3433 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3434 // provokes an error message about "not enough macro parameters"; and we
3435 // can't use "()" here as the name##Obj declaration would be parsed as a
3436 // function declaration then, so use a semicolon and live with an extra
3437 // empty statement (and hope that no compilers warns about this)
3438 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3439 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3440
// wxCSConv objects for the system encoding and for ISO8859-1
3441 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3442 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3443
// wxConvCurrent initially points to the wxConvLibc object and wxConvUI to
// the wxConvLocal one
3444 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3445 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3446
3447 #ifdef __DARWIN__
3448 // It is important to use this conversion object under Darwin as it ensures
3449 // that Unicode strings are (re)composed correctly even though xnu kernel uses
3450 // decomposed form internally (at least for the file names).
3451 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3452 #endif
3453
// wxConvFileName points to the wxMBConv_cf object defined just above under
// Darwin and to the wxConvLibc object on all the other platforms
3454 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3455 #ifdef __DARWIN__
3456 &wxConvMacUTF8DObj;
3457 #else // !__DARWIN__
3458 wxGet_wxConvLibcPtr();
3459 #endif // __DARWIN__/!__DARWIN__
3460