1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
9 // (c) 2000-2003 Vadim Zeitlin
10 // (c) 2004 Ryan Norton, Fredrik Roubert
11 // Licence: wxWindows licence
12 /////////////////////////////////////////////////////////////////////////////
13
14 // For compilers that support precompilation, includes "wx.h".
15 #include "wx/wxprec.h"
16
17 #ifndef WX_PRECOMP
18 #include "wx/intl.h"
19 #include "wx/log.h"
20 #include "wx/utils.h"
21 #include "wx/hashmap.h"
22 #endif
23
24 #include "wx/strconv.h"
25
26 #include <errno.h>
27
28 #include <ctype.h>
29 #include <string.h>
30 #include <stdlib.h>
31
32 #if defined(__WIN32__)
33 #include "wx/msw/private.h"
34 #include "wx/msw/missing.h"
35 #define wxHAVE_WIN32_MB2WC
36 #endif
37
38 #ifdef HAVE_ICONV
39 #include <iconv.h>
40 #include "wx/thread.h"
41 #endif
42
43 #include "wx/encconv.h"
44 #include "wx/fontmap.h"
45 #include "wx/private/unicode.h"
46
47 #ifdef __DARWIN__
48 #include "wx/osx/core/private/strconv_cf.h"
49 #endif //def __DARWIN__
50
51
52 #define TRACE_STRCONV wxT("strconv")
53
54 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
55 // be 4 bytes
56 #if SIZEOF_WCHAR_T == 2
57 #define WC_UTF16
58 #endif
59
60
61 // ============================================================================
62 // implementation
63 // ============================================================================
64
// Helper of wxMBConv::ToWChar(): returns true if at least one of the n bytes
// starting at p is not NUL and false if all of them are NUL (or if n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const stop = p + n; p != stop; ++p )
    {
        // the first non-NUL byte decides the result, no need to look further
        if ( *p != '\0' )
            return true;
    }

    return false;
}
73
74 // ----------------------------------------------------------------------------
75 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
76 // ----------------------------------------------------------------------------
77
encode_utf16(wxUint32 input,wxUint16 * output)78 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
79 {
80 if (wxUniChar::IsBMP(input))
81 {
82 if (output)
83 *output = (wxUint16) input;
84
85 return 1;
86 }
87 else if (wxUniChar::IsSupplementary(input))
88 {
89 if (output)
90 {
91 *output++ = wxUniChar::HighSurrogate(input);
92 *output = wxUniChar::LowSurrogate(input);
93 }
94
95 return 2;
96 }
97 else
98 {
99 return wxCONV_FAILED;
100 }
101 }
102
103 // Returns the next UTF-32 character from the wchar_t buffer terminated by the
104 // "end" pointer (the caller must ensure that on input "*pSrc < end") and
105 // advances the pointer to the character after this one.
106 //
107 // If an invalid or incomplete character is found, *pSrc is set to NULL, the
108 // caller must check for this.
wxDecodeSurrogate(const wxChar16 ** pSrc,const wxChar16 * end)109 static wxUint32 wxDecodeSurrogate(const wxChar16 **pSrc, const wxChar16* end)
110 {
111 const wxChar16*& src = *pSrc;
112
113 // Is this a BMP character?
114 const wxUint16 u = *src++;
115 if ((u < 0xd800) || (u > 0xdfff))
116 {
117 // Yes, just return it.
118 return u;
119 }
120
121 // No, we have the first half of a surrogate, check if we also have the
122 // second half (notice that this check does nothing if end == NULL, as it
123 // is allowed to be, and this is correct).
124 if ( src == end )
125 {
126 // No, we don't because this is the end of input.
127 src = NULL;
128 return 0;
129 }
130
131 const wxUint16 u2 = *src++;
132 if ( (u2 < 0xdc00) || (u2 > 0xdfff) )
133 {
134 // No, it's not in the low surrogate range.
135 src = NULL;
136 return 0;
137 }
138
139 // Yes, decode it and return the corresponding Unicode character.
140 return ((u - 0xd7c0) << 10) + (u2 - 0xdc00);
141 }
142
143 // ----------------------------------------------------------------------------
144 // wxMBConv
145 // ----------------------------------------------------------------------------
146
// Default implementation of ToWChar() in terms of the legacy MB2WC().
//
// Converts srcLen bytes of src, or the whole NUL-terminated string if srcLen
// is wxNO_LEN, to wide characters. If dst is NULL nothing is stored and only
// the length is computed, otherwise the result is stored in dst which has room
// for dstLen wide characters.
//
// Returns the number of wide characters written (or which would be written)
// or wxCONV_FAILED if GetMBNulLen() or MB2WC() fails or if dst is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid to have to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it
    //
    // moreover, some conversion classes simply can't implement ToWChar()
    // directly, the primary example is wxConvLibc: mbstowcs() only handles
    // NUL-terminated strings

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // the idea of this code is straightforward: it converts a NUL-terminated
    // chunk of the string during each iteration and updates the output buffer
    // with the result
    //
    // all the complication come from the fact that this function, for
    // historical reasons, must behave in 2 subtly different ways when it's
    // called with a fixed number of characters and when it's called for the
    // entire NUL-terminated string
    //
    // when the entire NUL-terminated string is converted (srcEnd == NULL)
    // there is a single chunk and its trailing NUL is always counted in the
    // return value
    //
    // when the length is given (srcEnd != NULL), a NUL following a chunk is
    // only counted if it lies inside the srcLen bytes we were asked to
    // convert, so we can only count it after locating the end of the chunk
    for ( ;; )
    {
        // try to convert the current chunk: the first call only computes the
        // chunk length, not counting its terminating NUL
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        dstWritten += lenChunk;
        if ( !srcEnd )
            dstWritten++;

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // +1 is for trailing NUL
            //
            // NOTE(review): in the explicit length case the NUL has not been
            // included in dstWritten checked above, so if MB2WC() stores it
            // this presumably needs one more character of room in dst than
            // was verified -- confirm that callers always provide it
            if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
            if ( !srcEnd )
                dst++;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow (its trailing NUL was already counted above)
            break;
        }

        // advance the input pointer past the end of this chunk: notice that we
        // will always stop before srcEnd because we know that the chunk is
        // always properly NUL-terminated
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        // if the buffer ends before this NUL, we shouldn't count it in our
        // output so skip the code below
        if ( src == srcEnd )
            break;

        // do count this terminator as it's inside the buffer we convert
        dstWritten++;
        if ( dst )
            dst++;

        src += nulLen; // skip the terminator itself

        // nothing left to convert after this terminator?
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
278
279 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const280 wxMBConv::FromWChar(char *dst, size_t dstLen,
281 const wchar_t *src, size_t srcLen) const
282 {
283 // the number of chars [which would be] written to dst [if it were not NULL]
284 size_t dstWritten = 0;
285
286 // if we don't know its length we have no choice but to assume that it is
287 // NUL-terminated (notice that it can still be NUL-terminated even if
288 // explicit length is given but it doesn't change our return value)
289 const bool isNulTerminated = srcLen == wxNO_LEN;
290
291 // make a copy of the input string unless it is already properly
292 // NUL-terminated
293 wxWCharBuffer bufTmp;
294 if ( isNulTerminated )
295 {
296 srcLen = wxWcslen(src) + 1;
297 }
298 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
299 {
300 // make a copy in order to properly NUL-terminate the string
301 bufTmp = wxWCharBuffer(srcLen);
302 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
303 src = bufTmp;
304 }
305
306 const size_t lenNul = GetMBNulLen();
307 for ( const wchar_t * const srcEnd = src + srcLen;
308 src < srcEnd;
309 src++ /* skip L'\0' too */ )
310 {
311 // try to convert the current chunk
312 size_t lenChunk = WC2MB(NULL, src, 0);
313 if ( lenChunk == wxCONV_FAILED )
314 return wxCONV_FAILED;
315
316 dstWritten += lenChunk;
317
318 const wchar_t * const
319 chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);
320
321 // our return value accounts for the trailing NUL(s), unlike that of
322 // WC2MB(), however don't do it for the last NUL we artificially added
323 // ourselves above
324 if ( chunkEnd < srcEnd )
325 dstWritten += lenNul;
326
327 if ( dst )
328 {
329 if ( dstWritten > dstLen )
330 return wxCONV_FAILED;
331
332 // if we know that there is enough space in the destination buffer
333 // (because we accounted for lenNul in dstWritten above), we can
334 // convert directly in place -- but otherwise we need another
335 // temporary buffer to ensure that we don't overwrite the output
336 wxCharBuffer dstBuf;
337 char *dstTmp;
338 if ( chunkEnd == srcEnd )
339 {
340 dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
341 dstTmp = dstBuf.data();
342 }
343 else
344 {
345 dstTmp = dst;
346 }
347
348 if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
349 return wxCONV_FAILED;
350
351 if ( dstTmp != dst )
352 {
353 // copy everything up to but excluding the terminating NUL(s)
354 // into the real output buffer
355 memcpy(dst, dstTmp, lenChunk);
356
357 // micro-optimization: if dstTmp != dst it means that chunkEnd
358 // == srcEnd and so we're done, no need to update anything below
359 break;
360 }
361
362 dst += lenChunk;
363 if ( chunkEnd < srcEnd )
364 dst += lenNul;
365 }
366
367 src = chunkEnd;
368 }
369
370 return dstWritten;
371 }
372
MB2WC(wchar_t * outBuff,const char * inBuff,size_t outLen) const373 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
374 {
375 size_t rc = ToWChar(outBuff, outLen, inBuff);
376 if ( rc != wxCONV_FAILED )
377 {
378 // ToWChar() returns the buffer length, i.e. including the trailing
379 // NUL, while this method doesn't take it into account
380 rc--;
381 }
382
383 return rc;
384 }
385
WC2MB(char * outBuff,const wchar_t * inBuff,size_t outLen) const386 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
387 {
388 size_t rc = FromWChar(outBuff, outLen, inBuff);
389 if ( rc != wxCONV_FAILED )
390 {
391 rc -= GetMBNulLen();
392 }
393
394 return rc;
395 }
396
397 wxWCharBuffer
cMB2WC(const char * inBuff,size_t inLen,size_t * outLen) const398 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
399 {
400 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
401 if ( dstLen != wxCONV_FAILED )
402 {
403 // notice that we allocate space for dstLen+1 wide characters here
404 // because we want the buffer to always be NUL-terminated, even if the
405 // input isn't (as otherwise the caller has no way to know its length)
406 wxWCharBuffer wbuf(dstLen);
407 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
408 {
409 if ( outLen )
410 {
411 *outLen = dstLen;
412
413 // we also need to handle NUL-terminated input strings
414 // specially: for them the output is the length of the string
415 // excluding the trailing NUL, however if we're asked to
416 // convert a specific number of characters we return the length
417 // of the resulting output even if it's NUL-terminated
418 if ( inLen == wxNO_LEN )
419 (*outLen)--;
420 }
421
422 return wbuf;
423 }
424 }
425
426 if ( outLen )
427 *outLen = 0;
428
429 return wxWCharBuffer();
430 }
431
432 wxCharBuffer
cWC2MB(const wchar_t * inBuff,size_t inLen,size_t * outLen) const433 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
434 {
435 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
436 if ( dstLen != wxCONV_FAILED )
437 {
438 const size_t nulLen = GetMBNulLen();
439
440 // as above, ensure that the buffer is always NUL-terminated, even if
441 // the input is not
442 wxCharBuffer buf(dstLen + nulLen - 1);
443 memset(buf.data() + dstLen, 0, nulLen);
444
445 // Notice that return value of the call to FromWChar() here may be
446 // different from the one above as it could have overestimated the
447 // space needed, while what we get here is the exact length.
448 dstLen = FromWChar(buf.data(), dstLen, inBuff, inLen);
449 if ( dstLen != wxCONV_FAILED )
450 {
451 if ( outLen )
452 {
453 *outLen = dstLen;
454
455 if ( inLen == wxNO_LEN )
456 {
457 // in this case both input and output are NUL-terminated
458 // and we're not supposed to count NUL
459 *outLen -= nulLen;
460 }
461 }
462
463 return buf;
464 }
465 }
466
467 if ( outLen )
468 *outLen = 0;
469
470 return wxCharBuffer();
471 }
472
DoConvertMB2WC(const char * buf,size_t srcLen) const473 wxWCharBuffer wxMBConv::DoConvertMB2WC(const char* buf, size_t srcLen) const
474 {
475 // Notice that converting NULL pointer should work, i.e. return an empty
476 // buffer instead of crashing, so we need to check both the length and the
477 // pointer because length is wxNO_LEN if it's a raw pointer and doesn't
478 // come from wxScopedCharBuffer.
479 if ( srcLen && buf )
480 {
481 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
482 if ( dstLen != wxCONV_FAILED )
483 {
484 wxWCharBuffer wbuf(dstLen);
485 wbuf.data()[dstLen] = L'\0';
486 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
487 {
488 // If the input string was NUL-terminated, we shouldn't include
489 // the length of the trailing NUL into the length of the return
490 // value.
491 if ( srcLen == wxNO_LEN )
492 wbuf.shrink(dstLen - 1);
493
494 return wbuf;
495 }
496 }
497 }
498
499 return wxWCharBuffer();
500 }
501
DoConvertWC2MB(const wchar_t * wbuf,size_t srcLen) const502 wxCharBuffer wxMBConv::DoConvertWC2MB(const wchar_t* wbuf, size_t srcLen) const
503 {
504 if ( srcLen && wbuf )
505 {
506 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
507 if ( dstLen != wxCONV_FAILED )
508 {
509 wxCharBuffer buf(dstLen);
510 buf.data()[dstLen] = '\0';
511 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
512 {
513 // As above, in DoConvertMB2WC(), except that the length of the
514 // trailing NUL is variable in this case.
515 if ( srcLen == wxNO_LEN )
516 buf.shrink(dstLen - GetMBNulLen());
517
518 return buf;
519 }
520 }
521 }
522
523 return wxCharBuffer();
524 }
525
526 // ----------------------------------------------------------------------------
527 // wxMBConvLibc
528 // ----------------------------------------------------------------------------
529
MB2WC(wchar_t * buf,const char * psz,size_t n) const530 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
531 {
532 return wxMB2WC(buf, psz, n);
533 }
534
WC2MB(char * buf,const wchar_t * psz,size_t n) const535 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
536 {
537 return wxWC2MB(buf, psz, n);
538 }
539
540 // ----------------------------------------------------------------------------
541 // wxConvBrokenFileNames
542 // ----------------------------------------------------------------------------
543
544 #ifdef __UNIX__
545
wxConvBrokenFileNames(const wxString & charset)546 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
547 {
548 if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
549 wxStricmp(charset, wxT("UTF8")) == 0 )
550 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
551 else
552 m_conv = new wxCSConv(charset);
553 }
554
555 #endif // __UNIX__
556
557 // ----------------------------------------------------------------------------
558 // UTF-7
559 // ----------------------------------------------------------------------------
560
561 // Implementation (C) 2004 Fredrik Roubert
562 //
563 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
564
//
// BASE64 decoding table: indexed by the byte value, gives the 6 bit value of
// a BASE64 character or 0xff for the bytes not used by BASE64
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20..0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30..0x3f: '0'..'9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40..0x4f: 'A'..'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50..0x5f: 'P'..'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60..0x6f: 'a'..'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70..0x7f: 'p'..'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80..0xff: no BASE64 characters here
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
603
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const604 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
605 const char *src, size_t srcLen) const
606 {
607 DecoderState stateOrig,
608 *statePtr;
609 if ( srcLen == wxNO_LEN )
610 {
611 // convert the entire string, up to and including the trailing NUL
612 srcLen = strlen(src) + 1;
613
614 // when working on the entire strings we don't update nor use the shift
615 // state from the previous call
616 statePtr = &stateOrig;
617 }
618 else // when working with partial strings we do use the shift state
619 {
620 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
621
622 // also save the old state to be able to rollback to it on error
623 stateOrig = m_stateDecoder;
624 }
625
626 // but to simplify the code below we use this variable in both cases
627 DecoderState& state = *statePtr;
628
629
630 // number of characters [which would have been] written to dst [if it were
631 // not NULL]
632 size_t len = 0;
633
634 const char * const srcEnd = src + srcLen;
635
636 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
637 {
638 const unsigned char cc = *src++;
639
640 if ( state.IsShifted() )
641 {
642 const unsigned char dc = utf7unb64[cc];
643 if ( dc == 0xff )
644 {
645 // end of encoded part, check that nothing was left: there can
646 // be up to 4 bits of 0 padding but nothing else (we also need
647 // to check isLSB as we count bits modulo 8 while a valid UTF-7
648 // encoded sequence must contain an integral number of UTF-16
649 // characters)
650 if ( state.isLSB || state.bit > 4 ||
651 (state.accum & ((1 << state.bit) - 1)) )
652 {
653 if ( !len )
654 state = stateOrig;
655
656 return wxCONV_FAILED;
657 }
658
659 state.ToDirect();
660
661 // re-parse this character normally below unless it's '-' which
662 // is consumed by the decoder
663 if ( cc == '-' )
664 continue;
665 }
666 else // valid encoded character
667 {
668 // mini base64 decoder: each character is 6 bits
669 state.bit += 6;
670 state.accum <<= 6;
671 state.accum += dc;
672
673 if ( state.bit >= 8 )
674 {
675 // got the full byte, consume it
676 state.bit -= 8;
677 unsigned char b = (state.accum >> state.bit) & 0x00ff;
678
679 if ( state.isLSB )
680 {
681 // we've got the full word, output it
682 if ( dst )
683 *dst++ = (state.msb << 8) | b;
684 len++;
685 state.isLSB = false;
686 }
687 else // MSB
688 {
689 // just store it while we wait for LSB
690 state.msb = b;
691 state.isLSB = true;
692 }
693 }
694 }
695 }
696
697 if ( state.IsDirect() )
698 {
699 // start of an encoded segment?
700 if ( cc == '+' )
701 {
702 // Can't end with a plus sign.
703 if ( src == srcEnd )
704 return wxCONV_FAILED;
705
706 if ( *src == '-' )
707 {
708 // just the encoded plus sign, don't switch to shifted mode
709 if ( dst )
710 *dst++ = '+';
711 len++;
712 src++;
713 }
714 else if ( utf7unb64[(unsigned)*src] == 0xff )
715 {
716 // empty encoded chunks are not allowed
717 if ( !len )
718 state = stateOrig;
719
720 return wxCONV_FAILED;
721 }
722 else // base-64 encoded chunk follows
723 {
724 state.ToShifted();
725 }
726 }
727 else // not '+'
728 {
729 // only printable 7 bit ASCII characters (with the exception of
730 // NUL, TAB, CR and LF) can be used directly
731 if ( cc >= 0x7f || (cc < ' ' &&
732 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
733 return wxCONV_FAILED;
734
735 if ( dst )
736 *dst++ = cc;
737 len++;
738 }
739 }
740 }
741
742 if ( !len )
743 {
744 // as we didn't read any characters we should be called with the same
745 // data (followed by some more new data) again later so don't save our
746 // state
747 state = stateOrig;
748
749 return wxCONV_FAILED;
750 }
751
752 return len;
753 }
754
//
// BASE64 encoding table: maps a 6 bit value to the BASE64 character used to
// represent it
//
static const unsigned char utf7enb64[] =
{
    // 0..25: upper case letters
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',

    // 26..51: lower case letters
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',

    // 52..61: digits
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',

    // 62 and 63
    '+', '/'
};
769
//
// UTF-7 encoding table, indexed by the 7 bit ASCII character code
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// Returns true if the character can be written to UTF-7 output as is, i.e.
// if it is a 7 bit character marked with 0 in the table above.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t may be a signed type, so compare the value as unsigned: a
    // negative wc would otherwise pass the range check and index the table
    // out of bounds
    const unsigned long code = static_cast<unsigned long>(wc);

    return code < 0x80 && utf7encode[code] < 1;
}
794
// Encodes wide characters as UTF-7.
//
// If srcLen is wxNO_LEN the entire NUL-terminated string, including the
// trailing NUL, is converted starting from a fresh encoder state. Otherwise
// srcLen characters are converted starting from, and updating, the encoder
// state stored in this object.
//
// If dst is NULL only the output length is computed and the stored state is
// left unchanged.
//
// Returns the number of bytes [which would be] written or, in builds where
// wchar_t is not UTF-16, wxCONV_FAILED for characters above 0xffff.
//
// NOTE(review): dstLen is only tested in the loop condition, i.e. once per
// input character, while one iteration can write several bytes -- it looks
// like dst could be overrun if it is smaller than the length computed by a
// previous call with NULL dst; confirm that callers always size it that way.
size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
                               const wchar_t *src, size_t srcLen) const
{
    EncoderState stateOrig,
                *statePtr;
    if ( srcLen == wxNO_LEN )
    {
        // we don't apply the stored state when operating on entire strings at
        // once
        statePtr = &stateOrig;

        srcLen = wxWcslen(src) + 1;
    }
    else // do use the mode we left the output in previously
    {
        stateOrig = m_stateEncoder;
        statePtr = const_cast<EncoderState *>(&m_stateEncoder);
    }

    // to simplify the code below the same variable is used in both cases
    EncoderState& state = *statePtr;


    // number of bytes [which would have been] written to dst
    size_t len = 0;

    const wchar_t * const srcEnd = src + srcLen;
    while ( src < srcEnd && (!dst || len < dstLen) )
    {
        wchar_t cc = *src++;
        if ( wxIsUTF7Direct(cc) )
        {
            if ( state.IsShifted() )
            {
                // pad with zeros the last encoded block if necessary
                if ( state.bit )
                {
                    if ( dst )
                        *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
                    len++;
                }

                state.ToDirect();

                // explicitly terminate the encoded part
                if ( dst )
                    *dst++ = '-';
                len++;
            }

            // the character itself is output unchanged
            if ( dst )
                *dst++ = (char)cc;
            len++;
        }
        else if ( cc == '+' && state.IsDirect() )
        {
            // a literal plus sign is written as "+-" in direct mode
            if ( dst )
            {
                *dst++ = '+';
                *dst++ = '-';
            }

            len += 2;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            if ( state.IsDirect() )
            {
                state.ToShifted();

                // '+' starts the encoded part
                if ( dst )
                    *dst++ = '+';
                len++;
            }

            // BASE64 encode string
            for ( ;; )
            {
                // feed the high byte of the character first, then the low one
                for ( unsigned lsb = 0; lsb < 2; lsb++ )
                {
                    state.accum <<= 8;
                    state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                    // output all complete groups of 6 bits accumulated so far
                    for (state.bit += 8; state.bit >= 6; )
                    {
                        state.bit -= 6;
                        if ( dst )
                            *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
                        len++;
                    }
                }

                // stop at the end of input or at the next direct character,
                // which is left for the outer loop to handle
                if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
                    break;

                src++;
            }
        }
    }

    // we need to restore the original encoder state if we were called just to
    // calculate the amount of space needed as we will presumably be called
    // again to really convert the data now
    if ( !dst )
        state = stateOrig;

    return len;
}
907
908 // ----------------------------------------------------------------------------
909 // UTF-8
910 // ----------------------------------------------------------------------------
911
// NOTE(review): presumably the largest values representable by UTF-8
// sequences of 1 to 6 bytes, followed by a catch-all maximum -- the code using
// this table is not visible here, confirm before relying on this
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the
// characters which are invalid in a UTF-8 encoded string: the range covers
// 256 code points starting at wxUnicodePUA
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
919
// this table gives the total length, in bytes, of a UTF-8 sequence from its
// first byte; 0 is used for the bytes which can't start a valid sequence
extern const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: ASCII characters are encoded as a single byte
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: invalid as the first byte
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0..CF: C0 and C1 are invalid too, C2..CF start two-byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // D0..DF: two-byte sequences
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..FF: F0..F4 start four-byte sequences, F5..FF are invalid again
    // (5- or 6-byte sequences and sequences for code points above U+10FFFF,
    // as restricted by RFC 3629)
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
954
955 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const956 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
957 const char *src, size_t srcLen) const
958 {
959 wchar_t *out = dstLen ? dst : NULL;
960 size_t written = 0;
961
962 if ( srcLen == wxNO_LEN )
963 srcLen = strlen(src) + 1;
964
965 for ( const char *p = src; ; p++ )
966 {
967 if ( (srcLen == wxNO_LEN ? !*p : !srcLen) )
968 {
969 // all done successfully, just add the trailing NULL if we are not
970 // using explicit length
971 if ( srcLen == wxNO_LEN )
972 {
973 if ( out )
974 {
975 if ( !dstLen )
976 break;
977
978 *out = L'\0';
979 }
980
981 written++;
982 }
983
984 return written;
985 }
986
987 if ( out && !dstLen-- )
988 break;
989
990 wxUint32 code;
991 unsigned char c = *p;
992
993 if ( c < 0x80 )
994 {
995 if ( srcLen == 0 ) // the test works for wxNO_LEN too
996 break;
997
998 if ( srcLen != wxNO_LEN )
999 srcLen--;
1000
1001 code = c;
1002 }
1003 else
1004 {
1005 unsigned len = tableUtf8Lengths[c];
1006 if ( !len )
1007 break;
1008
1009 if ( srcLen < len ) // the test works for wxNO_LEN too
1010 break;
1011
1012 if ( srcLen != wxNO_LEN )
1013 srcLen -= len;
1014
1015 // Char. number range | UTF-8 octet sequence
1016 // (hexadecimal) | (binary)
1017 // ----------------------+----------------------------------------
1018 // 0000 0000 - 0000 007F | 0xxxxxxx
1019 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1020 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1021 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1022 //
1023 // Code point value is stored in bits marked with 'x',
1024 // lowest-order bit of the value on the right side in the diagram
1025 // above. (from RFC 3629)
1026
1027 // mask to extract lead byte's value ('x' bits above), by sequence
1028 // length:
1029 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1030
1031 len--; // it's more convenient to work with 0-based length here
1032
1033 code = c & leadValueMask[len];
1034
1035 // all remaining bytes, if any, are handled in the same way
1036 // regardless of sequence's length:
1037 for ( ; len; --len )
1038 {
1039 c = *++p;
1040 if ( (c & 0xC0) != 0x80 )
1041 return wxCONV_FAILED;
1042
1043 code <<= 6;
1044 code |= c & 0x3F;
1045 }
1046 }
1047
1048 #ifdef WC_UTF16
1049 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1050 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1051 {
1052 if ( out )
1053 out++;
1054 written++;
1055 }
1056 #else // !WC_UTF16
1057 if ( out )
1058 *out = code;
1059 #endif // WC_UTF16/!WC_UTF16
1060
1061 if ( out )
1062 out++;
1063
1064 written++;
1065 }
1066
1067 return wxCONV_FAILED;
1068 }
1069
// Encode the wide string src as UTF-8.
//
// dst/dstLen: output buffer and its size in bytes; if dstLen is 0 nothing is
//             written and only the required size is computed.
// src/srcLen: input and its length in wchar_t units, or wxNO_LEN if src is
//             NUL-terminated (the terminating NUL is then converted and
//             counted too).
//
// Returns the number of bytes (that would be) written or wxCONV_FAILED if
// the input can't be encoded or the output buffer is too small.
size_t
wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
                              const wchar_t *src, size_t srcLen) const
{
    // marker bits of the lead byte, indexed by (sequence length - 1)
    static const unsigned char leadMarker[] = { 0x00, 0xC0, 0xE0, 0xF0 };

    char *cur = dstLen ? dst : NULL;
    size_t total = 0;

    const bool nulTerminated = srcLen == wxNO_LEN;
    const wchar_t * const srcEnd = nulTerminated ? NULL : src + srcLen;

    const wchar_t *in = src;
    while ( srcEnd ? in != srcEnd : *in != 0 )
    {
        wxUint32 cp;
#ifdef WC_UTF16
        cp = wxDecodeSurrogate(&in, srcEnd);
        if ( !in )
            return wxCONV_FAILED;
#else // wchar_t is UTF-32
        cp = *in++ & 0x7fffffff;
#endif

        // determine how many bytes this code point needs (see RFC 3629)
        unsigned seqLen;
        if ( cp <= 0x7F )
            seqLen = 1;
        else if ( cp <= 0x07FF )
            seqLen = 2;
        else if ( cp <= 0xFFFF )
            seqLen = 3;
        else if ( cp <= 0x10FFFF )
            seqLen = 4;
        else
        {
            wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
            return wxCONV_FAILED;
        }

        if ( cur )
        {
            if ( dstLen < seqLen )
                return wxCONV_FAILED;

            // fill the continuation bytes from the last one backwards, each
            // of them takes the 6 least significant bits still remaining
            for ( unsigned i = seqLen - 1; i > 0; --i )
            {
                cur[i] = (char)(0x80 | (cp & 0x3F));
                cp >>= 6;
            }

            // whatever is left goes into the lead byte
            cur[0] = (char)(leadMarker[seqLen - 1] | cp);

            cur += seqLen;
            dstLen -= seqLen;
        }

        total += seqLen;
    }

    // all input consumed successfully: the trailing NUL is only added (and
    // counted) if we were not given an explicit length
    if ( nulTerminated )
    {
        if ( cur )
        {
            if ( !dstLen )
                return wxCONV_FAILED;

            *cur = '\0';
        }

        total++;
    }

    return total;
}
1180
// Convert UTF-8 input to wide characters, optionally mapping bytes which
// don't form valid UTF-8 to something else instead of failing.
//
// buf/n:      output buffer and its size in wchar_t units; if buf is NULL
//             only the length of the result is computed.
// psz/srcLen: input bytes and their number, or wxNO_LEN for NUL-terminated
//             input.
//
// Behaviour depends on m_options:
//  - MAP_INVALID_UTF8_NOT: simply forwards to wxMBConvStrictUTF8::ToWChar().
//  - MAP_INVALID_UTF8_TO_PUA: each byte of an invalid sequence is output as
//    wxUnicodePUA + byte value.
//  - MAP_INVALID_UTF8_TO_OCTAL: each byte of an invalid sequence is output
//    as a backslash followed by 3 octal digits, and a literal backslash is
//    doubled.
//
// Returns the number of wide characters produced (including the trailing NUL
// for NUL-terminated input) or wxCONV_FAILED.
size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
                             const char *psz, size_t srcLen) const
{
    if ( m_options == MAP_INVALID_UTF8_NOT )
        return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);

    size_t len = 0;

    // The length can be either given explicitly or computed implicitly for the
    // NUL-terminated strings.
    const bool isNulTerminated = srcLen == wxNO_LEN;

    // Note that with an explicit length, srcLen is decremented here for the
    // lead byte and below for each continuation byte consumed. The loop also
    // stops as soon as the output buffer (if any) is full.
    while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
    {
        // remember the start of the sequence to be able to replay its bytes
        // if it turns out to be invalid
        const char *opsz = psz;
        unsigned char cc = *psz++, fc = cc;

        // count the number of leading 1 bits in the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;

        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            bool invalid = false;

            // from now on cnt is the number of continuation bytes expected
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                // (a byte of the form 10xxxxxx can't start a sequence)
                invalid = true;
            }
            else
            {
                // index into utf8_max used for the check after the loop
                unsigned ocnt = cnt - 1;

                // payload bits of the lead byte
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    if (!isNulTerminated && !srcLen)
                    {
                        // invalid UTF-8 sequence ending before the end of code
                        // point.
                        invalid = true;
                        break;
                    }

                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        // (psz is not advanced, so this byte will be examined
                        // again as the start of the next sequence)
                        invalid = true;
                        break;
                    }

                    psz++;
                    if (!isNulTerminated)
                        srcLen--;
                    res = (res << 6) | (cc & 0x3f);
                }

                // NOTE(review): utf8_max is defined outside of this chunk;
                // presumably utf8_max[i] is the biggest value encodable in
                // i + 1 bytes, making this a check for overlong encodings --
                // confirm against its definition.
                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == wxCONV_FAILED)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }

            if (invalid)
            {
                // replay all the bytes consumed for this sequence, i.e. the
                // range [opsz, psz), using the configured mapping
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != wxCONV_FAILED);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the 4 characters of the escape are only written if
                        // they all fit, but len is advanced by 4 in any case
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }

                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return wxCONV_FAILED;
                }
            }
        }
    }

    if ( isNulTerminated )
    {
        // Add the trailing NUL in this case if we have a large enough buffer.
        if ( buf && (len < n) )
            *buf = 0;

        // And count it in any case.
        len++;
    }

    return len;
}
1348
// Return true if the given wide character is an octal digit, i.e. is in
// the '0'..'7' range.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1353
// Convert wide characters to UTF-8, undoing the mapping of invalid bytes
// optionally performed by wxMBConvUTF8::ToWChar().
//
// buf/n:      output buffer and its size in bytes; if buf is NULL only the
//             length of the result is computed.
// psz/srcLen: input and its length in wchar_t units, or wxNO_LEN for
//             NUL-terminated input.
//
// Behaviour depends on m_options:
//  - MAP_INVALID_UTF8_NOT: simply forwards to wxMBConvStrictUTF8::FromWChar().
//  - MAP_INVALID_UTF8_TO_PUA: characters in [wxUnicodePUA, wxUnicodePUAEnd)
//    are output as the single byte (character - wxUnicodePUA).
//  - MAP_INVALID_UTF8_TO_OCTAL: a doubled backslash is output as a single
//    one and a backslash followed by 3 octal digits as the corresponding
//    byte.
//
// Returns the number of bytes produced (including the trailing NUL for
// NUL-terminated input) or wxCONV_FAILED if the input is invalid or a
// multi-byte sequence doesn't fit into the remaining part of the buffer.
size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
                               const wchar_t *psz, size_t srcLen) const
{
    if ( m_options == MAP_INVALID_UTF8_NOT )
        return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);

    size_t len = 0;

    // The length can be either given explicitly or computed implicitly for the
    // NUL-terminated strings.
    const wchar_t* const end = srcLen == wxNO_LEN ? NULL : psz + srcLen;
    while ((end ? psz < end : *psz) && ((!buf) || (len < n)))
    {
        wxUint32 cc;

#ifdef WC_UTF16
        cc = wxDecodeSurrogate(&psz, end);
        if ( !psz )
            return wxCONV_FAILED;
#else
        cc = (*psz++) & 0x7fffffff;
#endif

        // The escape handling below needs to look at the characters
        // following the current one. For NUL-terminated input this is always
        // safe because the comparisons stop at the terminating NUL, but with
        // an explicit length we must not read beyond the end of the input.
        const bool canPeek1 = !end || psz < end;
        const bool canPeek3 = !end || end - psz >= 3;

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // restore the original byte from its PUA representation
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && canPeek1 && psz[0] == L'\\' )
        {
            // escaped backslash: output a single one and skip the second
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' && canPeek3 &&
                    isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // octal escape: output the byte it stands for
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0') * 0100 +
                                 (psz[1] - L'0') * 010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            // find the number of continuation bytes needed: the character
            // takes cnt + 1 bytes in total
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++)
            {
            }

            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }
            else
            {
                // The loop condition only guarantees that there is room for
                // a single byte (len < n here whenever buf is not NULL), so
                // check that the entire sequence fits before writing it.
                if ( buf && cnt + 1 > n - len )
                    return wxCONV_FAILED;

                len += cnt + 1;
                if (buf)
                {
                    // lead byte: cnt + 1 high bits set followed by the most
                    // significant bits of the character
                    *buf++ = (char) ((~0x7fu >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));

                    // continuation bytes, 6 bits each, most significant first
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    if ( !end )
    {
        // Add the trailing NUL in this case if we have a large enough buffer.
        if ( buf && (len < n) )
            *buf = 0;

        // And count it in any case.
        len++;
    }

    return len;
}
1445
1446 // ============================================================================
1447 // UTF-16
1448 // ============================================================================
1449
// The implementations below are written in terms of "straight" and "swap"
// converters: when WORDS_BIGENDIAN is defined, "straight" is the big endian
// class and "swap" the little endian one, and otherwise it is the other way
// round.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
1457
// Return the length of the input in bytes.
//
// If srcLen is wxNO_LEN, the input is scanned for the terminating 16-bit
// zero, which is included in the returned length. Otherwise srcLen itself is
// returned, unless it is not a multiple of BYTES_PER_CHAR, in which case
// wxCONV_FAILED is returned.
/* static */
size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
{
    if ( srcLen != wxNO_LEN )
    {
        // we can only convert an entire number of UTF-16 characters
        if ( srcLen % BYTES_PER_CHAR != 0 )
            return wxCONV_FAILED;

        return srcLen;
    }

    // count the 16-bit units up to and including the terminating one
    const wxUint16 *unit = reinterpret_cast<const wxUint16 *>(src);
    size_t numUnits = 1;
    while ( *unit++ )
        ++numUnits;

    return numUnits * BYTES_PER_CHAR;
}
1479
1480 // case when in-memory representation is UTF-16 too
1481 #ifdef WC_UTF16
1482
1483 // ----------------------------------------------------------------------------
1484 // conversions without endianness change
1485 // ----------------------------------------------------------------------------
1486
// UTF-16 in host byte order to UTF-16 wchar_t: a plain copy.
//
// Returns the number of wchar_t units (that would be) written or
// wxCONV_FAILED if the input length is invalid or dst is too small.
size_t
wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
                               const char *src, size_t srcLen) const
{
    const size_t numBytes = GetLength(src, srcLen);
    if ( numBytes == wxNO_LEN )
        return wxCONV_FAILED;

    const size_t numChars = numBytes / BYTES_PER_CHAR;

    // without output buffer we're only asked for the length
    if ( !dst )
        return numChars;

    if ( numChars > dstLen )
        return wxCONV_FAILED;

    // no conversion is needed, so copy everything in one go
    memcpy(dst, src, numBytes);

    return numChars;
}
1508
// UTF-16 wchar_t to UTF-16 in host byte order: a plain copy.
//
// If srcLen is wxNO_LEN, src is NUL-terminated and the NUL is copied too.
// Returns the number of bytes (that would be) written or wxCONV_FAILED if
// dst is too small.
size_t
wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
                                 const wchar_t *src, size_t srcLen) const
{
    const size_t numChars = srcLen == wxNO_LEN ? wxWcslen(src) + 1 : srcLen;
    const size_t numBytes = numChars * BYTES_PER_CHAR;

    if ( dst )
    {
        if ( numBytes > dstLen )
            return wxCONV_FAILED;

        memcpy(dst, src, numBytes);
    }

    return numBytes;
}
1528
1529 // ----------------------------------------------------------------------------
1530 // endian-reversing conversions
1531 // ----------------------------------------------------------------------------
1532
// UTF-16 in non-host byte order to UTF-16 wchar_t: swap the bytes of each
// 16-bit unit.
//
// Returns the number of wchar_t units (that would be) written or
// wxCONV_FAILED if the input length is invalid or dst is too small.
size_t
wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
                           const char *src, size_t srcLen) const
{
    const size_t numBytes = GetLength(src, srcLen);
    if ( numBytes == wxNO_LEN )
        return wxCONV_FAILED;

    const size_t numChars = numBytes / BYTES_PER_CHAR;

    // without output buffer we're only asked for the length
    if ( !dst )
        return numChars;

    if ( numChars > dstLen )
        return wxCONV_FAILED;

    const wxUint16 * const units = reinterpret_cast<const wxUint16 *>(src);
    for ( size_t i = 0; i != numChars; ++i )
        dst[i] = wxUINT16_SWAP_ALWAYS(units[i]);

    return numChars;
}
1557
// UTF-16 wchar_t to UTF-16 in non-host byte order: swap the bytes of each
// 16-bit unit.
//
// If srcLen is wxNO_LEN, src is NUL-terminated and the NUL is converted too.
// Returns the number of bytes (that would be) written or wxCONV_FAILED if
// dst is too small.
size_t
wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen) const
{
    const size_t numChars = srcLen == wxNO_LEN ? wxWcslen(src) + 1 : srcLen;
    const size_t numBytes = numChars * BYTES_PER_CHAR;

    // without output buffer we're only asked for the length
    if ( !dst )
        return numBytes;

    if ( numBytes > dstLen )
        return wxCONV_FAILED;

    wxUint16 * const units = reinterpret_cast<wxUint16 *>(dst);
    size_t idx = 0;
    for ( size_t ofs = 0; ofs < numBytes; ofs += BYTES_PER_CHAR, ++idx )
        units[idx] = wxUINT16_SWAP_ALWAYS(src[idx]);

    return numBytes;
}
1581
1582 #else // !WC_UTF16: wchar_t is UTF-32
1583
1584 // ----------------------------------------------------------------------------
1585 // conversions without endianness change
1586 // ----------------------------------------------------------------------------
1587
// UTF-16 in host byte order to UTF-32 wchar_t: each 16-bit unit or
// surrogate pair is decoded by wxDecodeSurrogate() into one wchar_t.
//
// Returns the number of wchar_t (that would be) written or wxCONV_FAILED if
// the input is invalid or dst is too small.
size_t
wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
                               const char *src, size_t srcLen) const
{
    const size_t numBytes = GetLength(src, srcLen);
    if ( numBytes == wxNO_LEN )
        return wxCONV_FAILED;

    const wxUint16 *in = reinterpret_cast<const wxUint16 *>(src);
    const wxUint16 * const inEnd = in + numBytes / BYTES_PER_CHAR;

    size_t count = 0;
    while ( in < inEnd )
    {
        // this advances "in" by 1 or 2 units or sets it to NULL on error
        const wxUint32 cp = wxDecodeSurrogate(&in, inEnd);
        if ( !in )
            return wxCONV_FAILED;

        if ( dst )
        {
            if ( count >= dstLen )
                return wxCONV_FAILED;

            dst[count] = cp;
        }

        ++count;
    }

    return count;
}
1619
// UTF-32 wchar_t to UTF-16 in host byte order: each character is encoded
// by encode_utf16() as one or two 16-bit units.
//
// If srcLen is wxNO_LEN, src is NUL-terminated and the NUL is converted too.
// Returns the number of bytes (that would be) written or wxCONV_FAILED if a
// character can't be encoded or dst is too small.
size_t
wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
                                 const wchar_t *src, size_t srcLen) const
{
    const size_t numChars = srcLen == wxNO_LEN ? wxWcslen(src) + 1 : srcLen;

    wxUint16 *out = reinterpret_cast<wxUint16 *>(dst);
    size_t numBytes = 0;
    for ( size_t i = 0; i != numChars; ++i )
    {
        wxUint16 units[2] = { 0 };
        const size_t numUnits = encode_utf16(src[i], units);
        if ( numUnits == wxCONV_FAILED )
            return wxCONV_FAILED;

        numBytes += numUnits * BYTES_PER_CHAR;

        // nothing more to do if we're only computing the length
        if ( !out )
            continue;

        if ( numBytes > dstLen )
            return wxCONV_FAILED;

        *out++ = units[0];

        // second character of a surrogate
        if ( numUnits == 2 )
            *out++ = units[1];
    }

    return numBytes;
}
1653
1654 // ----------------------------------------------------------------------------
1655 // endian-reversing conversions
1656 // ----------------------------------------------------------------------------
1657
// UTF-16 in non-host byte order to UTF-32 wchar_t.
//
// As wxDecodeSurrogate() works on host byte order data, up to two units are
// byte-swapped into a temporary buffer and decoded from there.
//
// Returns the number of wchar_t (that would be) written or wxCONV_FAILED if
// the input is invalid or dst is too small.
size_t
wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
                           const char *src, size_t srcLen) const
{
    const size_t numBytes = GetLength(src, srcLen);
    if ( numBytes == wxNO_LEN )
        return wxCONV_FAILED;

    const wxUint16 *in = reinterpret_cast<const wxUint16 *>(src);
    const wxUint16 * const inEnd = in + numBytes / BYTES_PER_CHAR;

    size_t count = 0;
    while ( in < inEnd )
    {
        // there is always at least one unit available here and there may be
        // a second one unless we're at the very end of the input
        wxUint16 pair[2];
        size_t avail = 1;
        pair[0] = wxUINT16_SWAP_ALWAYS(in[0]);
        if ( in + 1 < inEnd )
        {
            pair[1] = wxUINT16_SWAP_ALWAYS(in[1]);
            avail = 2;
        }

        const wxUint16 *p = pair;
        const wxUint32 cp = wxDecodeSurrogate(&p, pair + avail);
        if ( !p )
            return wxCONV_FAILED;

        // consume as many units of the real input as were decoded
        in += p - pair;

        if ( dst )
        {
            if ( count >= dstLen )
                return wxCONV_FAILED;

            dst[count] = cp;
        }

        ++count;
    }

    return count;
}
1706
// UTF-32 wchar_t to UTF-16 in non-host byte order: each character is
// encoded by encode_utf16() and the resulting units are byte-swapped.
//
// If srcLen is wxNO_LEN, src is NUL-terminated and the NUL is converted too.
// Returns the number of bytes (that would be) written or wxCONV_FAILED if a
// character can't be encoded or dst is too small.
size_t
wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen) const
{
    const size_t numChars = srcLen == wxNO_LEN ? wxWcslen(src) + 1 : srcLen;
    const wchar_t * const srcEnd = src + numChars;

    wxUint16 *out = reinterpret_cast<wxUint16 *>(dst);
    size_t numBytes = 0;
    while ( src < srcEnd )
    {
        wxUint16 units[2] = { 0 };
        const size_t numUnits = encode_utf16(*src, units);
        if ( numUnits == wxCONV_FAILED )
            return wxCONV_FAILED;

        ++src;
        numBytes += numUnits * BYTES_PER_CHAR;

        // nothing more to do if we're only computing the length
        if ( !out )
            continue;

        if ( numBytes > dstLen )
            return wxCONV_FAILED;

        *out++ = wxUINT16_SWAP_ALWAYS(units[0]);

        // second character of a surrogate
        if ( numUnits == 2 )
            *out++ = wxUINT16_SWAP_ALWAYS(units[1]);
    }

    return numBytes;
}
1740
1741 #endif // WC_UTF16/!WC_UTF16
1742
1743
1744 // ============================================================================
1745 // UTF-32
1746 // ============================================================================
1747
// As for UTF-16 above: when WORDS_BIGENDIAN is defined, "straight" is the
// big endian class and "swap" the little endian one, and otherwise it is the
// other way round.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// objects of the two UTF-32 converter classes, one per byte order
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1759
// Return the length of the input in bytes.
//
// If srcLen is wxNO_LEN, the input is scanned for the terminating 32-bit
// zero, which is included in the returned length. Otherwise srcLen itself is
// returned, unless it is not a multiple of BYTES_PER_CHAR, in which case
// wxCONV_FAILED is returned.
/* static */
size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
{
    if ( srcLen != wxNO_LEN )
    {
        // we can only convert an entire number of UTF-32 characters
        if ( srcLen % BYTES_PER_CHAR != 0 )
            return wxCONV_FAILED;

        return srcLen;
    }

    // count the 32-bit units up to and including the terminating one
    const wxUint32 *unit = reinterpret_cast<const wxUint32 *>(src);
    size_t numUnits = 1;
    while ( *unit++ )
        ++numUnits;

    return numUnits * BYTES_PER_CHAR;
}
1781
1782 // case when in-memory representation is UTF-16
1783 #ifdef WC_UTF16
1784
1785 // ----------------------------------------------------------------------------
1786 // conversions without endianness change
1787 // ----------------------------------------------------------------------------
1788
// UTF-32 in host byte order to UTF-16 wchar_t: each 32-bit value is encoded
// by encode_utf16() as one or two wchar_t.
//
// Returns the number of wchar_t (that would be) written or wxCONV_FAILED if
// the input is invalid or dst is too small.
size_t
wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
                               const char *src, size_t srcLen) const
{
    const size_t numBytes = GetLength(src, srcLen);
    if ( numBytes == wxNO_LEN )
        return wxCONV_FAILED;

    const wxUint32 *in = reinterpret_cast<const wxUint32 *>(src);
    const wxUint32 * const inEnd = in + numBytes / BYTES_PER_CHAR;

    size_t count = 0;
    for ( ; in != inEnd; ++in )
    {
        wxUint16 units[2] = { 0 };
        const size_t numUnits = encode_utf16(*in, units);
        if ( numUnits == wxCONV_FAILED )
            return wxCONV_FAILED;

        count += numUnits;

        // nothing more to do if we're only computing the length
        if ( !dst )
            continue;

        if ( count > dstLen )
            return wxCONV_FAILED;

        *dst++ = units[0];

        // second character of a surrogate
        if ( numUnits == 2 )
            *dst++ = units[1];
    }

    return count;
}
1824
// UTF-16 wchar_t to UTF-32 in host byte order: each wchar_t or surrogate
// pair is decoded by wxDecodeSurrogate() into one 32-bit value.
//
// If srcLen is wxNO_LEN, src is NUL-terminated and the NUL is converted too.
// Returns the number of bytes (that would be) written or wxCONV_FAILED if
// the input is invalid or dst is too small.
size_t
wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
                                 const wchar_t *src, size_t srcLen) const
{
    const size_t numChars = srcLen == wxNO_LEN ? wxWcslen(src) + 1 : srcLen;
    const wchar_t * const srcEnd = src + numChars;

    wxUint32 *out = reinterpret_cast<wxUint32 *>(dst);
    size_t numBytes = 0;
    while ( src < srcEnd )
    {
        // this advances src by 1 or 2 units or sets it to NULL on error
        const wxUint32 cp = wxDecodeSurrogate(&src, srcEnd);
        if ( !src )
            return wxCONV_FAILED;

        numBytes += BYTES_PER_CHAR;

        if ( out )
        {
            if ( numBytes > dstLen )
                return wxCONV_FAILED;

            *out++ = cp;
        }
    }

    return numBytes;
}
1853
1854 // ----------------------------------------------------------------------------
1855 // endian-reversing conversions
1856 // ----------------------------------------------------------------------------
1857
// UTF-32 in non-host byte order to UTF-16 wchar_t: each 32-bit value is
// byte-swapped and then encoded by encode_utf16() as one or two wchar_t.
//
// Returns the number of wchar_t (that would be) written or wxCONV_FAILED if
// the input is invalid or dst is too small.
size_t
wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
                           const char *src, size_t srcLen) const
{
    const size_t numBytes = GetLength(src, srcLen);
    if ( numBytes == wxNO_LEN )
        return wxCONV_FAILED;

    const wxUint32 *in = reinterpret_cast<const wxUint32 *>(src);
    const wxUint32 * const inEnd = in + numBytes / BYTES_PER_CHAR;

    size_t count = 0;
    for ( ; in != inEnd; ++in )
    {
        wxUint16 units[2] = { 0 };
        const size_t numUnits = encode_utf16(wxUINT32_SWAP_ALWAYS(*in), units);
        if ( numUnits == wxCONV_FAILED )
            return wxCONV_FAILED;

        count += numUnits;

        // nothing more to do if we're only computing the length
        if ( !dst )
            continue;

        if ( count > dstLen )
            return wxCONV_FAILED;

        *dst++ = units[0];

        // second character of a surrogate
        if ( numUnits == 2 )
            *dst++ = units[1];
    }

    return count;
}
1893
// UTF-16 wchar_t to UTF-32 in non-host byte order: each wchar_t or
// surrogate pair is decoded by wxDecodeSurrogate() and the resulting 32-bit
// value is byte-swapped.
//
// If srcLen is wxNO_LEN, src is NUL-terminated and the NUL is converted too.
// Returns the number of bytes (that would be) written or wxCONV_FAILED if
// the input is invalid or dst is too small.
size_t
wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen) const
{
    const size_t numChars = srcLen == wxNO_LEN ? wxWcslen(src) + 1 : srcLen;
    const wchar_t * const srcEnd = src + numChars;

    wxUint32 *out = reinterpret_cast<wxUint32 *>(dst);
    size_t numBytes = 0;
    while ( src < srcEnd )
    {
        // this advances src by 1 or 2 units or sets it to NULL on error
        const wxUint32 cp = wxDecodeSurrogate(&src, srcEnd);
        if ( !src )
            return wxCONV_FAILED;

        numBytes += BYTES_PER_CHAR;

        if ( out )
        {
            if ( numBytes > dstLen )
                return wxCONV_FAILED;

            *out++ = wxUINT32_SWAP_ALWAYS(cp);
        }
    }

    return numBytes;
}
1922
1923 #else // !WC_UTF16: wchar_t is UTF-32
1924
1925 // ----------------------------------------------------------------------------
1926 // conversions without endianness change
1927 // ----------------------------------------------------------------------------
1928
// UTF-32 in host byte order to UTF-32 wchar_t: a plain copy.
//
// Returns the number of wchar_t (that would be) written or wxCONV_FAILED if
// the input length is invalid or dst is too small.
size_t
wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
                               const char *src, size_t srcLen) const
{
    const size_t numBytes = GetLength(src, srcLen);
    if ( numBytes == wxNO_LEN )
        return wxCONV_FAILED;

    const size_t numChars = numBytes / BYTES_PER_CHAR;

    // without output buffer we're only asked for the length
    if ( !dst )
        return numChars;

    if ( numChars > dstLen )
        return wxCONV_FAILED;

    // no conversion is needed, so copy everything in one go
    memcpy(dst, src, numBytes);

    return numChars;
}
1949
// UTF-32 wchar_t to UTF-32 in host byte order: a plain copy.
//
// If srcLen is wxNO_LEN, src is NUL-terminated and the NUL is copied too.
// Returns the number of bytes (that would be) written or wxCONV_FAILED if
// dst is too small.
size_t
wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
                                 const wchar_t *src, size_t srcLen) const
{
    const size_t numChars = srcLen == wxNO_LEN ? wxWcslen(src) + 1 : srcLen;
    const size_t numBytes = numChars * BYTES_PER_CHAR;

    if ( dst )
    {
        if ( numBytes > dstLen )
            return wxCONV_FAILED;

        memcpy(dst, src, numBytes);
    }

    return numBytes;
}
1969
1970 // ----------------------------------------------------------------------------
1971 // endian-reversing conversions
1972 // ----------------------------------------------------------------------------
1973
// UTF-32 in non-host byte order to UTF-32 wchar_t: swap the bytes of each
// 32-bit value.
//
// Returns the number of wchar_t (that would be) written or wxCONV_FAILED if
// the input length is invalid or dst is too small.
size_t
wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
                           const char *src, size_t srcLen) const
{
    const size_t numBytes = GetLength(src, srcLen);
    if ( numBytes == wxNO_LEN )
        return wxCONV_FAILED;

    const size_t numChars = numBytes / BYTES_PER_CHAR;

    // without output buffer we're only asked for the length
    if ( !dst )
        return numChars;

    if ( numChars > dstLen )
        return wxCONV_FAILED;

    const wxUint32 * const units = reinterpret_cast<const wxUint32 *>(src);
    for ( size_t i = 0; i != numChars; ++i )
        dst[i] = wxUINT32_SWAP_ALWAYS(units[i]);

    return numChars;
}
1998
// UTF-32 wchar_t to UTF-32 in non-host byte order: swap the bytes of each
// 32-bit value.
//
// If srcLen is wxNO_LEN, src is NUL-terminated and the NUL is converted too.
// Returns the number of bytes (that would be) written or wxCONV_FAILED if
// dst is too small.
size_t
wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen) const
{
    const size_t numChars = srcLen == wxNO_LEN ? wxWcslen(src) + 1 : srcLen;
    const size_t numBytes = numChars * BYTES_PER_CHAR;

    // without output buffer we're only asked for the length
    if ( !dst )
        return numBytes;

    if ( numBytes > dstLen )
        return wxCONV_FAILED;

    wxUint32 * const units = reinterpret_cast<wxUint32 *>(dst);
    size_t idx = 0;
    for ( size_t ofs = 0; ofs < numBytes; ofs += BYTES_PER_CHAR, ++idx )
        units[idx] = wxUINT32_SWAP_ALWAYS(src[idx]);

    return numBytes;
}
2022
2023 #endif // WC_UTF16/!WC_UTF16
2024
2025
2026 // ============================================================================
2027 // The classes doing conversion using the iconv_xxx() functions
2028 // ============================================================================
2029
2030 #ifdef HAVE_ICONV
2031
2032 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2033 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
2034 // (unless there's yet another bug in glibc) the only case when iconv()
2035 // returns with (size_t)-1 (which means error) and says there are 0 bytes
2036 // left in the input buffer -- when _real_ error occurs,
2037 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2038 // iconv() failure.
2039 // [This bug does not appear in glibc 2.2.]
2040 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2041 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2042 (errno != E2BIG || bufLeft != 0))
2043 #else
2044 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2045 #endif
2046
// cast the address of the input pointer to the "ICONV_CONST char**" type,
// this is used for the input buffer argument in the calls to iconv() below
#define ICONV_CHAR_CAST(x) const_cast<ICONV_CONST char**>(x)

// value of an iconv_t which doesn't correspond to a successfully opened
// conversion descriptor
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP is the byte-swapping macro for a value of wchar_t size and WC_ENC
// the wxFontEncoding constant selected for this size of wchar_t
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
2061
2062 // ----------------------------------------------------------------------------
2063 // wxMBConv_iconv: encapsulates an iconv character set
2064 // ----------------------------------------------------------------------------
2065
// Converter between the multibyte encoding with the given iconv name and
// wchar_t, implemented using a pair of iconv conversion descriptors.
//
// Construction can fail to open the descriptors, use IsOk() to check
// whether the object is usable.
class wxMBConv_iconv : public wxMBConv
{
public:
    // name is the name of the multibyte charset as understood by
    // iconv_open(); the string is copied
    wxMBConv_iconv(const char *name);
    virtual ~wxMBConv_iconv();

    // implement base class virtual methods
    virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
                           const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
    virtual size_t FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
    virtual size_t GetMBNulLen() const wxOVERRIDE;

    virtual bool IsUTF8() const wxOVERRIDE;

    // create a new converter for the same charset name, it opens its own
    // iconv descriptors but inherits the cached value of m_minMBCharWidth
    virtual wxMBConv *Clone() const wxOVERRIDE
    {
        wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
        p->m_minMBCharWidth = m_minMBCharWidth;
        return p;
    }

    // return true if both conversion descriptors were opened successfully
    bool IsOk() const
        { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }

protected:
    // the iconv handlers used to translate from multibyte
    // to wide char and in the other direction
    iconv_t m2w,
            w2m;

#if wxUSE_THREADS
    // guards access to m2w and w2m objects
    wxMutex m_iconvMutex;
#endif

private:
    // the name (for iconv_open()) of a wide char charset -- if none is
    // available on this machine, it will remain empty
    static wxString ms_wcCharsetName;

    // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
    // different endian-ness than the native one
    static bool ms_wcNeedsSwap;


    // name of the encoding handled by this conversion: this is a copy made
    // with wxStrdup() in the ctor and released with free() in the dtor
    const char *m_name;

    // cached result of GetMBNulLen(); set to 0 meaning "unknown"
    // initially
    size_t m_minMBCharWidth;
};
2119
2120 // make the constructor available for unit testing
new_wxMBConv_iconv(const char * name)2121 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2122 {
2123 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2124 if ( !result->IsOk() )
2125 {
2126 delete result;
2127 return 0;
2128 }
2129
2130 return result;
2131 }
2132
// Definitions of the static members: the wide char charset name is empty
// until the constructor finds a working one, no byte swapping is assumed to
// be needed until the constructor detects otherwise.
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2135
wxMBConv_iconv(const char * name)2136 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2137 : m_name(wxStrdup(name))
2138 {
2139 m_minMBCharWidth = 0;
2140
2141 // check for charset that represents wchar_t:
2142 if ( ms_wcCharsetName.empty() )
2143 {
2144 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
2145
2146 #if wxUSE_FONTMAP
2147 const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2148 #else // !wxUSE_FONTMAP
2149 static const wxChar *const names_static[] =
2150 {
2151 #if SIZEOF_WCHAR_T == 4
2152 wxT("UCS-4"),
2153 #elif SIZEOF_WCHAR_T == 2
2154 wxT("UCS-2"),
2155 #endif
2156 NULL
2157 };
2158 const wxChar *const *names = names_static;
2159 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2160
2161 for ( ; *names && ms_wcCharsetName.empty(); ++names )
2162 {
2163 const wxString nameCS(*names);
2164
2165 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2166 wxString nameXE(nameCS);
2167
2168 #ifdef WORDS_BIGENDIAN
2169 nameXE += wxT("BE");
2170 #else // little endian
2171 nameXE += wxT("LE");
2172 #endif
2173
2174 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2175 nameXE.c_str());
2176
2177 m2w = iconv_open(nameXE.ToAscii(), name);
2178 if ( m2w == ICONV_T_INVALID )
2179 {
2180 // try charset w/o bytesex info (e.g. "UCS4")
2181 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2182 nameCS.c_str());
2183 m2w = iconv_open(nameCS.ToAscii(), name);
2184
2185 // and check for bytesex ourselves:
2186 if ( m2w != ICONV_T_INVALID )
2187 {
2188 char buf[2], *bufPtr;
2189 wchar_t wbuf[2];
2190 size_t insz, outsz;
2191 size_t res;
2192
2193 buf[0] = 'A';
2194 buf[1] = 0;
2195 wbuf[0] = 0;
2196 insz = 2;
2197 outsz = SIZEOF_WCHAR_T * 2;
2198 char* wbufPtr = (char*)wbuf;
2199 bufPtr = buf;
2200
2201 res = iconv(
2202 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2203 &wbufPtr, &outsz);
2204
2205 if (ICONV_FAILED(res, insz))
2206 {
2207 wxLogLastError(wxT("iconv"));
2208 wxLogError(_("Conversion to charset '%s' doesn't work."),
2209 nameCS.c_str());
2210 }
2211 else // ok, can convert to this encoding, remember it
2212 {
2213 ms_wcCharsetName = nameCS;
2214 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2215 }
2216 }
2217 }
2218 else // use charset not requiring byte swapping
2219 {
2220 ms_wcCharsetName = nameXE;
2221 }
2222 }
2223
2224 wxLogTrace(TRACE_STRCONV,
2225 wxT("iconv wchar_t charset is \"%s\"%s"),
2226 ms_wcCharsetName.empty() ? wxString("<none>")
2227 : ms_wcCharsetName,
2228 ms_wcNeedsSwap ? wxT(" (needs swap)")
2229 : wxT(""));
2230 }
2231 else // we already have ms_wcCharsetName
2232 {
2233 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2234 }
2235
2236 if ( ms_wcCharsetName.empty() )
2237 {
2238 w2m = ICONV_T_INVALID;
2239 }
2240 else
2241 {
2242 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2243 if ( w2m == ICONV_T_INVALID )
2244 {
2245 wxLogTrace(TRACE_STRCONV,
2246 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2247 ms_wcCharsetName.c_str(), name);
2248 }
2249 }
2250 }
2251
// Close the conversion descriptors which were successfully opened and
// release our copy of the charset name.
wxMBConv_iconv::~wxMBConv_iconv()
{
    if ( m2w != ICONV_T_INVALID )
    {
        iconv_close(m2w);
    }

    if ( w2m != ICONV_T_INVALID )
    {
        iconv_close(w2m);
    }

    // the name was allocated by wxStrdup() in the ctor
    free(const_cast<char *>(m_name));
}
2261
2262 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const2263 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2264 const char *src, size_t srcLen) const
2265 {
2266 if ( srcLen == wxNO_LEN )
2267 {
2268 // find the string length: notice that must be done differently for
2269 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2270 // consecutive NULs
2271 const size_t nulLen = GetMBNulLen();
2272 switch ( nulLen )
2273 {
2274 default:
2275 return wxCONV_FAILED;
2276
2277 case 1:
2278 srcLen = strlen(src); // arguably more optimized than our version
2279 break;
2280
2281 case 2:
2282 case 4:
2283 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2284 // but they also have to start at character boundary and not
2285 // span two adjacent characters
2286 const char *p;
2287 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2288 ;
2289 srcLen = p - src;
2290 break;
2291 }
2292
2293 // when we're determining the length of the string ourselves we count
2294 // the terminating NUL(s) as part of it and always NUL-terminate the
2295 // output
2296 srcLen += nulLen;
2297 }
2298
2299 // we express length in the number of (wide) characters but iconv always
2300 // counts buffer sizes it in bytes
2301 dstLen *= SIZEOF_WCHAR_T;
2302
2303 #if wxUSE_THREADS
2304 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2305 // Unfortunately there are a couple of global wxCSConv objects such as
2306 // wxConvLocal that are used all over wx code, so we have to make sure
2307 // the handle is used by at most one thread at the time. Otherwise
2308 // only a few wx classes would be safe to use from non-main threads
2309 // as MB<->WC conversion would fail "randomly".
2310 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2311 #endif // wxUSE_THREADS
2312
2313 size_t res, cres;
2314 const char *pszPtr = src;
2315
2316 if ( dst )
2317 {
2318 char* bufPtr = (char*)dst;
2319
2320 // have destination buffer, convert there
2321 size_t dstLenOrig = dstLen;
2322 cres = iconv(m2w,
2323 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2324 &bufPtr, &dstLen);
2325
2326 // convert the number of bytes converted as returned by iconv to the
2327 // number of (wide) characters converted that we need
2328 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2329
2330 if (ms_wcNeedsSwap)
2331 {
2332 // convert to native endianness
2333 for ( unsigned i = 0; i < res; i++ )
2334 dst[i] = WC_BSWAP(dst[i]);
2335 }
2336 }
2337 else // no destination buffer
2338 {
2339 // convert using temp buffer to calculate the size of the buffer needed
2340 wchar_t tbuf[256];
2341 res = 0;
2342
2343 do
2344 {
2345 char* bufPtr = (char*)tbuf;
2346 dstLen = 8 * SIZEOF_WCHAR_T;
2347
2348 cres = iconv(m2w,
2349 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2350 &bufPtr, &dstLen );
2351
2352 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2353 }
2354 while ((cres == (size_t)-1) && (errno == E2BIG));
2355 }
2356
2357 if (ICONV_FAILED(cres, srcLen))
2358 {
2359 //VS: it is ok if iconv fails, hence trace only
2360 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsgStr(wxSysErrorCode()));
2361 return wxCONV_FAILED;
2362 }
2363
2364 return res;
2365 }
2366
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const2367 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2368 const wchar_t *src, size_t srcLen) const
2369 {
2370 #if wxUSE_THREADS
2371 // NB: explained in MB2WC
2372 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2373 #endif
2374
2375 if ( srcLen == wxNO_LEN )
2376 srcLen = wxWcslen(src) + 1;
2377
2378 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2379 size_t outbuflen = dstLen;
2380 size_t res, cres;
2381
2382 wchar_t *tmpbuf = 0;
2383
2384 if (ms_wcNeedsSwap)
2385 {
2386 // need to copy to temp buffer to switch endianness
2387 // (doing WC_BSWAP twice on the original buffer won't work, as it
2388 // could be in read-only memory, or be accessed in some other thread)
2389 tmpbuf = (wchar_t *)malloc(inbuflen);
2390 for ( size_t i = 0; i < srcLen; i++ )
2391 tmpbuf[i] = WC_BSWAP(src[i]);
2392
2393 src = tmpbuf;
2394 }
2395
2396 const char* inbuf = reinterpret_cast<const char*>(src);
2397 if ( dst )
2398 {
2399 // have destination buffer, convert there
2400 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2401
2402 res = dstLen - outbuflen;
2403 }
2404 else // no destination buffer
2405 {
2406 // convert using temp buffer to calculate the size of the buffer needed
2407 char tbuf[256];
2408 res = 0;
2409 do
2410 {
2411 dst = tbuf;
2412 outbuflen = WXSIZEOF(tbuf);
2413
2414 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2415
2416 res += WXSIZEOF(tbuf) - outbuflen;
2417 }
2418 while ((cres == (size_t)-1) && (errno == E2BIG));
2419 }
2420
2421 if (ms_wcNeedsSwap)
2422 {
2423 free(tmpbuf);
2424 }
2425
2426 if (ICONV_FAILED(cres, inbuflen))
2427 {
2428 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsgStr(wxSysErrorCode()));
2429 return wxCONV_FAILED;
2430 }
2431
2432 return res;
2433 }
2434
GetMBNulLen() const2435 size_t wxMBConv_iconv::GetMBNulLen() const
2436 {
2437 if ( m_minMBCharWidth == 0 )
2438 {
2439 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2440
2441 #if wxUSE_THREADS
2442 // NB: explained in MB2WC
2443 wxMutexLocker lock(self->m_iconvMutex);
2444 #endif
2445
2446 const wchar_t *wnul = L"";
2447 char buf[8]; // should be enough for NUL in any encoding
2448 size_t inLen = sizeof(wchar_t),
2449 outLen = WXSIZEOF(buf);
2450 const char* inBuff = reinterpret_cast<const char*>(wnul);
2451 char *outBuff = buf;
2452 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2453 {
2454 self->m_minMBCharWidth = (size_t)-1;
2455 }
2456 else // ok
2457 {
2458 self->m_minMBCharWidth = outBuff - buf;
2459 }
2460 }
2461
2462 return m_minMBCharWidth;
2463 }
2464
IsUTF8() const2465 bool wxMBConv_iconv::IsUTF8() const
2466 {
2467 return wxStricmp(m_name, "UTF-8") == 0 ||
2468 wxStricmp(m_name, "UTF8") == 0;
2469 }
2470
2471 #endif // HAVE_ICONV
2472
2473
2474 // ============================================================================
2475 // Win32 conversion classes
2476 // ============================================================================
2477
2478 #ifdef wxHAVE_WIN32_MB2WC
2479
2480 // from utils.cpp
2481 #if wxUSE_FONTMAP
2482 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2483 #endif
2484 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2485
// wxMBConv implementation using the Win32 MultiByteToWideChar() and
// WideCharToMultiByte() functions with the code page chosen at construction.
class wxMBConv_win32 : public wxMBConv
{
public:
    // Default constructor: use the CP_ACP code page.
    wxMBConv_win32()
    {
        m_CodePage = CP_ACP;
        m_minMBCharWidth = 0;
    }

    // Copy the code page and the cached NUL length of another converter.
    wxMBConv_win32(const wxMBConv_win32& conv)
        : wxMBConv()
    {
        m_CodePage = conv.m_CodePage;
        m_minMBCharWidth = conv.m_minMBCharWidth;
    }

#if wxUSE_FONTMAP
    // Use the code page returned by wxCharsetToCodepage() for this charset
    // name; check IsOk() afterwards to see if it could be determined.
    wxMBConv_win32(const char* name)
    {
        m_CodePage = wxCharsetToCodepage(name);
        m_minMBCharWidth = 0;
    }
#endif // wxUSE_FONTMAP

    // Use the code page returned by wxEncodingToCodepage() for this encoding;
    // check IsOk() afterwards to see if it could be determined.
    wxMBConv_win32(wxFontEncoding encoding)
    {
        m_CodePage = wxEncodingToCodepage(encoding);
        m_minMBCharWidth = 0;
    }

    // Convert the NUL-terminated multibyte string psz to wide characters.
    //
    // If buf is NULL, nothing is written and only the required length is
    // computed, otherwise n is the size of buf in wide characters.
    //
    // Returns the length of the converted string not counting the
    // terminating NUL or wxCONV_FAILED on error.
    virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const wxOVERRIDE
    {
        // note that we have to use MB_ERR_INVALID_CHARS flag as without it
        // the behaviour is not compatible with the Unix version (using iconv)
        // and breaks the library itself, e.g. wxTextInputStream::NextChar()
        // wouldn't work if reading an incomplete MB char didn't result in an
        // error
        //
        // Moreover, MB_ERR_INVALID_CHARS is not supported for UTF-8 under XP
        // and for UTF-7 under any Windows version, so we always use our own
        // conversions in this case.
        if ( m_CodePage == CP_UTF8 )
        {
            return wxMBConvUTF8().MB2WC(buf, psz, n);
        }

        if ( m_CodePage == CP_UTF7 )
        {
            return wxMBConvUTF7().MB2WC(buf, psz, n);
        }

        const size_t len = ::MultiByteToWideChar
                             (
                                m_CodePage,     // code page
                                MB_ERR_INVALID_CHARS,  // flags: fail on error
                                psz,            // input string
                                -1,             // its length (NUL-terminated)
                                buf,            // output string
                                buf ? n : 0     // size of output buffer
                             );
        if ( !len )
            return wxCONV_FAILED;

        // note that it returns count of written chars for buf != NULL and size
        // of the needed buffer for buf == NULL so in either case the length of
        // the string (which never includes the terminating NUL) is one less
        return len - 1;
    }

    // Convert the NUL-terminated wide string pwz to the multibyte encoding.
    //
    // If buf is NULL, nothing is returned to the caller and only the required
    // length is computed, otherwise n is the size of buf in bytes.
    //
    // Returns the length of the converted string not counting the
    // terminating NUL or wxCONV_FAILED if the conversion failed or was lossy.
    virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const wxOVERRIDE
    {
        /*
            We need to use WC_NO_BEST_FIT_CHARS to prevent
            WideCharToMultiByte() from replacing characters unrepresentable in
            the target code page with bad quality approximations such as
            turning "1/2" symbol (U+00BD) into "1" for the code pages which
            don't have the fraction symbol.

            Unfortunately this flag can't be used with CJK encodings nor
            UTF-7/8 and so if the code page is one of those, we need to resort
            to a round trip to verify that no replacements have been done.
         */
        BOOL usedDef wxDUMMY_INITIALIZE(false);
        BOOL *pUsedDef;
        int flags;
        if ( m_CodePage < 50000 )
        {
            // it's our lucky day
            flags = WC_NO_BEST_FIT_CHARS;
            pUsedDef = &usedDef;
        }
        else // old system or unsupported encoding
        {
            flags = 0;
            pUsedDef = NULL;
        }

        const size_t len = ::WideCharToMultiByte
                             (
                                m_CodePage,     // code page
                                flags,          // either none or no best fit
                                pwz,            // input string
                                -1,             // it is (wide) NUL-terminated
                                buf,            // output buffer
                                buf ? n : 0,    // and its size
                                NULL,           // default "replacement" char
                                pUsedDef        // [out] was it used?
                             );

        if ( !len )
        {
            // function totally failed
            return wxCONV_FAILED;
        }

        // we did something, check if we really succeeded
        if ( flags )
        {
            // check if the conversion failed, i.e. if any replacements
            // were done
            if ( usedDef )
                return wxCONV_FAILED;
        }
        else // we must resort to double tripping...
        {
            // first we need to ensure that we really have the MB data: this is
            // not the case if we're called with NULL buffer, in which case we
            // need to do the conversion yet again
            wxCharBuffer bufDef;
            if ( !buf )
            {
                bufDef = wxCharBuffer(len);
                buf = bufDef.data();
                if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
                                            buf, len, NULL, NULL) )
                    return wxCONV_FAILED;
            }

            // convert the result back to wide characters and compare it with
            // the input; if no size was given, use the input length for the
            // temporary wide buffer
            if ( !n )
                n = wcslen(pwz);
            wxWCharBuffer wcBuf(n);
            if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
                    wcscmp(wcBuf, pwz) != 0 )
            {
                // we didn't obtain the same thing we started from, hence
                // the conversion was lossy and we consider that it failed
                return wxCONV_FAILED;
            }
        }

        // as in MB2WC(), the value returned by the Win32 function counts the
        // terminating NUL but our return value must not include it
        return len - 1;
    }

    // Return the number of bytes used for NUL in this code page (1, 2 or 4)
    // or (size_t)-1 if it couldn't be determined; the result is cached in
    // m_minMBCharWidth.
    virtual size_t GetMBNulLen() const wxOVERRIDE
    {
        if ( m_minMBCharWidth == 0 )
        {
            // ask for the size of the buffer needed to convert a single wide
            // NUL character
            int len = ::WideCharToMultiByte
                        (
                            m_CodePage,     // code page
                            0,              // no flags
                            L"",            // input string
                            1,              // translate just the NUL
                            NULL,           // output buffer
                            0,              // and its size
                            NULL,           // no replacement char
                            NULL            // [out] don't care if it was used
                        );

            // cast away const to be able to update the cached value
            wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
            switch ( len )
            {
                default:
                    wxLogDebug(wxT("Unexpected NUL length %d"), len);
                    self->m_minMBCharWidth = (size_t)-1;
                    break;

                case 0:
                    // the conversion failed
                    self->m_minMBCharWidth = (size_t)-1;
                    break;

                case 1:
                case 2:
                case 4:
                    self->m_minMBCharWidth = len;
                    break;
            }
        }

        return m_minMBCharWidth;
    }

    // Return a new heap-allocated copy of this converter.
    virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConv_win32(*this); }

    // Return true unless the code page is -1.
    bool IsOk() const { return m_CodePage != -1; }

private:
    // the code page we're working with
    long m_CodePage;

    // cached result of GetMBNulLen(), set to 0 initially meaning
    // "unknown"
    size_t m_minMBCharWidth;
};
2691
2692 #endif // wxHAVE_WIN32_MB2WC
2693
2694
2695 // ============================================================================
2696 // wxEncodingConverter based conversion classes
2697 // ============================================================================
2698
2699 #if wxUSE_FONTMAP
2700
2701 class wxMBConv_wxwin : public wxMBConv
2702 {
2703 private:
Init()2704 void Init()
2705 {
2706 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2707 // The wxMBConv_cf class does a better job.
2708 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2709 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2710 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2711 }
2712
2713 public:
2714 // temporarily just use wxEncodingConverter stuff,
2715 // so that it works while a better implementation is built
wxMBConv_wxwin(const char * name)2716 wxMBConv_wxwin(const char* name)
2717 {
2718 if (name)
2719 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2720 else
2721 m_enc = wxFONTENCODING_SYSTEM;
2722
2723 Init();
2724 }
2725
wxMBConv_wxwin(wxFontEncoding enc)2726 wxMBConv_wxwin(wxFontEncoding enc)
2727 {
2728 m_enc = enc;
2729
2730 Init();
2731 }
2732
MB2WC(wchar_t * buf,const char * psz,size_t WXUNUSED (n)) const2733 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const wxOVERRIDE
2734 {
2735 size_t inbuf = strlen(psz);
2736 if (buf)
2737 {
2738 if (!m2w.Convert(psz, buf))
2739 return wxCONV_FAILED;
2740 }
2741 return inbuf;
2742 }
2743
WC2MB(char * buf,const wchar_t * psz,size_t WXUNUSED (n)) const2744 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const wxOVERRIDE
2745 {
2746 const size_t inbuf = wxWcslen(psz);
2747 if (buf)
2748 {
2749 if (!w2m.Convert(psz, buf))
2750 return wxCONV_FAILED;
2751 }
2752
2753 return inbuf;
2754 }
2755
GetMBNulLen() const2756 virtual size_t GetMBNulLen() const wxOVERRIDE
2757 {
2758 switch ( m_enc )
2759 {
2760 case wxFONTENCODING_UTF16BE:
2761 case wxFONTENCODING_UTF16LE:
2762 return 2;
2763
2764 case wxFONTENCODING_UTF32BE:
2765 case wxFONTENCODING_UTF32LE:
2766 return 4;
2767
2768 default:
2769 return 1;
2770 }
2771 }
2772
Clone() const2773 virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConv_wxwin(m_enc); }
2774
IsOk() const2775 bool IsOk() const { return m_ok; }
2776
2777 public:
2778 wxFontEncoding m_enc;
2779 wxEncodingConverter m2w, w2m;
2780
2781 private:
2782 // were we initialized successfully?
2783 bool m_ok;
2784
2785 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
2786 };
2787
2788 // make the constructors available for unit testing
new_wxMBConv_wxwin(const char * name)2789 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2790 {
2791 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2792 if ( !result->IsOk() )
2793 {
2794 delete result;
2795 return 0;
2796 }
2797
2798 return result;
2799 }
2800
2801 #endif // wxUSE_FONTMAP
2802
2803 // ============================================================================
2804 // wxCSConv implementation
2805 // ============================================================================
2806
Init()2807 void wxCSConv::Init()
2808 {
2809 m_name = NULL;
2810 m_convReal = NULL;
2811 }
2812
SetEncoding(wxFontEncoding encoding)2813 void wxCSConv::SetEncoding(wxFontEncoding encoding)
2814 {
2815 switch ( encoding )
2816 {
2817 case wxFONTENCODING_MAX:
2818 case wxFONTENCODING_SYSTEM:
2819 if ( m_name )
2820 {
2821 // It's ok to not have encoding value if we have a name for it.
2822 m_encoding = wxFONTENCODING_SYSTEM;
2823 }
2824 else // No name neither.
2825 {
2826 // Fall back to the system default encoding in this case (not
2827 // sure how much sense does this make but this is how the old
2828 // code used to behave).
2829 #if wxUSE_INTL
2830 m_encoding = wxLocale::GetSystemEncoding();
2831 if ( m_encoding == wxFONTENCODING_SYSTEM )
2832 #endif // wxUSE_INTL
2833 m_encoding = wxFONTENCODING_ISO8859_1;
2834 }
2835 break;
2836
2837 case wxFONTENCODING_DEFAULT:
2838 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2839 m_encoding = wxFONTENCODING_ISO8859_1;
2840 break;
2841
2842 default:
2843 // Just use the provided encoding.
2844 m_encoding = encoding;
2845 }
2846 }
2847
wxCSConv(const wxString & charset)2848 wxCSConv::wxCSConv(const wxString& charset)
2849 {
2850 Init();
2851
2852 if ( !charset.empty() )
2853 {
2854 SetName(charset.ToAscii());
2855 }
2856
2857 #if wxUSE_FONTMAP
2858 SetEncoding(wxFontMapperBase::GetEncodingFromName(charset));
2859 #else
2860 SetEncoding(wxFONTENCODING_SYSTEM);
2861 #endif
2862
2863 m_convReal = DoCreate();
2864 }
2865
wxCSConv(wxFontEncoding encoding)2866 wxCSConv::wxCSConv(wxFontEncoding encoding)
2867 {
2868 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2869 {
2870 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
2871
2872 encoding = wxFONTENCODING_SYSTEM;
2873 }
2874
2875 Init();
2876
2877 SetEncoding(encoding);
2878
2879 m_convReal = DoCreate();
2880 }
2881
// Release the charset name and the real converter owned by this object, see
// Clear().
wxCSConv::~wxCSConv()
{
    Clear();
}
2886
// Copy constructor: duplicate the name and the encoding of the other object
// and create our own real converter for them instead of sharing the one
// owned by conv.
wxCSConv::wxCSConv(const wxCSConv& conv)
        : wxMBConv()
{
    Init();

    // the name must be set before the encoding because SetEncoding() checks
    // whether m_name is set
    SetName(conv.m_name);
    SetEncoding(conv.m_encoding);

    m_convReal = DoCreate();
}
2897
// Assignment operator: discard the current name and converter and recreate
// them from the name and the encoding of conv.
//
// Self-assignment is a no-op: it must be checked for explicitly because
// Clear() frees m_name, which is also the string we would be copying from.
wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
{
    if ( this != &conv )
    {
        Clear();

        // the name must be set before the encoding because SetEncoding()
        // checks whether m_name is set
        SetName(conv.m_name);
        SetEncoding(conv.m_encoding);

        m_convReal = DoCreate();
    }

    return *this;
}
2909
Clear()2910 void wxCSConv::Clear()
2911 {
2912 free(m_name);
2913 m_name = NULL;
2914
2915 wxDELETE(m_convReal);
2916 }
2917
SetName(const char * charset)2918 void wxCSConv::SetName(const char *charset)
2919 {
2920 if ( charset )
2921 m_name = wxStrdup(charset);
2922 }
2923
2924 #if wxUSE_FONTMAP
2925
// Map from a font encoding to a charset name: wxCSConv::DoCreate() stores
// here the name with which an iconv converter for the encoding could be
// created, or an empty string if this failed for all the names it tried.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;
2930 #endif
2931
DoCreate() const2932 wxMBConv *wxCSConv::DoCreate() const
2933 {
2934 #if wxUSE_FONTMAP
2935 wxLogTrace(TRACE_STRCONV,
2936 wxT("creating conversion for %s"),
2937 (m_name ? m_name
2938 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2939 #endif // wxUSE_FONTMAP
2940
2941 // check for the special case of ASCII or ISO8859-1 charset: as we have
2942 // special knowledge of it anyhow, we don't need to create a special
2943 // conversion object
2944 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2945 {
2946 // don't convert at all
2947 return NULL;
2948 }
2949
2950 // we trust OS to do conversion better than we can so try external
2951 // conversion methods first
2952 //
2953 // the full order is:
2954 // 1. OS conversion (iconv() under Unix or Win32 API)
2955 // 2. hard coded conversions for UTF
2956 // 3. wxEncodingConverter as fall back
2957
2958 // step (1)
2959 #ifdef HAVE_ICONV
2960 #if !wxUSE_FONTMAP
2961 if ( m_name )
2962 #endif // !wxUSE_FONTMAP
2963 {
2964 #if wxUSE_FONTMAP
2965 wxFontEncoding encoding(m_encoding);
2966 #endif
2967
2968 if ( m_name )
2969 {
2970 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2971 if ( conv->IsOk() )
2972 return conv;
2973
2974 delete conv;
2975
2976 #if wxUSE_FONTMAP
2977 encoding =
2978 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2979 #endif // wxUSE_FONTMAP
2980 }
2981 #if wxUSE_FONTMAP
2982 {
2983 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2984 if ( it != gs_nameCache.end() )
2985 {
2986 if ( it->second.empty() )
2987 return NULL;
2988
2989 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2990 if ( conv->IsOk() )
2991 return conv;
2992
2993 delete conv;
2994 }
2995
2996 const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
2997 // CS : in case this does not return valid names (eg for MacRoman)
2998 // encoding got a 'failure' entry in the cache all the same,
2999 // although it just has to be created using a different method, so
3000 // only store failed iconv creation attempts (or perhaps we
3001 // shoulnd't do this at all ?)
3002 if ( names[0] != NULL )
3003 {
3004 for ( ; *names; ++names )
3005 {
3006 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3007 // will need changes that will obsolete this
3008 wxString name(*names);
3009 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3010 if ( conv->IsOk() )
3011 {
3012 gs_nameCache[encoding] = *names;
3013 return conv;
3014 }
3015
3016 delete conv;
3017 }
3018
3019 gs_nameCache[encoding] = wxT(""); // cache the failure
3020 }
3021 }
3022 #endif // wxUSE_FONTMAP
3023 }
3024 #endif // HAVE_ICONV
3025
3026 #ifdef wxHAVE_WIN32_MB2WC
3027 {
3028 #if wxUSE_FONTMAP
3029 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3030 : new wxMBConv_win32(m_encoding);
3031 #else
3032 wxMBConv_win32* conv = new wxMBConv_win32(m_encoding);
3033 #endif
3034 if ( conv->IsOk() )
3035 return conv;
3036
3037 delete conv;
3038 }
3039 #endif // wxHAVE_WIN32_MB2WC
3040
3041 #ifdef __DARWIN__
3042 {
3043 // leave UTF16 and UTF32 to the built-ins of wx
3044 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3045 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3046 {
3047 #if wxUSE_FONTMAP
3048 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3049 : new wxMBConv_cf(m_encoding);
3050 #else
3051 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3052 #endif
3053
3054 if ( conv->IsOk() )
3055 return conv;
3056
3057 delete conv;
3058 }
3059 }
3060 #endif // __DARWIN__
3061
3062 // step (2)
3063 wxFontEncoding enc = m_encoding;
3064 #if wxUSE_FONTMAP
3065 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3066 {
3067 // use "false" to suppress interactive dialogs -- we can be called from
3068 // anywhere and popping up a dialog from here is the last thing we want to
3069 // do
3070 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3071 }
3072 #endif // wxUSE_FONTMAP
3073
3074 switch ( enc )
3075 {
3076 case wxFONTENCODING_UTF7:
3077 return new wxMBConvUTF7;
3078
3079 case wxFONTENCODING_UTF8:
3080 return new wxMBConvUTF8;
3081
3082 case wxFONTENCODING_UTF16BE:
3083 return new wxMBConvUTF16BE;
3084
3085 case wxFONTENCODING_UTF16LE:
3086 return new wxMBConvUTF16LE;
3087
3088 case wxFONTENCODING_UTF32BE:
3089 return new wxMBConvUTF32BE;
3090
3091 case wxFONTENCODING_UTF32LE:
3092 return new wxMBConvUTF32LE;
3093
3094 default:
3095 // nothing to do but put here to suppress gcc warnings
3096 break;
3097 }
3098
3099 // step (3)
3100 #if wxUSE_FONTMAP
3101 {
3102 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3103 : new wxMBConv_wxwin(m_encoding);
3104 if ( conv->IsOk() )
3105 return conv;
3106
3107 delete conv;
3108 }
3109
3110 wxLogTrace(TRACE_STRCONV,
3111 wxT("encoding \"%s\" is not supported by this system"),
3112 (m_name ? wxString(m_name)
3113 : wxFontMapperBase::GetEncodingName(m_encoding)));
3114 #endif // wxUSE_FONTMAP
3115
3116 return NULL;
3117 }
3118
IsOk() const3119 bool wxCSConv::IsOk() const
3120 {
3121 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3122 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3123 return true; // always ok as we do it ourselves
3124
3125 // m_convReal->IsOk() is called at its own creation, so we know it must
3126 // be ok if m_convReal is non-NULL
3127 return m_convReal != NULL;
3128 }
3129
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const3130 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3131 const char *src, size_t srcLen) const
3132 {
3133 if (m_convReal)
3134 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3135
3136 // latin-1 (direct)
3137 if ( srcLen == wxNO_LEN )
3138 srcLen = strlen(src) + 1; // take trailing NUL too
3139
3140 if ( dst )
3141 {
3142 if ( dstLen < srcLen )
3143 return wxCONV_FAILED;
3144
3145 for ( size_t n = 0; n < srcLen; n++ )
3146 dst[n] = (unsigned char)(src[n]);
3147 }
3148
3149 return srcLen;
3150 }
3151
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const3152 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3153 const wchar_t *src, size_t srcLen) const
3154 {
3155 if (m_convReal)
3156 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3157
3158 // latin-1 (direct)
3159 if ( srcLen == wxNO_LEN )
3160 srcLen = wxWcslen(src) + 1;
3161
3162 if ( dst )
3163 {
3164 if ( dstLen < srcLen )
3165 return wxCONV_FAILED;
3166
3167 for ( size_t n = 0; n < srcLen; n++ )
3168 {
3169 if ( src[n] > 0xFF )
3170 return wxCONV_FAILED;
3171
3172 dst[n] = (char)src[n];
3173 }
3174
3175 }
3176 else // still need to check the input validity
3177 {
3178 for ( size_t n = 0; n < srcLen; n++ )
3179 {
3180 if ( src[n] > 0xFF )
3181 return wxCONV_FAILED;
3182 }
3183 }
3184
3185 return srcLen;
3186 }
3187
GetMBNulLen() const3188 size_t wxCSConv::GetMBNulLen() const
3189 {
3190 if ( m_convReal )
3191 return m_convReal->GetMBNulLen();
3192
3193 // otherwise, we are ISO-8859-1
3194 return 1;
3195 }
3196
IsUTF8() const3197 bool wxCSConv::IsUTF8() const
3198 {
3199 if ( m_convReal )
3200 return m_convReal->IsUTF8();
3201
3202 // otherwise, we are ISO-8859-1
3203 return false;
3204 }
3205
3206
3207 // ============================================================================
3208 // wxWhateverWorksConv
3209 // ============================================================================
3210
3211 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const3212 wxWhateverWorksConv::ToWChar(wchar_t *dst, size_t dstLen,
3213 const char *src, size_t srcLen) const
3214 {
3215 size_t rc = wxConvUTF8.ToWChar(dst, dstLen, src, srcLen);
3216 if ( rc != wxCONV_FAILED )
3217 return rc;
3218
3219 rc = wxConvLibc.ToWChar(dst, dstLen, src, srcLen);
3220 if ( rc != wxCONV_FAILED )
3221 return rc;
3222
3223 rc = wxConvISO8859_1.ToWChar(dst, dstLen, src, srcLen);
3224
3225 return rc;
3226 }
3227
3228 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const3229 wxWhateverWorksConv::FromWChar(char *dst, size_t dstLen,
3230 const wchar_t *src, size_t srcLen) const
3231 {
3232 size_t rc = wxConvLibc.FromWChar(dst, dstLen, src, srcLen);
3233 if ( rc != wxCONV_FAILED )
3234 return rc;
3235
3236 rc = wxConvUTF8.FromWChar(dst, dstLen, src, srcLen);
3237
3238 return rc;
3239 }
3240
3241 // ----------------------------------------------------------------------------
3242 // globals
3243 // ----------------------------------------------------------------------------
3244
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3251
3252 #undef wxConvLibc
3253 #undef wxConvUTF8
3254 #undef wxConvUTF7
3255 #undef wxConvWhateverWorks
3256 #undef wxConvLocal
3257 #undef wxConvISO8859_1
3258
// Define a global converter called "name":
//  - name##Ptr is an exported pointer of type klass*, initially NULL;
//  - wxGet_##name##Ptr() is an exported function returning the address of a
//    function-local static object of type impl_klass constructed with
//    ctor_args;
//  - gs_##name##instance is a file-local variable whose initializer calls
//    this function.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as WX_DEFINE_GLOBAL_CONV2() when the declared and the implementation
// classes are the same.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)

#ifdef __INTELC__
    // disable warning "variable 'xxx' was declared but never referenced"
    #pragma warning(disable: 177)
#endif // Intel C++

// wxConvLibc is implemented by wxMBConv_win32 under Windows and by
// wxMBConvLibc elsewhere (the wxMBConv_cf branch is disabled by "#elif 0")
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#elif 0 // defined(__WXOSX__)
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc,  (wxFONTENCODING_UTF8));
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif

// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
//     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
//     provokes an error message about "not enough macro parameters"; and we
//     can't use "()" here as the name##Obj declaration would be parsed as a
//     function declaration then, so use a semicolon and live with an extra
//     empty statement (and hope that no compiler warns about this)
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
WX_DEFINE_GLOBAL_CONV(wxWhateverWorksConv, wxConvWhateverWorks, ;);

WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// wxConvCurrent initially points to the wxConvLibc object and wxConvUI to
// the wxConvLocal one
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();

#ifdef __DARWIN__
// It is important to use this conversion object under Darwin as it ensures
// that Unicode strings are (re)composed correctly even though xnu kernel uses
// decomposed form internally (at least for the file names).
static wxMBConvD_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif

// the converter used for the file names: the object defined just above under
// Darwin and wxConvWhateverWorks elsewhere
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
                                    &wxConvMacUTF8DObj;
#else // !__DARWIN__
                                    wxGet_wxConvWhateverWorksPtr();
#endif // __DARWIN__/!__DARWIN__
3316