1 ///////////////////////////////////////////////////////////////////////////////
2 // Name:        src/common/convauto.cpp
3 // Purpose:     implementation of wxConvAuto
4 // Author:      Vadim Zeitlin
5 // Created:     2006-04-04
6 // Copyright:   (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
7 // Licence:     wxWindows licence
8 ///////////////////////////////////////////////////////////////////////////////
9 
10 // ============================================================================
11 // declarations
12 // ============================================================================
13 
14 // ----------------------------------------------------------------------------
15 // headers
16 // ----------------------------------------------------------------------------
17 
18 // for compilers that support precompilation, includes "wx.h".
19 #include "wx/wxprec.h"
20 
21 #ifdef __BORLANDC__
22     #pragma hdrstop
23 #endif
24 
25 #include "wx/convauto.h"
26 
27 // we use latin1 by default as it seems the least bad choice: the files we need
28 // to detect input of don't always come from the user system (they are often
29 // received from other machines) and so using wxFONTENCODING_SYSTEM doesn't
30 // seem to be a good idea and there is no other reasonable alternative
31 wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
32 
namespace
{

// Byte order marks of the Unicode encodings recognized by wxConvAuto.
//
// These are raw byte sequences and not C strings: the arrays are deliberately
// not NUL-terminated, so that the size of each array is exactly the length of
// the corresponding BOM.

// UTF-8: EF BB BF
const char BOM_UTF8[]    = { '\xEF', '\xBB', '\xBF' };

// UTF-16, little and big endian: FF FE and FE FF
const char BOM_UTF16LE[] = { '\xFF', '\xFE' };
const char BOM_UTF16BE[] = { '\xFE', '\xFF' };

// UTF-32, little and big endian: FF FE 00 00 and 00 00 FE FF
const char BOM_UTF32LE[] = { '\xFF', '\xFE', '\x00', '\x00' };
const char BOM_UTF32BE[] = { '\x00', '\x00', '\xFE', '\xFF' };

} // anonymous namespace
43 
44 // ============================================================================
45 // implementation
46 // ============================================================================
47 
48 /* static */
SetFallbackEncoding(wxFontEncoding enc)49 void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
50 {
51     wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
52                   wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );
53 
54     ms_defaultMBEncoding = enc;
55 }
56 
57 /* static */
GetBOMChars(wxBOM bom,size_t * count)58 const char* wxConvAuto::GetBOMChars(wxBOM bom, size_t* count)
59 {
60     wxCHECK_MSG( count , NULL, wxS("count pointer must be provided") );
61 
62     switch ( bom )
63     {
64         case wxBOM_UTF32BE: *count = WXSIZEOF(BOM_UTF32BE); return BOM_UTF32BE;
65         case wxBOM_UTF32LE: *count = WXSIZEOF(BOM_UTF32LE); return BOM_UTF32LE;
66         case wxBOM_UTF16BE: *count = WXSIZEOF(BOM_UTF16BE); return BOM_UTF16BE;
67         case wxBOM_UTF16LE: *count = WXSIZEOF(BOM_UTF16LE); return BOM_UTF16LE;
68         case wxBOM_UTF8   : *count = WXSIZEOF(BOM_UTF8   ); return BOM_UTF8;
69         case wxBOM_Unknown:
70         case wxBOM_None:
71             wxFAIL_MSG( wxS("Invalid BOM type") );
72             return NULL;
73     }
74 
75     wxFAIL_MSG( wxS("Unknown BOM type") );
76     return NULL;
77 }
78 
79 /* static */
DetectBOM(const char * src,size_t srcLen)80 wxBOM wxConvAuto::DetectBOM(const char *src, size_t srcLen)
81 {
82     // examine the buffer for BOM presence
83     //
84     // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
85     //
86     //  Bytes           Encoding Form
87     //
88     //  00 00 FE FF     UTF-32, big-endian
89     //  FF FE 00 00     UTF-32, little-endian
90     //  FE FF           UTF-16, big-endian
91     //  FF FE           UTF-16, little-endian
92     //  EF BB BF        UTF-8
93     //
94     // as some BOMs are prefixes of other ones we may need to read more bytes
95     // to disambiguate them
96 
97     switch ( srcLen )
98     {
99         case 0:
100             return wxBOM_Unknown;
101 
102         case 1:
103             if ( src[0] == '\x00' || src[0] == '\xFF' ||
104                  src[0] == '\xFE' || src[0] == '\xEF')
105             {
106                 // this could be a BOM but we don't know yet
107                 return wxBOM_Unknown;
108             }
109             break;
110 
111         case 2:
112         case 3:
113             if ( src[0] == '\xEF' && src[1] == '\xBB' )
114             {
115                 if ( srcLen == 3 )
116                     return src[2] == '\xBF' ? wxBOM_UTF8 : wxBOM_None;
117 
118                 return wxBOM_Unknown;
119             }
120 
121             if ( src[0] == '\xFE' && src[1] == '\xFF' )
122                 return wxBOM_UTF16BE;
123 
124             if ( src[0] == '\xFF' && src[1] == '\xFE' )
125             {
126                 // if the next byte is 0, it could be an UTF-32LE BOM but if it
127                 // isn't we can be sure it's UTF-16LE
128                 if ( srcLen == 3 && src[2] != '\x00' )
129                     return wxBOM_UTF16LE;
130 
131                 return wxBOM_Unknown;
132             }
133 
134             if ( src[0] == '\x00' && src[1] == '\x00' )
135             {
136                 // this could only be UTF-32BE, check that the data we have so
137                 // far allows for it
138                 if ( srcLen == 3 && src[2] != '\xFE' )
139                     return wxBOM_None;
140 
141                 return wxBOM_Unknown;
142             }
143             break;
144 
145         default:
146             // we have at least 4 characters so we may finally decide whether
147             // we have a BOM or not
148             if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
149                 return wxBOM_UTF8;
150 
151             if ( src[0] == '\x00' && src[1] == '\x00' &&
152                  src[2] == '\xFE' && src[3] == '\xFF' )
153                 return wxBOM_UTF32BE;
154 
155             if ( src[0] == '\xFF' && src[1] == '\xFE' &&
156                  src[2] == '\x00' && src[3] == '\x00' )
157                 return wxBOM_UTF32LE;
158 
159             if ( src[0] == '\xFE' && src[1] == '\xFF' )
160                 return wxBOM_UTF16BE;
161 
162             if ( src[0] == '\xFF' && src[1] == '\xFE' )
163                 return wxBOM_UTF16LE;
164     }
165 
166     return wxBOM_None;
167 }
168 
InitFromBOM(wxBOM bomType)169 void wxConvAuto::InitFromBOM(wxBOM bomType)
170 {
171     m_consumedBOM = false;
172 
173     switch ( bomType )
174     {
175         case wxBOM_Unknown:
176             wxFAIL_MSG( "shouldn't be called for this BOM type" );
177             break;
178 
179         case wxBOM_None:
180             // use the default
181             break;
182 
183         case wxBOM_UTF32BE:
184             m_conv = new wxMBConvUTF32BE;
185             m_ownsConv = true;
186             break;
187 
188         case wxBOM_UTF32LE:
189             m_conv = new wxMBConvUTF32LE;
190             m_ownsConv = true;
191             break;
192 
193         case wxBOM_UTF16BE:
194             m_conv = new wxMBConvUTF16BE;
195             m_ownsConv = true;
196             break;
197 
198         case wxBOM_UTF16LE:
199             m_conv = new wxMBConvUTF16LE;
200             m_ownsConv = true;
201             break;
202 
203         case wxBOM_UTF8:
204             InitWithUTF8();
205             break;
206 
207         default:
208             wxFAIL_MSG( "unknown BOM type" );
209     }
210 
211     if ( !m_conv )
212     {
213         // we end up here if there is no BOM or we didn't recognize it somehow
214         // (this shouldn't happen but still don't crash if it does), so use the
215         // default encoding
216         InitWithUTF8();
217         m_consumedBOM = true; // as there is nothing to consume
218     }
219 }
220 
SkipBOM(const char ** src,size_t * len) const221 void wxConvAuto::SkipBOM(const char **src, size_t *len) const
222 {
223     int ofs;
224     switch ( m_bomType )
225     {
226         case wxBOM_Unknown:
227             wxFAIL_MSG( "shouldn't be called for this BOM type" );
228             return;
229 
230         case wxBOM_None:
231             ofs = 0;
232             break;
233 
234         case wxBOM_UTF32BE:
235         case wxBOM_UTF32LE:
236             ofs = 4;
237             break;
238 
239         case wxBOM_UTF16BE:
240         case wxBOM_UTF16LE:
241             ofs = 2;
242             break;
243 
244         case wxBOM_UTF8:
245             ofs = 3;
246             break;
247 
248         default:
249             wxFAIL_MSG( "unknown BOM type" );
250             return;
251     }
252 
253     *src += ofs;
254     if ( *len != (size_t)-1 )
255         *len -= ofs;
256 }
257 
InitFromInput(const char * src,size_t len)258 bool wxConvAuto::InitFromInput(const char *src, size_t len)
259 {
260     m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len);
261     if ( m_bomType == wxBOM_Unknown )
262         return false;
263 
264     InitFromBOM(m_bomType);
265 
266     return true;
267 }
268 
269 size_t
ToWChar(wchar_t * dst,size_t dstLen,const char * src,size_t srcLen) const270 wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
271                     const char *src, size_t srcLen) const
272 {
273     // we check BOM and create the appropriate conversion the first time we're
274     // called but we also need to ensure that the BOM is skipped not only
275     // during this initial call but also during the first call with non-NULL
276     // dst as typically we're first called with NULL dst to calculate the
277     // needed buffer size
278     wxConvAuto *self = const_cast<wxConvAuto *>(this);
279 
280 
281     if ( !m_conv )
282     {
283         if ( !self->InitFromInput(src, srcLen) )
284         {
285             // there is not enough data to determine whether we have a BOM or
286             // not, so fail for now -- the caller is supposed to call us again
287             // with more data
288             return wxCONV_FAILED;
289         }
290     }
291 
292     if ( !m_consumedBOM )
293     {
294         SkipBOM(&src, &srcLen);
295         if ( srcLen == 0 )
296         {
297             // there is nothing left except the BOM so we'd return 0 below but
298             // this is unexpected: decoding a non-empty string must either fail
299             // or return something non-empty, in particular this would break
300             // the code in wxTextInputStream::NextChar()
301             //
302             // so still return an error as we need some more data to be able to
303             // decode it
304             return wxCONV_FAILED;
305         }
306     }
307 
308     // try to convert using the auto-detected encoding
309     size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
310     if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None )
311     {
312         // if the conversion failed but we didn't really detect anything and
313         // simply tried UTF-8 by default, retry it using the fall-back
314         if ( m_encDefault != wxFONTENCODING_MAX )
315         {
316             if ( m_ownsConv )
317                 delete m_conv;
318 
319             self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
320                                             ? GetFallbackEncoding()
321                                             : m_encDefault);
322             self->m_ownsConv = true;
323 
324             rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
325         }
326     }
327 
328     // don't skip the BOM again the next time if we really consumed it
329     if ( rc != wxCONV_FAILED && dst && !m_consumedBOM )
330         self->m_consumedBOM = true;
331 
332     return rc;
333 }
334 
335 size_t
FromWChar(char * dst,size_t dstLen,const wchar_t * src,size_t srcLen) const336 wxConvAuto::FromWChar(char *dst, size_t dstLen,
337                       const wchar_t *src, size_t srcLen) const
338 {
339     if ( !m_conv )
340     {
341         // default to UTF-8 for the multibyte output
342         const_cast<wxConvAuto *>(this)->InitWithUTF8();
343     }
344 
345     return m_conv->FromWChar(dst, dstLen, src, srcLen);
346 }
347