1 /////////////////////////////////////////////////////////////////////////////
2 // Name:        convauto.h
3 // Purpose:     interface of wxConvAuto
4 // Author:      wxWidgets team
5 // Licence:     wxWindows licence
6 /////////////////////////////////////////////////////////////////////////////
7 
8 /**
9     Constants representing various BOM types.
10 
11     BOM is an abbreviation for "Byte Order Mark", a special Unicode character
12     which may be inserted into the beginning of a text stream to indicate its
13     encoding.
14 
15     @since 2.9.3
16  */
enum wxBOM
{
    /**
        The BOM type could not be determined.

        This value is returned when the presence of a BOM couldn't be
        established, which normally means that not enough input bytes have
        been analysed yet.
     */
    wxBOM_Unknown = -1,

    /**
        The stream has no BOM.

        The stream doesn't contain a BOM character at all.
     */
    wxBOM_None = 0,

    /**
        BOM of big endian UTF-32.

        The stream is encoded in the big endian variant of UTF-32.
     */
    wxBOM_UTF32BE = 1,

    /**
        BOM of little endian UTF-32.

        The stream is encoded in the little endian variant of UTF-32.
     */
    wxBOM_UTF32LE = 2,

    /**
        BOM of big endian UTF-16.

        The stream is encoded in the big endian variant of UTF-16.
     */
    wxBOM_UTF16BE = 3,

    /**
        BOM of little endian UTF-16.

        The stream is encoded in the little endian variant of UTF-16.
     */
    wxBOM_UTF16LE = 4,

    /**
        BOM of UTF-8.

        The stream is encoded in UTF-8.

        Contrary to a popular belief, it is perfectly possible and, in fact,
        common under Microsoft Windows systems, for a UTF-8 stream to start
        with a BOM. As UTF-8 is byte-oriented, the BOM doesn't indicate the
        endianness in this case, but it remains useful as an unambiguous
        indicator of UTF-8 being used.
     */
    wxBOM_UTF8 = 5
};
75 
76 /**
77     @class wxConvAuto
78 
79     This class implements a Unicode to/from multibyte converter capable of
80     automatically recognizing the encoding of the multibyte text on input. The
81     logic used is very simple: the class uses the BOM (byte order mark) if it's
82     present and tries to interpret the input as UTF-8 otherwise. If this fails,
83     the input is interpreted as being in the default multibyte encoding which
84     can be specified in the constructor of a wxConvAuto instance and, in turn,
85     defaults to the value of GetFallbackEncoding() if not explicitly given.
86 
87     For the conversion from Unicode to multibyte, the same encoding as was
88     previously used for multibyte to Unicode conversion is reused. If there had
89     been no previous multibyte to Unicode conversion, UTF-8 is used by default.
90     Notice that once the multibyte encoding is automatically detected, it
91     doesn't change any more, i.e. it is entirely determined by the first use of
92     wxConvAuto object in the multibyte-to-Unicode direction. However, creating a
93     copy of wxConvAuto object, either via the usual copy constructor or
94     assignment operator, or using wxMBConv::Clone(), resets the automatically
95     detected encoding so that the new copy will try to detect the encoding of
96     the input on first use.
97 
98     This class is used by default in wxWidgets classes and functions reading
99     text from files such as wxFile, wxFFile, wxTextFile, wxFileConfig and
100     various stream classes so the encoding set with its SetFallbackEncoding()
101     method will affect how these classes treat input files. In particular, use
102     this method to change the fall-back multibyte encoding used to interpret
103     the contents of the files whose contents aren't valid UTF-8 or to disallow
104     it completely.
105 
106     @library{wxbase}
107     @category{data}
108 
109     @see @ref overview_mbconv
110 */
111 class wxConvAuto : public wxMBConv
112 {
113 public:
114     /**
115         Constructs a new wxConvAuto instance. The object will try to detect the
116         input of the multibyte text given to its wxMBConv::ToWChar() method
117         automatically but if the automatic detection of Unicode encodings
118         fails, the fall-back encoding @a enc will be used to interpret it as
119         multibyte text.
120 
121         The default value of @a enc, @c wxFONTENCODING_DEFAULT, means that the
122         global default value (which can be set using SetFallbackEncoding())
123         should be used. As with that method, passing @c wxFONTENCODING_MAX
124         inhibits using this encoding completely so the input multibyte text
125         will always be interpreted as UTF-8 in the absence of BOM and the
126         conversion will fail if the input doesn't form valid UTF-8 sequence.
127 
128         Another special value is @c wxFONTENCODING_SYSTEM which means to use
129         the encoding currently used on the user system, i.e. the encoding
130         returned by wxLocale::GetSystemEncoding(). Any other encoding will be
131         used as is, e.g. passing @c wxFONTENCODING_ISO8859_1 ensures that
132         non-UTF-8 input will be treated as latin1.
133     */
134     wxConvAuto(wxFontEncoding enc = wxFONTENCODING_DEFAULT);
135 
136 
137     /**
138         Return the detected BOM type.
139 
140         The BOM type is detected after sufficiently many initial bytes have
141         passed through this conversion object so it will always return
142         wxBOM_Unknown immediately after the object creation but may return a
143         different value later.
144 
145         @since 2.9.3
146     */
147     wxBOM GetBOM() const;
148 
149     /**
150         Return a pointer to the characters that makes up this BOM.
151 
152         The returned character count is 2, 3 or 4, or undefined if the return
153         value is NULL.
154 
155         @param bom
156             A valid BOM type, i.e. not wxBOM_Unknown or wxBOM_None.
157         @param count
158             A non-@NULL pointer receiving the number of characters in this BOM.
159         @return
160             Pointer to characters composing the BOM or @NULL if BOM is unknown
161             or invalid. Notice that the returned string is not NUL-terminated
162             and may contain embedded NULs so @a count must be used to handle it
163             correctly.
164 
165         @since 2.9.3
166     */
167     const char* GetBOMChars(wxBOM bom, size_t* count);
168 
169     /**
170         Disable the use of the fall back encoding: if the input doesn't have a
171         BOM and is not valid UTF-8, the conversion will fail.
172     */
173     static void DisableFallbackEncoding();
174 
175     /**
176         Returns the encoding used by default by wxConvAuto if no other encoding
177         is explicitly specified in constructor. By default, returns
178         @c wxFONTENCODING_ISO8859_1 but can be changed using
179         SetFallbackEncoding().
180     */
181     static wxFontEncoding GetFallbackEncoding();
182 
183     /**
184         Changes the encoding used by default by wxConvAuto if no other encoding
185         is explicitly specified in constructor. The default value, which can be
186         retrieved using GetFallbackEncoding(), is @c wxFONTENCODING_ISO8859_1.
187 
188         Special values of @c wxFONTENCODING_SYSTEM or @c wxFONTENCODING_MAX can
189         be used for the @a enc parameter to use the encoding of the current
190         user locale as fall back or not use any encoding for fall back at all,
191         respectively (just as with the similar constructor parameter). However,
192         @c wxFONTENCODING_DEFAULT can't be used here.
193     */
194     static void SetFallbackEncoding(wxFontEncoding enc);
195 
196     /**
197         Return the BOM type of this buffer.
198 
199         This is a helper function which is normally only used internally by
200         wxConvAuto but provided for convenience of the code that wants to
201         detect the encoding of a stream by checking it for BOM presence on its
202         own.
203 
204         @since 2.9.3
205     */
206     static wxBOM DetectBOM(const char *src, size_t srcLen);
207 };
208