/////////////////////////////////////////////////////////////////////////////
// Name:        convauto.h
// Purpose:     interface of wxConvAuto
// Author:      wxWidgets team
// Licence:     wxWindows licence
/////////////////////////////////////////////////////////////////////////////

/**
    Constants representing various BOM types.

    BOM is an abbreviation for "Byte Order Mark", a special Unicode character
    which may be inserted into the beginning of a text stream to indicate its
    encoding.

    @since 2.9.3
 */
enum wxBOM
{
    /**
        Unknown BOM.

        This is returned if the BOM presence couldn't be determined and
        normally happens because not enough bytes of input have been analysed.
     */
    wxBOM_Unknown = -1,

    /**
        No BOM.

        The stream doesn't contain a BOM character at all.
     */
    wxBOM_None,

    /**
        UTF-32 big endian BOM.

        The stream is encoded in the big endian variant of UTF-32.
     */
    wxBOM_UTF32BE,

    /**
        UTF-32 little endian BOM.

        The stream is encoded in the little endian variant of UTF-32.
     */
    wxBOM_UTF32LE,

    /**
        UTF-16 big endian BOM.

        The stream is encoded in the big endian variant of UTF-16.
     */
    wxBOM_UTF16BE,

    /**
        UTF-16 little endian BOM.

        The stream is encoded in the little endian variant of UTF-16.
     */
    wxBOM_UTF16LE,

    /**
        UTF-8 BOM.

        The stream is encoded in UTF-8.

        Notice that contrary to a popular belief, it's perfectly possible and,
        in fact, common under Microsoft Windows systems, to have a BOM in a
        UTF-8 stream: while it's not used to indicate the endianness of the
        UTF-8 stream (as it's byte-oriented), the BOM can still be useful just
        as an unambiguous indicator of UTF-8 being used.
     */
    wxBOM_UTF8
};

/**
    @class wxConvAuto

    This class implements a Unicode to/from multibyte converter capable of
    automatically recognizing the encoding of the multibyte text on input. The
    logic used is very simple: the class uses the BOM (byte order mark) if it's
    present and tries to interpret the input as UTF-8 otherwise. If this fails,
    the input is interpreted as being in the default multibyte encoding which
    can be specified in the constructor of a wxConvAuto instance and, in turn,
    defaults to the value of GetFallbackEncoding() if not explicitly given.

    For the conversion from Unicode to multibyte, the same encoding as was
    previously used for multibyte to Unicode conversion is reused. If there had
    been no previous multibyte to Unicode conversion, UTF-8 is used by default.
    Notice that once the multibyte encoding is automatically detected, it
    doesn't change any more, i.e. it is entirely determined by the first use of
    a wxConvAuto object in the multibyte-to-Unicode direction. However,
    creating a copy of a wxConvAuto object, either via the usual copy
    constructor or assignment operator, or using wxMBConv::Clone(), resets the
    automatically detected encoding so that the new copy will try to detect the
    encoding of the input on first use.

    This class is used by default in wxWidgets classes and functions reading
    text from files such as wxFile, wxFFile, wxTextFile, wxFileConfig and
    various stream classes so the encoding set with its SetFallbackEncoding()
    method will affect how these classes treat input files. In particular, use
    this method to change the fall-back multibyte encoding used to interpret
    the contents of the files whose contents aren't valid UTF-8 or to disallow
    it completely.

    @library{wxbase}
    @category{data}

    @see @ref overview_mbconv
 */
class wxConvAuto : public wxMBConv
{
public:
    /**
        Constructs a new wxConvAuto instance. The object will try to detect the
        input of the multibyte text given to its wxMBConv::ToWChar() method
        automatically but if the automatic detection of Unicode encodings
        fails, the fall-back encoding @a enc will be used to interpret it as
        multibyte text.

        The default value of @a enc, @c wxFONTENCODING_DEFAULT, means that the
        global default value (which can be set using SetFallbackEncoding())
        should be used. As with that method, passing @c wxFONTENCODING_MAX
        inhibits using this encoding completely so the input multibyte text
        will always be interpreted as UTF-8 in the absence of a BOM and the
        conversion will fail if the input doesn't form a valid UTF-8 sequence.

        Another special value is @c wxFONTENCODING_SYSTEM which means to use
        the encoding currently used on the user system, i.e. the encoding
        returned by wxLocale::GetSystemEncoding(). Any other encoding will be
        used as is, e.g. passing @c wxFONTENCODING_ISO8859_1 ensures that
        non-UTF-8 input will be treated as Latin-1.
     */
    wxConvAuto(wxFontEncoding enc = wxFONTENCODING_DEFAULT);


    /**
        Return the detected BOM type.

        The BOM type is detected after sufficiently many initial bytes have
        passed through this conversion object so it will always return
        wxBOM_Unknown immediately after the object creation but may return a
        different value later.

        @since 2.9.3
     */
    wxBOM GetBOM() const;

    /**
        Return a pointer to the characters that make up this BOM.

        The returned character count is 2, 3 or 4, or undefined if the return
        value is @NULL.

        @param bom
            A valid BOM type, i.e. not wxBOM_Unknown or wxBOM_None.
        @param count
            A non-@NULL pointer receiving the number of characters in this BOM.
        @return
            Pointer to characters composing the BOM or @NULL if BOM is unknown
            or invalid. Notice that the returned string is not NUL-terminated
            and may contain embedded NULs so @a count must be used to handle it
            correctly.

        @since 2.9.3
     */
    // NOTE(review): this takes the BOM type as a parameter and, unlike the
    // sibling DetectBOM(), is declared non-static here -- confirm against the
    // implementation header whether it should be documented as static.
    const char* GetBOMChars(wxBOM bom, size_t* count);

    /**
        Disable the use of the fall-back encoding: if the input doesn't have a
        BOM and is not valid UTF-8, the conversion will fail.
     */
    static void DisableFallbackEncoding();

    /**
        Returns the encoding used by default by wxConvAuto if no other encoding
        is explicitly specified in the constructor. By default, returns
        @c wxFONTENCODING_ISO8859_1 but can be changed using
        SetFallbackEncoding().
     */
    static wxFontEncoding GetFallbackEncoding();

    /**
        Changes the encoding used by default by wxConvAuto if no other encoding
        is explicitly specified in the constructor. The default value, which
        can be retrieved using GetFallbackEncoding(), is
        @c wxFONTENCODING_ISO8859_1.

        Special values of @c wxFONTENCODING_SYSTEM or @c wxFONTENCODING_MAX can
        be used for the @a enc parameter to use the encoding of the current
        user locale as fall-back or not use any encoding for fall-back at all,
        respectively (just as with the similar constructor parameter). However,
        @c wxFONTENCODING_DEFAULT can't be used here.
     */
    static void SetFallbackEncoding(wxFontEncoding enc);

    /**
        Return the BOM type of this buffer.

        This is a helper function which is normally only used internally by
        wxConvAuto but provided for convenience of the code that wants to
        detect the encoding of a stream by checking it for BOM presence on its
        own.

        @since 2.9.3
     */
    static wxBOM DetectBOM(const char *src, size_t srcLen);
};