/*
 * Usage: utf16le inputfile outputfile [nobom]
 *
 * This is a tool and is compiled using the host compiler,
 * i.e. on Linux gcc and not mingw-gcc (cross-compiler).
 * It's a converter from utf-8, utf-16 (LE/BE) and utf-32 (LE/BE)
 * to utf-16 LE and especially made for automatic conversions of
 * INF-files from utf-8 to utf-16LE (so we can furthermore
 * store the INF files in utf-8 for subversion).
 *
 * Author: Matthias Kupfer (mkupfer@reactos.org)
 */

#include <fstream>
#include <iostream>
#include <string>
#include <string.h>

//#define DISPLAY_DETECTED_UNICODE

#ifdef _MSC_VER
#define strcasecmp _stricmp
#endif

/*
 * Reads a text file in UTF-8, UTF-16 (LE/BE) or UTF-32 (LE/BE) and writes it
 * back out as UTF-16LE, optionally preceded by a BOM.
 *
 * The constructor opens both files and probes the input for a BOM; callers
 * check getError() and then call convert2utf16le().
 */
class utf_converter
{
public:
    // detect can detect utf-8 and both utf-16 variants, but assume utf-32 only
    // due to ambiguous BOM
    enum enc_types { detect, utf8, utf16le, utf16be, utf32le, utf32be };
    enum err_types { none, iopen, oopen, eof, read, write, decode };
    enum bom_types { bom, nobom };
protected:
    // Code points are held in an unsigned long rather than wchar_t: wchar_t is
    // only 16 bits wide on some hosts and cannot hold values above U+FFFF.
    typedef unsigned long codepoint_t;
    // Outcome of decoding one character from the input.
    enum decode_result { dr_ok, dr_end, dr_invalid };

    err_types error;
    enc_types encoding;
    bom_types bom_type;
    unsigned char buffer[4], index; // need 4 char buffer for optional BOM handling
    std::streamsize fill;           // number of not yet consumed bytes in buffer
    std::fstream inputfile, outputfile;
    // Indexed by the low 6 bits of a UTF-8 lead byte (11xxxxxx):
    // lower 3 bits = number of following bytes, upper 5 bits = leading data bits.
    static const unsigned char utf8table[64];
public:
    /*
     * ifname / ofname: paths of the input and output file.
     * ofbom:           whether convert2utf16le() writes a BOM first.
     * enc:             input encoding; detect takes it from the BOM probe.
     * On failure to open a file, getError() returns iopen or oopen.
     */
    utf_converter(const std::string &ifname, const std::string &ofname,
                  bom_types ofbom = bom, enc_types enc = detect)
        : error(none), encoding(enc), bom_type(ofbom), buffer(), index(0), fill(0)
    {
        inputfile.open(ifname.c_str(), std::ios::in | std::ios::binary);
        if (!inputfile)
        {
            error = iopen;
            return;
        }
        outputfile.open(ofname.c_str(), std::ios::out | std::ios::binary);
        if (!outputfile)
        {
            error = oopen;
            return;
        }
        const enc_types bom_enc = getBOM();
        if (enc == detect)
            encoding = bom_enc;
        else if (enc != bom_enc)
            std::cerr << "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << std::endl;
    }

    // Returns the first error recorded so far, or the write error detected at
    // the end of convert2utf16le(); none if everything went fine.
    err_types getError()
    {
        return error;
    }

    /*
     * Reads up to 4 bytes from the input into the look-ahead buffer and
     * inspects them for a BOM. Bytes belonging to a recognised BOM are
     * skipped; all remaining buffered bytes are later handed out by getByte().
     * Returns the encoding implied by the BOM, or utf8 when none is found.
     * Valid first BOM bytes are 0xef, 0xff, 0xfe and 0x00.
     */
    enc_types getBOM()
    {
        index = 0;
        inputfile.read(reinterpret_cast<char*>(buffer), 4);
        fill = inputfile.gcount();

        // UTF-8 BOM: EF BB BF
        if ((fill > 2) &&
            (buffer[0] == 0xef) &&
            (buffer[1] == 0xbb) &&
            (buffer[2] == 0xbf))
        {
            index += 3;
            fill -= 3;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-8 BOM found" << std::endl;
#endif
            return utf8;
        }
        // UTF-16BE BOM: FE FF
        if ((fill > 1) &&
            (buffer[0] == 0xfe) &&
            (buffer[1] == 0xff))
        {
            index += 2;
            fill -= 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16BE BOM found" << std::endl;
#endif
            return utf16be;
        }
        // FF FE is the UTF-16LE BOM, but also the start of the UTF-32LE BOM
        if ((fill > 1) &&
            (buffer[0] == 0xff) &&
            (buffer[1] == 0xfe))
        {
            if ((fill == 4) &&
                (buffer[2] == 0x00) &&
                (buffer[3] == 0x00))
            {
                std::cerr << "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << std::endl;
                fill = 0;   // all four buffered bytes were BOM
                index = 0;
                return utf32le;
            }
            fill -= 2;
            index += 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16LE BOM found" << std::endl;
#endif
            return utf16le;
        }
        // UTF-32BE BOM: 00 00 FE FF
        if ((fill == 4) &&
            (buffer[0] == 0x00) &&
            (buffer[1] == 0x00) &&
            (buffer[2] == 0xfe) &&
            (buffer[3] == 0xff))
        {
            fill = 0;
            index = 0;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-32BE BOM found" << std::endl;
#endif
            return utf32be;
        }
        return utf8; // no valid bom so use utf8 as default
    }

    // Fetches the next input byte into c, serving the look-ahead buffer before
    // touching the file. Returns 1 on success and 0 when no byte is available.
    std::streamsize getByte(unsigned char &c)
    {
        if (fill)
        {
            index %= 4;
            --fill;
            c = buffer[index++];
            return 1;
        }
        inputfile.read(reinterpret_cast<char*>(&c), 1);
        return inputfile.gcount();
    }

    // Fetches a 16-bit unit in the byte order given by the current encoding
    // (little endian for utf16le, big endian otherwise). Returns the number of
    // bytes consumed; w is only valid when that number is 2.
    std::streamsize getWord(unsigned short &w)
    {
        unsigned char c[2];
        if (!getByte(c[0]))
            return 0;
        if (!getByte(c[1]))
            return 1;
        if (encoding == utf16le)
            w = static_cast<unsigned short>(c[0] | (c[1] << 8));
        else
            w = static_cast<unsigned short>(c[1] | (c[0] << 8));
        return 2;
    }

    // Fetches a 32-bit unit (little endian for utf32le, big endian otherwise).
    // Returns the number of bytes consumed; d is only written when it is 4.
    // Kept for compatibility: values are narrowed to the host's wchar_t.
    std::streamsize getDWord(wchar_t &d)
    {
        codepoint_t value = 0;
        const std::streamsize got = readDWord(value);
        if (got == 4)
            d = static_cast<wchar_t>(value);
        return got;
    }

    // Decodes the next character. Returns wchar_t(-1) at end of input and on
    // malformed input. Kept for compatibility: the result is narrowed to the
    // host's wchar_t; convert2utf16le() does not go through this function.
    wchar_t get_wchar_t()
    {
        codepoint_t cp = 0;
        if (nextCodePoint(cp) != dr_ok)
            return wchar_t(-1);
        return static_cast<wchar_t>(cp);
    }

    /*
     * Converts the whole input to UTF-16LE on the output file.
     * - Writes the BOM (FF FE) first when requested.
     * - Code points above U+FFFF are written as a surrogate pair.
     * - Malformed sequences and values above U+10FFFF are written as U+FFFD
     *   and recorded as a decode error; conversion continues.
     * - A failed output stream is recorded as a write error.
     * Does nothing if an error was already recorded (e.g. a file failed to open).
     */
    void convert2utf16le()
    {
        if (error != none)
            return;

        if (bom_type == bom)
            putUnit(0xfeff); // write BOM

        // End of input is reported by the decoder, not by inputfile.eof():
        // the BOM probe already sets eofbit for inputs shorter than 4 bytes
        // while their bytes are still waiting in the look-ahead buffer.
        for (;;)
        {
            codepoint_t cp = 0;
            const decode_result result = nextCodePoint(cp);
            if (result == dr_end)
                break;

            if (result == dr_ok && cp > 0x10ffff)
                std::cerr << "UTF Error: code point beyond U+10FFFF replaced" << std::endl;
            if (result == dr_invalid || cp > 0x10ffff)
            {
                if (error == none)
                    error = decode;
                cp = 0xfffd; // replacement character
            }

            if (cp > 0xffff)
            {
                cp -= 0x10000;
                putUnit(0xd800 | (cp >> 10));   // high surrogate
                putUnit(0xdc00 | (cp & 0x3ff)); // low surrogate
            }
            else
                putUnit(cp);
        }

        outputfile.flush();
        if (!outputfile)
            error = write;
    }
    // No destructor needed: the fstream members close their files themselves.
protected:
    // Writes one UTF-16 code unit in little-endian byte order.
    void putUnit(codepoint_t unit)
    {
        unsigned char bytes[2];
        bytes[0] = static_cast<unsigned char>(unit & 0xff);
        bytes[1] = static_cast<unsigned char>((unit >> 8) & 0xff);
        outputfile.write(reinterpret_cast<const char*>(bytes), 2);
    }

    // Same contract as getDWord(), but assembles the value in unsigned
    // arithmetic so that bytes >= 0x80 in the top position cannot overflow.
    std::streamsize readDWord(codepoint_t &d)
    {
        unsigned char c[4];
        for (int i = 0; i < 4; i++)
            if (!getByte(c[i]))
                return i;
        if (encoding == utf32le)
            d = static_cast<codepoint_t>(c[0]) |
                (static_cast<codepoint_t>(c[1]) << 8) |
                (static_cast<codepoint_t>(c[2]) << 16) |
                (static_cast<codepoint_t>(c[3]) << 24);
        else
            d = static_cast<codepoint_t>(c[3]) |
                (static_cast<codepoint_t>(c[2]) << 8) |
                (static_cast<codepoint_t>(c[1]) << 16) |
                (static_cast<codepoint_t>(c[0]) << 24);
        return 4;
    }

    // Decodes one UTF-8 sequence. A sequence cut short by the end of the
    // input is dropped and reported as dr_end.
    decode_result nextUtf8(codepoint_t &cp)
    {
        unsigned char lead;
        if (!getByte(lead))
            return dr_end;
        if (!(lead & 0x80)) // plain 7-bit character
        {
            cp = lead;
            return dr_ok;
        }
        if ((lead & 0xc0) != 0xc0) // 10xxxxxx cannot start a sequence
        {
            std::cerr << "UTF-8 Error: invalid data byte" << std::endl;
            return dr_invalid;
        }
        const unsigned char entry = utf8table[lead & 0x3f];
        cp = entry >> 3;
        for (unsigned char remaining = entry & 7; remaining; --remaining)
        {
            unsigned char trail;
            if (!getByte(trail))
                return dr_end;
            cp = (cp << 6) | (trail & 0x3f);
        }
        return dr_ok;
    }

    // Decodes one UTF-16 character, combining a high surrogate with the low
    // surrogate that follows it. Units that are not high surrogates are
    // passed through unchanged.
    decode_result nextUtf16(codepoint_t &cp)
    {
        unsigned short first, second;
        if (getWord(first) != 2)
            return dr_end;
        if ((first & 0xfc00) != 0xd800)
        {
            cp = first;
            return dr_ok;
        }
        // high surrogate first
        if (getWord(second) != 2)
            return dr_end;
        if ((second & 0xfc00) != 0xdc00)
        {
            std::cerr << "UTF-16 Error: invalid low surrogate" << std::endl;
            return dr_invalid;
        }
        // adding 0x40 before the shift supplies the 0x10000 offset
        cp = ((static_cast<codepoint_t>(first & 0x3ff) + 0x40) << 10) | (second & 0x3ff);
        return dr_ok;
    }

    // Decodes the next character according to the current encoding.
    decode_result nextCodePoint(codepoint_t &cp)
    {
        switch (encoding)
        {
            case detect: // if still unknown
                encoding = utf8; // assume utf8 as default
                return nextUtf8(cp);
            case utf8:
                return nextUtf8(cp);
            case utf16le:
            case utf16be:
                return nextUtf16(cp);
            case utf32le:
            case utf32be:
                return (readDWord(cp) == 4) ? dr_ok : dr_end;
        }
        return dr_end;
    }
};

// entry = (leading data bits << 3) | number of following bytes
const unsigned char utf_converter::utf8table[64] = {
    1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
    129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,
    2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
    3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7
};


/*
 * argv[1]: input file, argv[2]: output file,
 * argv[3]: optional "nobom" (case-insensitive) to suppress the output BOM.
 * Returns 0 on success and -1 on usage, open or write errors.
 */
int main(int argc, char* argv[])
{
    if (argc < 3)
    {
        std::cout << "usage: " << argv[0] << " inputfile outputfile [nobom]" << std::endl;
        return -1;
    }

    utf_converter::bom_types bom_type = utf_converter::bom;

    if (argc == 4 && strcasecmp(argv[3], "nobom") == 0)
    {
        bom_type = utf_converter::nobom;
    }

    utf_converter conv(argv[1], argv[2], bom_type);

    const utf_converter::err_types err = conv.getError();
    if (err != utf_converter::none)
    {
        switch (err)
        {
            case utf_converter::iopen:
                std::cerr << "Couldn't open input file." << std::endl;
                break;
            case utf_converter::oopen:
                std::cerr << "Couldn't open output file." << std::endl;
                break;
            default:
                std::cerr << "Unknown error." << std::endl;
        }
        return -1;
    }

    conv.convert2utf16le();

    // Decode problems were already reported on stderr and replaced in the
    // output; only an incomplete output file is treated as fatal.
    if (conv.getError() == utf_converter::write)
    {
        std::cerr << "Couldn't write output file." << std::endl;
        return -1;
    }

    return 0;
}