xref: /reactos/sdk/tools/utf16le/utf16le.cpp (revision 9cfd8dd9)
1 /*
2  * Usage: utf16le inputfile outputfile
3  *
4  * This is a tool and is compiled using the host compiler,
5  * i.e. on Linux gcc and not mingw-gcc (cross-compiler).
6  * It's a converter from utf-8, utf-16 (LE/BE) and utf-32 (LE/BE)
7  * to utf-16 LE and especially made for automatic conversions of
 * INF-files from utf-8 to utf-16LE (so we can furthermore
 * store the INF files in utf-8 for subversion).
10  *
11  * Author: Matthias Kupfer (mkupfer@reactos.org)
12  */
13 
14 #include <fstream>
15 #include <iostream>
16 #include <string.h>
17 
18 //#define DISPLAY_DETECTED_UNICODE
19 
20 using namespace std;
21 
22 #ifdef _MSC_VER
23 #define strcasecmp _stricmp
24 #endif
25 
/*
 * Reads a text file encoded as UTF-8, UTF-16 (LE/BE) or UTF-32 (LE/BE) and
 * writes it as UTF-16LE, optionally preceded by a byte order mark.
 *
 * The input encoding is taken from the BOM unless one is forced through the
 * constructor; input without a recognised BOM is treated as UTF-8.
 * Both files are opened by the constructor and closed by the fstream
 * destructors; check getError() before calling convert2utf16le().
 */
class utf_converter
{
public:
    // detect can detect utf-8 and both utf-16 variants, but assume utf-32 only
    // due to ambiguous BOM
    enum enc_types { detect, utf8, utf16le, utf16be, utf32le, utf32be };
    enum err_types { none, iopen, oopen, eof, read, write, decode };
    enum bom_types { bom, nobom };
protected:
    // result of decoding one character from the input
    enum read_status { got_char, end_of_input, bad_sequence };

    err_types error;
    enc_types encoding;
    bom_types bom_type;
    unsigned char buffer[4], index; // need 4 char buffer for optional BOM handling
    std::streamsize fill;           // number of not yet consumed bytes in buffer
    std::fstream inputfile,outputfile;
    static const unsigned char utf8table[64];

    // Reports a malformed sequence on stderr and records the decode error.
    read_status malformed(const char *message)
    {
        std::cerr << message << std::endl;
        error = decode;
        return bad_sequence;
    }

    // The input ended in the middle of a character: the partial character is
    // dropped, the condition is recorded and decoding stops.
    read_status unexpectedEnd()
    {
        std::cerr << "UTF Error: unexpected end of input" << std::endl;
        error = decode;
        return end_of_input;
    }

    // Reads four bytes and combines them according to the utf-32 byte order.
    // Returns the number of bytes that could be read; d is set only for 4.
    std::streamsize readDWord(unsigned long &d)
    {
        unsigned char c[4];
        for (int i = 0; i < 4; i++)
            if (!getByte(c[i]))
                return i;
        // widen first: shifting a promoted (signed) int by 24 can overflow
        const unsigned long b0 = c[0], b1 = c[1], b2 = c[2], b3 = c[3];
        if (encoding == utf32le)
            d = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
        else
            d = b3 | (b2 << 8) | (b1 << 16) | (b0 << 24);
        return 4;
    }

    // Decodes one utf-8 sequence. Continuation bytes are not validated,
    // only their lower six bits are used.
    read_status nextUtf8(unsigned long &cp)
    {
        unsigned char lead;
        if (!getByte(lead))
            return end_of_input;
        if (!(lead & 0x80)) // plain 7 bit character
        {
            cp = lead;
            return got_char;
        }
        if ((lead & 0xc0) != 0xc0) // 10xxxxxx cannot start a sequence
            return malformed("UTF-8 Error: invalid data byte");
        // table for 64 bytes (all 11xxxxxx resp. >=192):
        // lower 3 bits: number of following bytes
        // upper 5 bits: data bits of the lead byte
        const unsigned char entry = utf8table[lead & 0x3f];
        cp = entry >> 3;
        for (unsigned char remaining = entry & 7; remaining; --remaining)
        {
            unsigned char next;
            if (!getByte(next))
                return unexpectedEnd();
            cp = (cp << 6) | (next & 0x3f);
        }
        return got_char;
    }

    // Decodes one utf-16 character, joining a surrogate pair if present.
    read_status nextUtf16(unsigned long &cp)
    {
        unsigned short unit;
        const std::streamsize got = getWord(unit);
        if (got == 0)
            return end_of_input;
        if (got != 2)
            return unexpectedEnd();
        if ((unit & 0xfc00) != 0xd800) // anything but a high surrogate
        {
            cp = unit;
            return got_char;
        }
        unsigned short low;
        if (getWord(low) != 2)
            return unexpectedEnd();
        if ((low & 0xfc00) != 0xdc00)
            return malformed("UTF-16 Error: invalid low surrogate");
        cp = 0x10000UL + ((unit & 0x3ffUL) << 10) + (low & 0x3ffUL);
        return got_char;
    }

    // Decodes one utf-32 character.
    read_status nextUtf32(unsigned long &cp)
    {
        const std::streamsize got = readDWord(cp);
        if (got == 0)
            return end_of_input;
        if (got != 4)
            return unexpectedEnd();
        return got_char;
    }

    // Decodes the next character of the input in the current encoding.
    read_status nextCodePoint(unsigned long &cp)
    {
        switch (encoding)
        {
            case detect: // if still unknown
                encoding = utf8; // assume utf8 as default
                // fall through
            case utf8:
                return nextUtf8(cp);
            case utf16le:
            case utf16be:
                return nextUtf16(cp);
            case utf32le:
            case utf32be:
                return nextUtf32(cp);
        }
        return end_of_input;
    }

    // Writes one 16 bit code unit, low byte first.
    void putUnit(unsigned long unit)
    {
        const char bytes[2] = { static_cast<char>(unit & 0xff),
                                static_cast<char>((unit >> 8) & 0xff) };
        outputfile.write(bytes, 2);
    }
public:
    /*
     * Opens ifname for reading and ofname for writing (both binary) and
     * determines the input encoding.
     *   ofbom - whether convert2utf16le() writes a BOM first
     *   enc   - detect to trust the BOM, otherwise the encoding to force
     * On failure getError() returns iopen or oopen.
     */
    utf_converter(const std::string &ifname, const std::string &ofname, bom_types ofbom = bom, enc_types enc = detect)
        : error(none), encoding(enc), bom_type(ofbom), index(0), fill(0)
    {
        inputfile.open(ifname.c_str(), std::ios::in | std::ios::binary);
        if (!inputfile)
        {
            error = iopen;
            return;
        }
        outputfile.open(ofname.c_str(), std::ios::out | std::ios::binary);
        if (!outputfile)
        {
            error = oopen;
            return;
        }
        const enc_types found = getBOM();
        if (enc == detect)
            encoding = found;
        else if (enc != found)
            std::cerr << "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << std::endl;
    }
    // Returns the last recorded error, none if everything went fine so far.
    err_types getError()
    {
        return error;
    }
    /*
     * Reads up to four bytes into the look-ahead buffer and checks them for a
     * BOM. A recognised BOM is skipped, all other buffered bytes stay
     * available to getByte(). Returns the encoding the BOM stands for, or
     * utf8 when there is no BOM.
     */
    enc_types getBOM()
    {
        index = 0;
        inputfile.read(reinterpret_cast<char*>(buffer), 4);
        fill = inputfile.gcount();
        // utf-8 bom: EF BB BF
        if ((fill > 2) &&
            (buffer[0] == 0xef) &&
            (buffer[1] == 0xbb) &&
            (buffer[2] == 0xbf))
        {
            index += 3;
            fill -= 3;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-8 BOM found" << std::endl;
#endif
            return utf8;
        }
        // utf-16 big endian bom: FE FF
        if ((fill > 1) &&
            (buffer[0] == 0xfe) &&
            (buffer[1] == 0xff))
        {
            index += 2;
            fill -= 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16BE BOM found" << std::endl;
#endif
            return utf16be;
        }
        // FF FE is the utf-16 little endian bom, FF FE 00 00 the utf-32 one
        if ((fill > 1) &&
            (buffer[0] == 0xff) &&
            (buffer[1] == 0xfe))
        {
            if ((fill == 4) &&
                (buffer[2] == 0x00) &&
                (buffer[3] == 0x00))
            {
                std::cerr << "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << std::endl;
                fill = 0; // all four buffered bytes belong to the bom
                index = 0;
                return utf32le;
            }
            fill -= 2;
            index += 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16LE BOM found" << std::endl;
#endif
            return utf16le;
        }
        // utf-32 big endian bom: 00 00 FE FF
        if ((fill == 4) &&
            (buffer[0] == 0x00) &&
            (buffer[1] == 0x00) &&
            (buffer[2] == 0xfe) &&
            (buffer[3] == 0xff))
        {
            fill = 0;
            index = 0;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-32BE BOM found" << std::endl;
#endif
            return utf32be;
        }
        return utf8; // no valid bom so use utf8 as default
    }
    // Delivers the next input byte, serving the look-ahead buffer first.
    // Returns 1 on success and 0 if no byte is left.
    std::streamsize getByte(unsigned char &c)
    {
        if (fill)
        {
            index %= 4;
            --fill;
            c = buffer[index++];
            return 1;
        }
        inputfile.read(reinterpret_cast<char*>(&c), 1);
        return inputfile.gcount();
    }
    // Reads one 16 bit unit in the byte order of the current encoding
    // (little endian for utf16le, big endian otherwise).
    // Returns the number of bytes read; w is set only for 2.
    std::streamsize getWord(unsigned short &w)
    {
        unsigned char c[2];
        if (!getByte(c[0]))
            return 0;
        if (!getByte(c[1]))
            return 1;
        if (encoding == utf16le)
            w = static_cast<unsigned short>(c[0] | (c[1] << 8));
        else
            w = static_cast<unsigned short>(c[1] | (c[0] << 8));
        return 2;
    }
    // Reads one 32 bit unit in the byte order of the current encoding
    // (little endian for utf32le, big endian otherwise).
    // Returns the number of bytes read; d is set only for 4.
    std::streamsize getDWord(wchar_t &d)
    {
        unsigned long value;
        const std::streamsize got = readDWord(value);
        if (got == 4)
            d = static_cast<wchar_t>(value);
        return got;
    }
    // Returns the next decoded character, or wchar_t(-1) at the end of the
    // input and for data that cannot be decoded.
    wchar_t get_wchar_t()
    {
        unsigned long cp;
        if (nextCodePoint(cp) != got_char)
            return wchar_t(-1);
        return static_cast<wchar_t>(cp);
    }
    /*
     * Converts the whole input and writes it as utf-16 little endian.
     * Characters above U+FFFF are written as surrogate pairs, undecodable
     * data is replaced by U+FFFD and recorded as decode error. A failing
     * output stream is recorded as write error.
     */
    void convert2utf16le()
    {
        if (error == iopen || error == oopen)
            return; // nothing to convert without both files

        if (bom_type == bom)
            putUnit(0xfeff); // write BOM

        // The loop ends when the decoder runs out of bytes. The stream's eof
        // flag is no use here: it is already set after the BOM look-ahead
        // when the input is shorter than four bytes.
        unsigned long cp;
        for (;;)
        {
            const read_status status = nextCodePoint(cp);
            if (status == end_of_input)
                break;
            if (status == bad_sequence)
                cp = 0xfffd;
            else if (cp > 0x10ffffUL)
            {
                std::cerr << "UTF Error: code point out of range" << std::endl;
                error = decode;
                cp = 0xfffd;
            }
            if (cp >= 0x10000UL)
            {
                cp -= 0x10000UL;
                putUnit(0xd800UL | (cp >> 10));   // high surrogate
                putUnit(0xdc00UL | (cp & 0x3ff)); // low surrogate
            }
            else
                putUnit(cp);
        }

        outputfile.flush();
        if (!outputfile)
            error = write;
    }
};

// Lookup table for the utf-8 lead bytes 0xc0..0xff, indexed by (byte & 0x3f).
// Each entry holds the number of following bytes in its lower 3 bits and the
// data bits of the lead byte in its upper 5 bits.
const unsigned char utf_converter::utf8table[64] = {
1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,
2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7
};
273 
274 
275 int main(int argc, char* argv[])
276 {
277     utf_converter::err_types err;
278 
279     if (argc < 3)
280     {
281         cout << "usage: " << argv[0] << " inputfile outputfile" << endl;
282         return -1;
283     }
284 
285     utf_converter::bom_types bom_type = utf_converter::bom;
286 
287     if (argc == 4 && strcasecmp(argv[3], "nobom") == 0)
288     {
289         bom_type = utf_converter::nobom;
290     }
291 
292     utf_converter conv(argv[1], argv[2], bom_type);
293 
294     if ((err = conv.getError())!=utf_converter::none)
295     {
296         switch (err)
297         {
298             case utf_converter::iopen:
299                 cerr << "Couldn't open input file." << endl;
300                 break;
301             case utf_converter::oopen:
302                 cerr << "Couldn't open output file." << endl;
303                 break;
304             default:
305                 cerr << "Unknown error." << endl;
306         }
307         return -1;
308     }
309     else
310     {
311         conv.convert2utf16le();
312     }
313 
314     return 0;
315 }
316