xref: /reactos/sdk/tools/utf16le/utf16le.cpp (revision 2ea03b5b)
/*
 * Usage: utf16le inputfile outputfile [nobom]
 *
 * This is a tool and is compiled using the host compiler,
 * i.e. on Linux gcc and not mingw-gcc (cross-compiler).
 * It is a converter from utf-8, utf-16 (LE/BE) and utf-32 (LE/BE)
 * to utf-16 LE, made especially for automatic conversion of
 * INF files from utf-8 to utf-16LE (so that the INF files can be
 * stored as utf-8 in the version control system).
 * Passing "nobom" as third argument suppresses the BOM in the output.
 *
 * Author: Matthias Kupfer (mkupfer@reactos.org)
 */
13c2c66affSColin Finck 
14c2c66affSColin Finck #include <fstream>
15c2c66affSColin Finck #include <iostream>
16c2c66affSColin Finck #include <string.h>
17c2c66affSColin Finck 
18c2c66affSColin Finck //#define DISPLAY_DETECTED_UNICODE
19c2c66affSColin Finck 
20c2c66affSColin Finck using namespace std;
21c2c66affSColin Finck 
22c2c66affSColin Finck #ifdef _MSC_VER
23c2c66affSColin Finck #define strcasecmp _stricmp
24c2c66affSColin Finck #endif
25c2c66affSColin Finck 
// Reads a UTF-8, UTF-16 (LE/BE) or UTF-32 (LE/BE) file and writes it as UTF-16LE.
//
// The constructor opens both files and inspects the first bytes of the input
// for a BOM; convert2utf16le() then performs the conversion. getError()
// reports open failures after construction, and decode/write problems after
// convert2utf16le() has run.
class utf_converter
{
public:
    // detect can detect utf-8 and both utf-16 variants, but assume utf-32 only
    // due to ambiguous BOM
    enum enc_types { detect, utf8, utf16le, utf16be, utf32le, utf32be };
    enum err_types { none, iopen, oopen, eof, read, write, decode };
    enum bom_types { bom, nobom };
protected:
    // Code points are handled as unsigned long (at least 32 bits) because
    // wchar_t is only 16 bits wide on some host compilers.
    typedef unsigned long codepoint;
    // Result of trying to decode one code point from the input.
    enum decode_status { decoded, malformed, exhausted };
    enum { replacement_char = 0xfffd, max_codepoint = 0x10ffff };

    err_types error;
    enc_types encoding;
    bom_types bom_type;
    unsigned char buffer[4], index; // need 4 char buffer for optional BOM handling
    std::streamsize fill;           // number of not yet consumed bytes in buffer
    std::fstream inputfile, outputfile;
    static const unsigned char utf8table[64];

    // Reads four bytes and assembles them according to the current encoding
    // (little endian for utf32le, big endian otherwise).
    // Returns the number of bytes obtained; d is only written when it is 4.
    std::streamsize readDWord(codepoint &d)
    {
        unsigned char c[4];
        for (int i = 0; i < 4; i++)
            if (!getByte(c[i]))
                return i;
        if (encoding == utf32le)
            d = codepoint(c[0]) | (codepoint(c[1]) << 8) | (codepoint(c[2]) << 16) | (codepoint(c[3]) << 24);
        else
            d = codepoint(c[3]) | (codepoint(c[2]) << 8) | (codepoint(c[1]) << 16) | (codepoint(c[0]) << 24);
        return 4;
    }

    // Decodes one UTF-8 sequence. Continuation bytes are not validated; only
    // their lower six bits are used.
    decode_status decode_utf8(codepoint &cp)
    {
        unsigned char lead = 0;
        if (!getByte(lead))
            return exhausted;
        if (!(lead & 0x80))
        {
            cp = lead; // plain ASCII
            return decoded;
        }
        if ((lead & 0xc0) != 0xc0)
        {
            // a continuation byte (10xxxxxx) where a lead byte was expected
            std::cerr << "UTF-8 Error: invalid data byte" << std::endl;
            return malformed;
        }
        // table for 64 bytes (all 11xxxxxx resp. >=192)
        // lower 3 bits: number of following bytes
        // upper 5 bits: payload bits of the lead byte
        const unsigned char entry = utf8table[lead & 0x3f];
        cp = entry >> 3;
        for (unsigned char remaining = entry & 7; remaining; --remaining)
        {
            unsigned char next = 0;
            if (!getByte(next))
                return exhausted; // input ended inside a sequence
            cp = (cp << 6) | (next & 0x3f);
        }
        return decoded;
    }

    // Decodes one UTF-16 unit, or a surrogate pair when the first unit is a
    // high surrogate. Other units (including a lone low surrogate) are
    // passed through unchanged.
    decode_status decode_utf16(codepoint &cp)
    {
        unsigned short w = 0, w2 = 0;
        if (getWord(w) != 2)
            return exhausted;
        if ((w & 0xfc00) != 0xd800)
        {
            cp = w;
            return decoded;
        }
        if (getWord(w2) != 2)
            return exhausted;
        if ((w2 & 0xfc00) != 0xdc00)
        {
            std::cerr << "UTF-16 Error: invalid low surrogate" << std::endl;
            return malformed;
        }
        // 0x40 << 10 == 0x10000, the offset of the supplementary planes
        cp = ((codepoint(w & 0x3ff) + 0x40) << 10) | codepoint(w2 & 0x3ff);
        return decoded;
    }

    // Decodes the next code point using the current encoding.
    decode_status decode_next(codepoint &cp)
    {
        switch (encoding)
        {
            case detect: // if still unknown
                encoding = utf8; // assume utf8 as default
                // fall through
            case utf8:
                return decode_utf8(cp);
            case utf16le:
            case utf16be:
                return decode_utf16(cp);
            case utf32le:
            case utf32be:
                return (readDWord(cp) == 4) ? decoded : exhausted;
        }
        return exhausted;
    }

    // Writes one 16-bit code unit to the output file, low byte first.
    void put_unit(codepoint unit)
    {
        const unsigned char bytes[2] = {
            static_cast<unsigned char>(unit & 0xff),
            static_cast<unsigned char>((unit >> 8) & 0xff)
        };
        outputfile.write(reinterpret_cast<const char*>(bytes), 2);
    }

public:
    // Opens ifname for reading and ofname for writing (both binary) and
    // determines the input encoding. ofbom selects whether the output starts
    // with a BOM. When enc is not detect it overrides the detected encoding;
    // a warning is printed if the two disagree. On failure to open a file
    // getError() returns iopen or oopen.
    utf_converter(std::string ifname, std::string ofname, bom_types ofbom = bom, enc_types enc = detect)
        : error(none), encoding(enc), bom_type(ofbom), buffer(), index(0), fill(0)
    {
        inputfile.open(ifname.c_str(), std::ios::in | std::ios::binary);
        if (!inputfile)
        {
            error = iopen;
            return;
        }
        outputfile.open(ofname.c_str(), std::ios::out | std::ios::binary);
        if (!outputfile)
        {
            error = oopen;
            return;
        }
        const enc_types detected = getBOM();
        if (enc == detect)
            encoding = detected;
        else if (enc != detected)
            std::cerr << "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << std::endl;
    }

    // Returns the current error state (none if everything went fine so far).
    err_types getError()
    {
        return error;
    }

    // Reads up to four bytes from the input into buffer and looks for a BOM.
    // Bytes belonging to a BOM are skipped; all other buffered bytes stay
    // available to getByte(). Returns the encoding indicated by the BOM, or
    // utf8 when no BOM is present.
    enc_types getBOM()
    {
        index = 0;
        /* first byte can also detect with:
        if ((buffer[0] & 0x11) || !buffer[0]))
        valid values are 0xef, 0xff, 0xfe, 0x00
        */
        // For inputs shorter than four bytes this read leaves the stream in
        // a failed/eof state; the bytes that were read are still served
        // from buffer, and later reads simply yield nothing.
        inputfile.read(reinterpret_cast<char*>(buffer), 4);
        fill = inputfile.gcount();
        // utf8 bom
        if ((fill > 2) &&
            (buffer[0] == 0xef) &&
            (buffer[1] == 0xbb) &&
            (buffer[2] == 0xbf))
        {
            index += 3;
            fill -= 3;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-8 BOM found" << std::endl;
#endif
            return utf8;
        }
        if ((fill > 1) &&
            (buffer[0] == 0xfe) &&
            (buffer[1] == 0xff))
        {
            index += 2;
            fill -= 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16BE BOM found" << std::endl;
#endif
            return utf16be;
        }
        if ((fill > 1) &&
            (buffer[0] == 0xff) &&
            (buffer[1] == 0xfe))
        {
            if ((fill == 4) &&
                (buffer[2] == 0x00) &&
                (buffer[3] == 0x00))
            {
                // FF FE 00 00 is either a UTF-32LE BOM or a UTF-16LE BOM
                // followed by U+0000
                std::cerr << "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << std::endl;
                fill = 0;
                index = 0;
                return utf32le;
            }
            fill -= 2;
            index += 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16LE BOM found" << std::endl;
#endif
            return utf16le;
        }
        if ((fill == 4) &&
            (buffer[0] == 0x00) &&
            (buffer[1] == 0x00) &&
            (buffer[2] == 0xfe) &&
            (buffer[3] == 0xff))
        {
            fill = 0;
            index = 0;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-32BE BOM found" << std::endl;
#endif
            return utf32be;
        }
        return utf8; // no valid bom so use utf8 as default
    }

    // Fetches the next input byte, first from the bytes buffered by getBOM()
    // and afterwards from the file. Returns 1 on success, 0 if no byte is left.
    std::streamsize getByte(unsigned char &c)
    {
        if (fill)
        {
            index %= 4;
            --fill;
            c = buffer[index++];
            return 1;
        }
        inputfile.read(reinterpret_cast<char*>(&c), 1);
        return inputfile.gcount();
    }

    // Fetches two bytes and combines them according to the current encoding
    // (little endian for utf16le, big endian otherwise).
    // Returns the number of bytes obtained; w is only written when it is 2.
    std::streamsize getWord(unsigned short &w)
    {
        unsigned char c[2];
        if (!getByte(c[0]))
            return 0;
        if (!getByte(c[1]))
            return 1;
        if (encoding == utf16le)
            w = static_cast<unsigned short>(c[0] | (c[1] << 8));
        else
            w = static_cast<unsigned short>(c[1] | (c[0] << 8));
        return 2;
    }

    // Fetches four bytes as one value (see readDWord) and stores it in d.
    // Returns the number of bytes obtained; d is only written when it is 4.
    // Note that values wider than wchar_t are truncated by the conversion.
    std::streamsize getDWord(wchar_t &d)
    {
        codepoint value = 0;
        const std::streamsize got = readDWord(value);
        if (got == 4)
            d = static_cast<wchar_t>(value);
        return got;
    }

    // Decodes the next code point and returns it as wchar_t, or wchar_t(-1)
    // when the input is exhausted or the data is malformed.
    // convert2utf16le() does not use this wrapper because wchar_t may be too
    // narrow for code points above U+FFFF.
    wchar_t get_wchar_t()
    {
        codepoint cp = 0;
        if (decode_next(cp) != decoded)
            return wchar_t(-1);
        return static_cast<wchar_t>(cp);
    }

    // Converts the whole input to UTF-16LE and writes it to the output file,
    // preceded by a BOM if requested at construction.
    // Code points above U+FFFF are written as surrogate pairs. Malformed
    // input and values above U+10FFFF are written as U+FFFD and make
    // getError() return decode; a failing output stream makes it return write.
    void convert2utf16le()
    {
        if (error == iopen || error == oopen)
            return; // nothing to convert, a file could not be opened

        if (bom_type == bom)
            put_unit(0xfeff); // write BOM

        for (;;)
        {
            codepoint cp = 0;
            const decode_status status = decode_next(cp);
            // Stop on the decoder's own end-of-input signal rather than on
            // inputfile.eof(): for inputs shorter than four bytes eof is
            // already set while buffered bytes are still pending.
            if (status == exhausted)
                break;
            if (status == malformed || cp > max_codepoint)
            {
                error = decode;
                cp = replacement_char;
            }
            if (cp > 0xffff)
            {
                cp -= 0x10000;
                put_unit(0xd800 | (cp >> 10));   // high surrogate
                put_unit(0xdc00 | (cp & 0x3ff)); // low surrogate
            }
            else
                put_unit(cp);
        }

        outputfile.flush();
        if (!outputfile)
            error = write;
    }
};

// Lookup table for UTF-8 lead bytes 0xc0..0xff, indexed by (byte & 0x3f).
// Each entry is (payload bits of the lead byte << 3) | number of following bytes.
const unsigned char utf_converter::utf8table[64] = {
    1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
    129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,
    2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
    3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7
};
273c2c66affSColin Finck 
274c2c66affSColin Finck 
main(int argc,char * argv[])275c2c66affSColin Finck int main(int argc, char* argv[])
276c2c66affSColin Finck {
277c2c66affSColin Finck     utf_converter::err_types err;
278c2c66affSColin Finck 
279c2c66affSColin Finck     if (argc < 3)
280c2c66affSColin Finck     {
281c2c66affSColin Finck         cout << "usage: " << argv[0] << " inputfile outputfile" << endl;
282c2c66affSColin Finck         return -1;
283c2c66affSColin Finck     }
284c2c66affSColin Finck 
285c2c66affSColin Finck     utf_converter::bom_types bom_type = utf_converter::bom;
286c2c66affSColin Finck 
287c2c66affSColin Finck     if (argc == 4 && strcasecmp(argv[3], "nobom") == 0)
288c2c66affSColin Finck     {
289c2c66affSColin Finck         bom_type = utf_converter::nobom;
290c2c66affSColin Finck     }
291c2c66affSColin Finck 
292c2c66affSColin Finck     utf_converter conv(argv[1], argv[2], bom_type);
293c2c66affSColin Finck 
294c2c66affSColin Finck     if ((err = conv.getError())!=utf_converter::none)
295c2c66affSColin Finck     {
296c2c66affSColin Finck         switch (err)
297c2c66affSColin Finck         {
298c2c66affSColin Finck             case utf_converter::iopen:
299c2c66affSColin Finck                 cerr << "Couldn't open input file." << endl;
300c2c66affSColin Finck                 break;
301c2c66affSColin Finck             case utf_converter::oopen:
302c2c66affSColin Finck                 cerr << "Couldn't open output file." << endl;
303c2c66affSColin Finck                 break;
304c2c66affSColin Finck             default:
305c2c66affSColin Finck                 cerr << "Unknown error." << endl;
306c2c66affSColin Finck         }
307c2c66affSColin Finck         return -1;
308c2c66affSColin Finck     }
309c2c66affSColin Finck     else
310c2c66affSColin Finck     {
311c2c66affSColin Finck         conv.convert2utf16le();
312c2c66affSColin Finck     }
313c2c66affSColin Finck 
314c2c66affSColin Finck     return 0;
315c2c66affSColin Finck }
316