/*
 * Usage: utf16le inputfile outputfile [nobom]
 *
 * This is a tool and is compiled using the host compiler,
 * i.e. on Linux gcc and not mingw-gcc (cross-compiler).
 * It's a converter from utf-8, utf-16 (LE/BE) and utf-32 (LE/BE)
 * to utf-16 LE and especially made for automatic conversions of
 * INF-files from utf-8 to utf-16LE (so we can furthermore
 * store the INF files in utf-8 for subversion).
 *
 * Author: Matthias Kupfer (mkupfer@reactos.org)
 */
13c2c66affSColin Finck
#include <fstream>
#include <iostream>
#include <string>
#include <string.h>

//#define DISPLAY_DETECTED_UNICODE

using namespace std;

#ifdef _MSC_VER
#define strcasecmp _stricmp
#else
#include <strings.h> // POSIX home of strcasecmp
#endif
25c2c66affSColin Finck
/*
 * utf_converter reads a file encoded as UTF-8, UTF-16 (LE/BE) or
 * UTF-32 (LE/BE) and writes its contents as UTF-16LE, optionally
 * preceded by a byte order mark.
 *
 * The input encoding is taken from the BOM unless one is forced through the
 * constructor; input without a recognised BOM is treated as UTF-8.
 *
 * Status is reported through getError():
 *   iopen / oopen - the input / output file could not be opened
 *   decode        - malformed input was found during conversion (each bad
 *                   sequence is replaced by U+FFFD and conversion continues)
 *   write         - the output stream failed while writing
 */
class utf_converter
{
public:
    // detect can detect utf-8 and both utf-16 variants, but assume utf-32 only
    // due to ambiguous BOM
    enum enc_types { detect, utf8, utf16le, utf16be, utf32le, utf32be };
    enum err_types { none, iopen, oopen, eof, read, write, decode };
    enum bom_types { bom, nobom };

protected:
    err_types error;
    enc_types encoding;
    bom_types bom_type;
    unsigned char buffer[4], index; // need 4 char buffer for optional BOM handling
    std::streamsize fill;           // bytes of `buffer` not yet handed out by getByte()
    std::fstream inputfile, outputfile;
    static const unsigned char utf8table[64];

private:
    // Outcome of decoding one code point from the input.
    enum read_status
    {
        cp_ok,  // a code point was decoded
        cp_end, // the input ran out (possibly in the middle of a sequence)
        cp_bad  // malformed data was consumed; a message was printed to cerr
    };

    // Reads up to `count` bytes into `dst`; returns how many were obtained.
    std::streamsize fetchBytes(unsigned char *dst, int count)
    {
        for (int i = 0; i < count; ++i)
        {
            if (!getByte(dst[i]))
                return i;
        }
        return count;
    }

    // Reads one 32-bit unit in the byte order selected by `encoding`.
    // Returns the number of bytes obtained; `value` is only set when it is 4.
    std::streamsize getDWord32(unsigned long &value)
    {
        unsigned char c[4];
        const std::streamsize got = fetchBytes(c, 4);
        if (got != 4)
            return got;
        // Widen before shifting so that a high byte never reaches an int sign bit.
        const unsigned long b0 = c[0], b1 = c[1], b2 = c[2], b3 = c[3];
        if (encoding == utf32le)
            value = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
        else
            value = b3 | (b2 << 8) | (b1 << 16) | (b0 << 24);
        return 4;
    }

    // Decodes one UTF-8 sequence.
    read_status decodeUtf8(unsigned long &cp)
    {
        unsigned char lead;
        if (!getByte(lead))
            return cp_end;
        if (!(lead & 0x80))
        {
            cp = lead; // plain ASCII
            return cp_ok;
        }
        if ((lead & 0xc0) != 0xc0)
        {
            // continuation byte where a lead byte was expected
            std::cerr << "UTF-8 Error: invalid data byte" << std::endl;
            return cp_bad;
        }
        // table for 64 bytes (all 11xxxxxx resp. >=192)
        // lower 3 bits: number of following bytes
        // upper 5 bits: data filled with 0
        const unsigned char entry = utf8table[lead & 0x3f];
        cp = entry >> 3;
        for (unsigned char remaining = entry & 7; remaining; --remaining)
        {
            unsigned char next;
            if (!getByte(next))
                return cp_end; // sequence truncated by end of input
            if ((next & 0xc0) != 0x80)
            {
                std::cerr << "UTF-8 Error: invalid data byte" << std::endl;
                return cp_bad;
            }
            cp = (cp << 6) | (next & 0x3f);
            // Saturate just above the Unicode range so that the obsolete
            // 5..8 byte forms can neither overflow nor wrap back into range.
            if (cp > 0x10ffffUL)
                cp = 0x110000UL;
        }
        return cp_ok;
    }

    // Decodes one UTF-16 unit, or a surrogate pair, in the selected byte order.
    read_status decodeUtf16(unsigned long &cp)
    {
        unsigned short first, second;
        if (getWord(first) != 2)
            return cp_end;
        if ((first & 0xfc00) != 0xd800) // anything but a high surrogate
        {
            cp = first;
            return cp_ok;
        }
        if (getWord(second) != 2)
            return cp_end;
        if ((second & 0xfc00) != 0xdc00)
        {
            std::cerr << "UTF-16 Error: invalid low surrogate" << std::endl;
            return cp_bad;
        }
        // (high & 0x3ff) + 0x40 folds the 0x10000 offset into the upper bits.
        cp = ((static_cast<unsigned long>(first & 0x3ff) + 0x40) << 10) | (second & 0x3ff);
        return cp_ok;
    }

    // Decodes the next code point according to `encoding` and rejects values
    // beyond U+10FFFF, which UTF-16 cannot represent.
    read_status readCodePoint(unsigned long &cp)
    {
        read_status status = cp_end;
        switch (encoding)
        {
        case detect:         // if still unknown
            encoding = utf8; // assume utf8 as default
            // fall through
        case utf8:
            status = decodeUtf8(cp);
            break;
        case utf16le:
        case utf16be:
            status = decodeUtf16(cp);
            break;
        case utf32le:
        case utf32be:
            status = (getDWord32(cp) == 4) ? cp_ok : cp_end;
            break;
        }
        if (status == cp_ok && cp > 0x10ffffUL)
        {
            std::cerr << "UTF Error: code point out of range" << std::endl;
            return cp_bad;
        }
        return status;
    }

    // Writes one 16-bit code unit, least significant byte first.
    void putUnit(unsigned long unit)
    {
        unsigned char bytes[2];
        bytes[0] = static_cast<unsigned char>(unit & 0xff);
        bytes[1] = static_cast<unsigned char>((unit >> 8) & 0xff);
        outputfile.write(reinterpret_cast<char*>(bytes), 2);
    }

public:
    // Opens both files and determines the input encoding.
    //   ifname / ofname - paths of the input and output file
    //   ofbom           - whether convert2utf16le() emits a BOM
    //   enc             - input encoding; `detect` takes it from the BOM,
    //                     any other value overrides what the BOM says
    // On failure getError() returns iopen or oopen.
    utf_converter(std::string ifname, std::string ofname, bom_types ofbom = bom, enc_types enc = detect)
        : error(none), encoding(enc), bom_type(ofbom), index(0), fill(0)
    {
        buffer[0] = buffer[1] = buffer[2] = buffer[3] = 0;

        inputfile.open(ifname.c_str(), std::ios::in | std::ios::binary);
        if (!inputfile)
        {
            error = iopen;
            return;
        }
        outputfile.open(ofname.c_str(), std::ios::out | std::ios::binary);
        if (!outputfile)
        {
            error = oopen;
            return;
        }

        const enc_types bom_enc = getBOM();
        if (enc == detect)
            encoding = bom_enc;
        else if (enc != bom_enc)
            std::cerr << "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << std::endl;
    }

    // Returns the current status (see the class comment).
    err_types getError() const
    {
        return error;
    }

    // Reads the first (up to) four bytes of the input and classifies the BOM.
    // Bytes that belong to a BOM are dropped; the others stay in `buffer` and
    // are handed out by getByte() first. Returns utf8 when no BOM is found.
    enc_types getBOM()
    {
        index = 0;
        /* first byte can also detect with:
           if ((buffer[0] & 0x11) || !buffer[0]))
           valid values are 0xef, 0xff, 0xfe, 0x00
        */
        inputfile.read(reinterpret_cast<char*>(buffer), 4);
        fill = inputfile.gcount();

        // stupid utf8 bom
        if (fill > 2 && buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
        {
            index += 3;
            fill -= 3;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-8 BOM found" << std::endl;
#endif
            return utf8;
        }

        if (fill > 1 && buffer[0] == 0xfe && buffer[1] == 0xff)
        {
            index += 2;
            fill -= 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16BE BOM found" << std::endl;
#endif
            return utf16be;
        }

        if (fill > 1 && buffer[0] == 0xff && buffer[1] == 0xfe)
        {
            // FF FE 00 00 is either a UTF-32LE BOM or a UTF-16LE BOM followed
            // by U+0000; the former is assumed.
            if (fill == 4 && buffer[2] == 0x00 && buffer[3] == 0x00)
            {
                std::cerr << "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << std::endl;
                fill = 0;
                index = 0;
                return utf32le;
            }
            fill -= 2;
            index += 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16LE BOM found" << std::endl;
#endif
            return utf16le;
        }

        if (fill == 4 && buffer[0] == 0x00 && buffer[1] == 0x00 &&
            buffer[2] == 0xfe && buffer[3] == 0xff)
        {
            fill = 0;
            index = 0;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-32BE BOM found" << std::endl;
#endif
            return utf32be;
        }

        return utf8; // no valid bom so use utf8 as default
    }

    // Fetches the next input byte, serving bytes left over from BOM detection
    // before touching the stream. Returns 1 on success and 0 at end of input,
    // in which case `c` is left untouched.
    std::streamsize getByte(unsigned char &c)
    {
        if (fill)
        {
            index %= 4;
            --fill;
            c = buffer[index++];
            return 1;
        }
        inputfile.read(reinterpret_cast<char*>(&c), 1);
        return inputfile.gcount();
    }

    // Reads one 16-bit unit, little endian when `encoding` is utf16le and big
    // endian otherwise. Returns the number of bytes obtained (2 on success).
    std::streamsize getWord(unsigned short &w)
    {
        unsigned char c[2];
        const std::streamsize got = fetchBytes(c, 2);
        if (got != 2)
            return got;
        if (encoding == utf16le)
            w = static_cast<unsigned short>(c[0] | (c[1] << 8));
        else
            w = static_cast<unsigned short>(c[1] | (c[0] << 8));
        return 2;
    }

    // Reads one 32-bit unit, little endian when `encoding` is utf32le and big
    // endian otherwise. Returns the number of bytes obtained (4 on success);
    // `d` is only written on success.
    std::streamsize getDWord(wchar_t &d)
    {
        unsigned long value = 0;
        const std::streamsize got = getDWord32(value);
        if (got == 4)
            d = static_cast<wchar_t>(value);
        return got;
    }

    // Decodes the next character of the input.
    // Returns wchar_t(-1) at end of input or on malformed data. Where wchar_t
    // is only 16 bits wide, code points above U+FFFF do not fit the return type.
    wchar_t get_wchar_t()
    {
        unsigned long cp = 0;
        if (readCodePoint(cp) != cp_ok)
            return wchar_t(-1);
        return static_cast<wchar_t>(cp);
    }

    // Converts the remaining input to UTF-16LE and writes it to the output
    // file, preceded by a BOM when requested at construction time.
    // Code points above U+FFFF are written as surrogate pairs, malformed input
    // is replaced by U+FFFD (error becomes decode), and a failing output
    // stream stops the conversion (error becomes write).
    void convert2utf16le()
    {
        if (bom_type == bom)
            putUnit(0xfeff); // write BOM

        // End of input is taken from the decoder itself, not from the stream's
        // eof flag: that flag is already raised by the BOM probe for inputs
        // shorter than four bytes, while their bytes still wait in `buffer`.
        unsigned long cp = 0;
        while (outputfile)
        {
            const read_status status = readCodePoint(cp);
            if (status == cp_end)
                break;
            if (status == cp_bad)
            {
                error = decode;
                cp = 0xfffd; // REPLACEMENT CHARACTER
            }

            if (cp > 0xffff)
            {
                const unsigned long offset = cp - 0x10000;
                putUnit(0xd800 | (offset >> 10));   // high surrogate
                putUnit(0xdc00 | (offset & 0x3ff)); // low surrogate
            }
            else
            {
                putUnit(cp);
            }
        }

        outputfile.flush();
        if (!outputfile)
            error = write;
    }

    // Closes whichever files are still open.
    ~utf_converter()
    {
        if (inputfile.is_open())
            inputfile.close();
        if (outputfile.is_open())
            outputfile.close();
    }
};

// Lookup table indexed by the low six bits of a UTF-8 lead byte (0xC0..0xFF).
// Each entry packs the number of continuation bytes into its lower 3 bits and
// the payload bits carried by the lead byte into its upper 5 bits.
const unsigned char utf_converter::utf8table[64] = {
    1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
    129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,
    2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
    3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7
};
273c2c66affSColin Finck
274c2c66affSColin Finck
main(int argc,char * argv[])275c2c66affSColin Finck int main(int argc, char* argv[])
276c2c66affSColin Finck {
277c2c66affSColin Finck utf_converter::err_types err;
278c2c66affSColin Finck
279c2c66affSColin Finck if (argc < 3)
280c2c66affSColin Finck {
281c2c66affSColin Finck cout << "usage: " << argv[0] << " inputfile outputfile" << endl;
282c2c66affSColin Finck return -1;
283c2c66affSColin Finck }
284c2c66affSColin Finck
285c2c66affSColin Finck utf_converter::bom_types bom_type = utf_converter::bom;
286c2c66affSColin Finck
287c2c66affSColin Finck if (argc == 4 && strcasecmp(argv[3], "nobom") == 0)
288c2c66affSColin Finck {
289c2c66affSColin Finck bom_type = utf_converter::nobom;
290c2c66affSColin Finck }
291c2c66affSColin Finck
292c2c66affSColin Finck utf_converter conv(argv[1], argv[2], bom_type);
293c2c66affSColin Finck
294c2c66affSColin Finck if ((err = conv.getError())!=utf_converter::none)
295c2c66affSColin Finck {
296c2c66affSColin Finck switch (err)
297c2c66affSColin Finck {
298c2c66affSColin Finck case utf_converter::iopen:
299c2c66affSColin Finck cerr << "Couldn't open input file." << endl;
300c2c66affSColin Finck break;
301c2c66affSColin Finck case utf_converter::oopen:
302c2c66affSColin Finck cerr << "Couldn't open output file." << endl;
303c2c66affSColin Finck break;
304c2c66affSColin Finck default:
305c2c66affSColin Finck cerr << "Unknown error." << endl;
306c2c66affSColin Finck }
307c2c66affSColin Finck return -1;
308c2c66affSColin Finck }
309c2c66affSColin Finck else
310c2c66affSColin Finck {
311c2c66affSColin Finck conv.convert2utf16le();
312c2c66affSColin Finck }
313c2c66affSColin Finck
314c2c66affSColin Finck return 0;
315c2c66affSColin Finck }
316