1 #include "to_utf8.hpp"
2
3 #include <vector>
4 #include <cassert>
5 #include <stdexcept>
6
7 #include <components/debug/debuglog.hpp>
8
/* This file contains the code to translate from WINDOWS-1252 (native
charset used in the English version of Morrowind) to UTF-8. The library
is designed to be extended to support more source encodings later,
which means that we may add support for Russian, Polish and Chinese
files and so on.
14
15 The code does not depend on any external library at
16 runtime. Instead, it uses a pregenerated table made with iconv (see
17 gen_iconv.cpp and the Makefile) which is located in tables_gen.hpp.
18
This is both faster and uses fewer dependencies. The tables would
only need to be regenerated if we are adding support for more input
encodings. As such, there is no need to make the generator code
platform independent.
23
24 The library is optimized for the case of pure ASCII input strings,
25 which is the vast majority of cases at least for the English
26 version. A test of my version of Morrowind.esm got 130 non-ASCII vs
27 236195 ASCII strings, or less than 0.06% of strings containing
28 non-ASCII characters.
29
To optimize for this, if the first pass of the string does not find
any non-ASCII characters, the entire string is passed along without
any modification.
33
34 Most of the non-ASCII strings are books, and are quite large. (The
35 non-ASCII characters are typically starting and ending quotation
36 marks.) Within these, almost all the characters are ASCII. For this
37 purpose, the library is also optimized for mostly-ASCII contents
38 even in the cases where some conversion is necessary.
39 */
40
41
42 // Generated tables
43 #include "tables_gen.hpp"
44
45 using namespace ToUTF8;
46
Utf8Encoder(const FromType sourceEncoding)47 Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
48 mOutput(50*1024)
49 {
50 switch (sourceEncoding)
51 {
52 case ToUTF8::WINDOWS_1252:
53 {
54 translationArray = ToUTF8::windows_1252;
55 break;
56 }
57 case ToUTF8::WINDOWS_1250:
58 {
59 translationArray = ToUTF8::windows_1250;
60 break;
61 }
62 case ToUTF8::WINDOWS_1251:
63 {
64 translationArray = ToUTF8::windows_1251;
65 break;
66 }
67 case ToUTF8::CP437:
68 {
69 translationArray = ToUTF8::cp437;
70 break;
71 }
72
73 default:
74 {
75 assert(0);
76 }
77 }
78 }
79
getUtf8(const char * input,size_t size)80 std::string Utf8Encoder::getUtf8(const char* input, size_t size)
81 {
82 // Double check that the input string stops at some point (it might
83 // contain zero terminators before this, inside its own data, which
84 // is also ok.)
85 assert(input[size] == 0);
86
87 // Note: The rest of this function is designed for single-character
88 // input encodings only. It also assumes that the input encoding
89 // shares its first 128 values (0-127) with ASCII. There are no plans
90 // to add more encodings to this module (we are using utf8 for new
91 // content files), so that shouldn't be an issue.
92
93 // Compute output length, and check for pure ascii input at the same
94 // time.
95 bool ascii;
96 size_t outlen = getLength(input, ascii);
97
98 // If we're pure ascii, then don't bother converting anything.
99 if(ascii)
100 return std::string(input, outlen);
101
102 // Make sure the output is large enough
103 resize(outlen);
104 char *out = &mOutput[0];
105
106 // Translate
107 while (*input)
108 copyFromArray(*(input++), out);
109
110 // Make sure that we wrote the correct number of bytes
111 assert((out-&mOutput[0]) == (int)outlen);
112
113 // And make extra sure the output is null terminated
114 assert(mOutput.size() > outlen);
115 assert(mOutput[outlen] == 0);
116
117 // Return a string
118 return std::string(&mOutput[0], outlen);
119 }
120
getLegacyEnc(const char * input,size_t size)121 std::string Utf8Encoder::getLegacyEnc(const char *input, size_t size)
122 {
123 // Double check that the input string stops at some point (it might
124 // contain zero terminators before this, inside its own data, which
125 // is also ok.)
126 assert(input[size] == 0);
127
128 // TODO: The rest of this function is designed for single-character
129 // input encodings only. It also assumes that the input the input
130 // encoding shares its first 128 values (0-127) with ASCII. These
131 // conditions must be checked again if you add more input encodings
132 // later.
133
134 // Compute output length, and check for pure ascii input at the same
135 // time.
136 bool ascii;
137 size_t outlen = getLength2(input, ascii);
138
139 // If we're pure ascii, then don't bother converting anything.
140 if(ascii)
141 return std::string(input, outlen);
142
143 // Make sure the output is large enough
144 resize(outlen);
145 char *out = &mOutput[0];
146
147 // Translate
148 while(*input)
149 copyFromArray2(input, out);
150
151 // Make sure that we wrote the correct number of bytes
152 assert((out-&mOutput[0]) == (int)outlen);
153
154 // And make extra sure the output is null terminated
155 assert(mOutput.size() > outlen);
156 assert(mOutput[outlen] == 0);
157
158 // Return a string
159 return std::string(&mOutput[0], outlen);
160 }
161
162 // Make sure the output vector is large enough for 'size' bytes,
163 // including a terminating zero after it.
resize(size_t size)164 void Utf8Encoder::resize(size_t size)
165 {
166 if (mOutput.size() <= size)
167 // Add some extra padding to reduce the chance of having to resize
168 // again later.
169 mOutput.resize(3*size);
170
171 // And make sure the string is zero terminated
172 mOutput[size] = 0;
173 }
174
175 /** Get the total length length needed to decode the given string with
176 the given translation array. The arrays are encoded with 6 bytes
177 per character, with the first giving the length and the next 5 the
178 actual data.
179
180 The function serves a dual purpose for optimization reasons: it
181 checks if the input is pure ascii (all values are <= 127). If this
182 is the case, then the ascii parameter is set to true, and the
183 caller can optimize for this case.
184 */
getLength(const char * input,bool & ascii)185 size_t Utf8Encoder::getLength(const char* input, bool &ascii)
186 {
187 ascii = true;
188 size_t len = 0;
189 const char* ptr = input;
190 unsigned char inp = *ptr;
191
192 // Do away with the ascii part of the string first (this is almost
193 // always the entire string.)
194 while (inp && inp < 128)
195 inp = *(++ptr);
196 len += (ptr-input);
197
198 // If we're not at the null terminator at this point, then there
199 // were some non-ascii characters to deal with. Go to slow-mode for
200 // the rest of the string.
201 if (inp)
202 {
203 ascii = false;
204 while (inp)
205 {
206 // Find the translated length of this character in the
207 // lookup table.
208 len += translationArray[inp*6];
209 inp = *(++ptr);
210 }
211 }
212 return len;
213 }
214
215 // Translate one character 'ch' using the translation array 'arr', and
216 // advance the output pointer accordingly.
copyFromArray(unsigned char ch,char * & out)217 void Utf8Encoder::copyFromArray(unsigned char ch, char* &out)
218 {
219 // Optimize for ASCII values
220 if (ch < 128)
221 {
222 *(out++) = ch;
223 return;
224 }
225
226 const signed char *in = translationArray + ch*6;
227 int len = *(in++);
228 for (int i=0; i<len; i++)
229 *(out++) = *(in++);
230 }
231
getLength2(const char * input,bool & ascii)232 size_t Utf8Encoder::getLength2(const char* input, bool &ascii)
233 {
234 ascii = true;
235 size_t len = 0;
236 const char* ptr = input;
237 unsigned char inp = *ptr;
238
239 // Do away with the ascii part of the string first (this is almost
240 // always the entire string.)
241 while (inp && inp < 128)
242 inp = *(++ptr);
243 len += (ptr-input);
244
245 // If we're not at the null terminator at this point, then there
246 // were some non-ascii characters to deal with. Go to slow-mode for
247 // the rest of the string.
248 if (inp)
249 {
250 ascii = false;
251 while(inp)
252 {
253 len += 1;
254 // Find the translated length of this character in the
255 // lookup table.
256 switch(inp)
257 {
258 case 0xe2: len -= 2; break;
259 case 0xc2:
260 case 0xcb:
261 case 0xc4:
262 case 0xc6:
263 case 0xc3:
264 case 0xd0:
265 case 0xd1:
266 case 0xd2:
267 case 0xc5: len -= 1; break;
268 }
269
270 inp = *(++ptr);
271 }
272 }
273 return len;
274 }
275
copyFromArray2(const char * & chp,char * & out)276 void Utf8Encoder::copyFromArray2(const char*& chp, char* &out)
277 {
278 unsigned char ch = *(chp++);
279 // Optimize for ASCII values
280 if (ch < 128)
281 {
282 *(out++) = ch;
283 return;
284 }
285
286 int len = 1;
287 switch (ch)
288 {
289 case 0xe2: len = 3; break;
290 case 0xc2:
291 case 0xcb:
292 case 0xc4:
293 case 0xc6:
294 case 0xc3:
295 case 0xd0:
296 case 0xd1:
297 case 0xd2:
298 case 0xc5: len = 2; break;
299 }
300
301 if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space)
302 {
303 *(out++) = ch;
304 return;
305 }
306
307 unsigned char ch2 = *(chp++);
308 unsigned char ch3 = '\0';
309 if (len == 3)
310 ch3 = *(chp++);
311
312 for (int i = 128; i < 256; i++)
313 {
314 unsigned char b1 = translationArray[i*6 + 1], b2 = translationArray[i*6 + 2], b3 = translationArray[i*6 + 3];
315 if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
316 {
317 *(out++) = (char)i;
318 return;
319 }
320 }
321
322 Log(Debug::Info) << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3;
323
324 *(out++) = ch; // Could not find glyph, just put whatever
325 }
326
calculateEncoding(const std::string & encodingName)327 ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
328 {
329 if (encodingName == "win1250")
330 return ToUTF8::WINDOWS_1250;
331 else if (encodingName == "win1251")
332 return ToUTF8::WINDOWS_1251;
333 else if (encodingName == "win1252")
334 return ToUTF8::WINDOWS_1252;
335 else
336 throw std::runtime_error(std::string("Unknown encoding '") + encodingName + std::string("', see openmw --help for available options."));
337 }
338
encodingUsingMessage(const std::string & encodingName)339 std::string ToUTF8::encodingUsingMessage(const std::string& encodingName)
340 {
341 if (encodingName == "win1250")
342 return "Using Central and Eastern European font encoding.";
343 else if (encodingName == "win1251")
344 return "Using Cyrillic font encoding.";
345 else if (encodingName == "win1252")
346 return "Using default (English) font encoding.";
347 else
348 throw std::runtime_error(std::string("Unknown encoding '") + encodingName + std::string("', see openmw --help for available options."));
349 }
350