1 #include "to_utf8.hpp"
2 
3 #include <vector>
4 #include <cassert>
5 #include <stdexcept>
6 
7 #include <components/debug/debuglog.hpp>
8 
9 /* This file contains the code to translate from WINDOWS-1252 (native
10    charset used in English version of Morrowind) to UTF-8. The library
   is designed to be extended to support more source encodings later,
12    which means that we may add support for Russian, Polish and Chinese
13    files and so on.
14 
15    The code does not depend on any external library at
16    runtime. Instead, it uses a pregenerated table made with iconv (see
17    gen_iconv.cpp and the Makefile) which is located in tables_gen.hpp.
18 
   This is both faster and uses fewer dependencies. The tables would
   only need to be regenerated if we are adding support for more input
21    encodings. As such, there is no need to make the generator code
22    platform independent.
23 
24    The library is optimized for the case of pure ASCII input strings,
25    which is the vast majority of cases at least for the English
26    version. A test of my version of Morrowind.esm got 130 non-ASCII vs
27    236195 ASCII strings, or less than 0.06% of strings containing
28    non-ASCII characters.
29 
   To optimize for this, if the first pass of the string does not find
31    any non-ASCII characters, the entire string is passed along without
32    any modification.
33 
34    Most of the non-ASCII strings are books, and are quite large. (The
35    non-ASCII characters are typically starting and ending quotation
36    marks.) Within these, almost all the characters are ASCII. For this
37    purpose, the library is also optimized for mostly-ASCII contents
38    even in the cases where some conversion is necessary.
39  */
40 
41 
42 // Generated tables
43 #include "tables_gen.hpp"
44 
45 using namespace ToUTF8;
46 
Utf8Encoder(const FromType sourceEncoding)47 Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
48     mOutput(50*1024)
49 {
50     switch (sourceEncoding)
51     {
52         case ToUTF8::WINDOWS_1252:
53         {
54             translationArray = ToUTF8::windows_1252;
55             break;
56         }
57         case ToUTF8::WINDOWS_1250:
58         {
59             translationArray = ToUTF8::windows_1250;
60             break;
61         }
62         case ToUTF8::WINDOWS_1251:
63         {
64             translationArray = ToUTF8::windows_1251;
65             break;
66         }
67         case ToUTF8::CP437:
68         {
69             translationArray = ToUTF8::cp437;
70             break;
71         }
72 
73         default:
74         {
75             assert(0);
76         }
77     }
78 }
79 
getUtf8(const char * input,size_t size)80 std::string Utf8Encoder::getUtf8(const char* input, size_t size)
81 {
82     // Double check that the input string stops at some point (it might
83     // contain zero terminators before this, inside its own data, which
84     // is also ok.)
85     assert(input[size] == 0);
86 
87     // Note: The rest of this function is designed for single-character
88     // input encodings only. It also assumes that the input encoding
89     // shares its first 128 values (0-127) with ASCII. There are no plans
90     // to add more encodings to this module (we are using utf8 for new
91     // content files), so that shouldn't be an issue.
92 
93     // Compute output length, and check for pure ascii input at the same
94     // time.
95     bool ascii;
96     size_t outlen = getLength(input, ascii);
97 
98     // If we're pure ascii, then don't bother converting anything.
99     if(ascii)
100         return std::string(input, outlen);
101 
102     // Make sure the output is large enough
103     resize(outlen);
104     char *out = &mOutput[0];
105 
106     // Translate
107     while (*input)
108         copyFromArray(*(input++), out);
109 
110     // Make sure that we wrote the correct number of bytes
111     assert((out-&mOutput[0]) == (int)outlen);
112 
113     // And make extra sure the output is null terminated
114     assert(mOutput.size() > outlen);
115     assert(mOutput[outlen] == 0);
116 
117     // Return a string
118     return std::string(&mOutput[0], outlen);
119 }
120 
getLegacyEnc(const char * input,size_t size)121 std::string Utf8Encoder::getLegacyEnc(const char *input, size_t size)
122 {
123     // Double check that the input string stops at some point (it might
124     // contain zero terminators before this, inside its own data, which
125     // is also ok.)
126     assert(input[size] == 0);
127 
128     // TODO: The rest of this function is designed for single-character
129     // input encodings only. It also assumes that the input the input
130     // encoding shares its first 128 values (0-127) with ASCII. These
131     // conditions must be checked again if you add more input encodings
132     // later.
133 
134     // Compute output length, and check for pure ascii input at the same
135     // time.
136     bool ascii;
137     size_t outlen = getLength2(input, ascii);
138 
139     // If we're pure ascii, then don't bother converting anything.
140     if(ascii)
141         return std::string(input, outlen);
142 
143     // Make sure the output is large enough
144     resize(outlen);
145     char *out = &mOutput[0];
146 
147     // Translate
148     while(*input)
149         copyFromArray2(input, out);
150 
151     // Make sure that we wrote the correct number of bytes
152     assert((out-&mOutput[0]) == (int)outlen);
153 
154     // And make extra sure the output is null terminated
155     assert(mOutput.size() > outlen);
156     assert(mOutput[outlen] == 0);
157 
158     // Return a string
159     return std::string(&mOutput[0], outlen);
160 }
161 
162 // Make sure the output vector is large enough for 'size' bytes,
163 // including a terminating zero after it.
resize(size_t size)164 void Utf8Encoder::resize(size_t size)
165 {
166     if (mOutput.size() <= size)
167         // Add some extra padding to reduce the chance of having to resize
168         // again later.
169         mOutput.resize(3*size);
170 
171     // And make sure the string is zero terminated
172     mOutput[size] = 0;
173 }
174 
175 /** Get the total length length needed to decode the given string with
176   the given translation array. The arrays are encoded with 6 bytes
177   per character, with the first giving the length and the next 5 the
178   actual data.
179 
180   The function serves a dual purpose for optimization reasons: it
181   checks if the input is pure ascii (all values are <= 127). If this
182   is the case, then the ascii parameter is set to true, and the
183   caller can optimize for this case.
184  */
getLength(const char * input,bool & ascii)185 size_t Utf8Encoder::getLength(const char* input, bool &ascii)
186 {
187     ascii = true;
188     size_t len = 0;
189     const char* ptr = input;
190     unsigned char inp = *ptr;
191 
192     // Do away with the ascii part of the string first (this is almost
193     // always the entire string.)
194     while (inp && inp < 128)
195         inp = *(++ptr);
196     len += (ptr-input);
197 
198     // If we're not at the null terminator at this point, then there
199     // were some non-ascii characters to deal with. Go to slow-mode for
200     // the rest of the string.
201     if (inp)
202     {
203         ascii = false;
204         while (inp)
205         {
206             // Find the translated length of this character in the
207             // lookup table.
208             len += translationArray[inp*6];
209             inp = *(++ptr);
210         }
211     }
212     return len;
213 }
214 
215 // Translate one character 'ch' using the translation array 'arr', and
216 // advance the output pointer accordingly.
copyFromArray(unsigned char ch,char * & out)217 void Utf8Encoder::copyFromArray(unsigned char ch, char* &out)
218 {
219     // Optimize for ASCII values
220     if (ch < 128)
221     {
222         *(out++) = ch;
223         return;
224     }
225 
226     const signed char *in = translationArray + ch*6;
227     int len = *(in++);
228     for (int i=0; i<len; i++)
229         *(out++) = *(in++);
230 }
231 
getLength2(const char * input,bool & ascii)232 size_t Utf8Encoder::getLength2(const char* input, bool &ascii)
233 {
234     ascii = true;
235     size_t len = 0;
236     const char* ptr = input;
237     unsigned char inp = *ptr;
238 
239     // Do away with the ascii part of the string first (this is almost
240     // always the entire string.)
241     while (inp && inp < 128)
242         inp = *(++ptr);
243     len += (ptr-input);
244 
245     // If we're not at the null terminator at this point, then there
246     // were some non-ascii characters to deal with. Go to slow-mode for
247     // the rest of the string.
248     if (inp)
249     {
250         ascii = false;
251         while(inp)
252         {
253             len += 1;
254             // Find the translated length of this character in the
255             // lookup table.
256             switch(inp)
257             {
258                 case 0xe2: len -= 2; break;
259                 case 0xc2:
260                 case 0xcb:
261                 case 0xc4:
262                 case 0xc6:
263                 case 0xc3:
264                 case 0xd0:
265                 case 0xd1:
266                 case 0xd2:
267                 case 0xc5: len -= 1; break;
268             }
269 
270             inp = *(++ptr);
271         }
272     }
273     return len;
274 }
275 
copyFromArray2(const char * & chp,char * & out)276 void Utf8Encoder::copyFromArray2(const char*& chp, char* &out)
277 {
278     unsigned char ch = *(chp++);
279     // Optimize for ASCII values
280     if (ch < 128)
281     {
282         *(out++) = ch;
283         return;
284     }
285 
286     int len = 1;
287     switch (ch)
288     {
289         case 0xe2: len = 3; break;
290         case 0xc2:
291         case 0xcb:
292         case 0xc4:
293         case 0xc6:
294         case 0xc3:
295         case 0xd0:
296         case 0xd1:
297         case 0xd2:
298         case 0xc5: len = 2; break;
299     }
300 
301     if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space)
302     {
303         *(out++) = ch;
304         return;
305     }
306 
307     unsigned char ch2 = *(chp++);
308     unsigned char ch3 = '\0';
309     if (len == 3)
310         ch3 = *(chp++);
311 
312     for (int i = 128; i < 256; i++)
313     {
314         unsigned char b1 = translationArray[i*6 + 1], b2 = translationArray[i*6 + 2], b3 = translationArray[i*6 + 3];
315         if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
316         {
317             *(out++) = (char)i;
318             return;
319         }
320     }
321 
322     Log(Debug::Info) << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3;
323 
324     *(out++) = ch; // Could not find glyph, just put whatever
325 }
326 
calculateEncoding(const std::string & encodingName)327 ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
328 {
329     if (encodingName == "win1250")
330         return ToUTF8::WINDOWS_1250;
331     else if (encodingName == "win1251")
332         return ToUTF8::WINDOWS_1251;
333     else if (encodingName == "win1252")
334         return ToUTF8::WINDOWS_1252;
335     else
336         throw std::runtime_error(std::string("Unknown encoding '") + encodingName + std::string("', see openmw --help for available options."));
337 }
338 
encodingUsingMessage(const std::string & encodingName)339 std::string ToUTF8::encodingUsingMessage(const std::string& encodingName)
340 {
341     if (encodingName == "win1250")
342         return "Using Central and Eastern European font encoding.";
343     else if (encodingName == "win1251")
344         return "Using Cyrillic font encoding.";
345     else if (encodingName == "win1252")
346         return "Using default (English) font encoding.";
347     else
348         throw std::runtime_error(std::string("Unknown encoding '") + encodingName + std::string("', see openmw --help for available options."));
349 }
350