1 /*
2 	OpenLieroX
3 
4 	UTF8/Unicode conversions
5 
6 	code under LGPL
7 	created 01-05-2007
8 	by Albert Zeyer and Dark Charlie
9 */
10 
11 #ifdef _MSC_VER
12 #pragma warning(disable: 4786)  // WARNING: identifier XXX was truncated to 255 characters in the debug info
13 #pragma warning(disable: 4503)  // WARNING: decorated name length exceeded, name was truncated
14 #endif
15 
16 #include "Unicode.h"
17 #include "MathLib.h" // for SIGN
18 #include "StringUtils.h"
19 
20 
21 // Table used for removing diacritics and other backward incompatible characters
22 ConversionItem tConversionTable[] = {
23 	{ 0x84, {0xE2, 0x80, 0x80, 0x00}, '"'},
24 	{ 0x93, {0xE2, 0x80, 0x80, 0x80}, '"'},
25 	{ 0x96, {0xC2, 0x96, 0x00, 0x00}, '-'},
26 	{ 0xA0, {0xC2, 0xA0, 0x00, 0x00}, ' '},
27 	{ 0xA1, {0xC2, 0xA1, 0x00, 0x00}, '!'},
28 	{ 0xA2, {0xC2, 0xA2, 0x00, 0x00}, 'c'},
29 	{ 0xA3, {0xC2, 0xA3, 0x00, 0x00}, 'L'},
30 	{ 0xA4, {0xC2, 0xA4, 0x00, 0x00}, 'o'},
31 	{ 0xA5, {0xC2, 0xA5, 0x00, 0x00}, 'Y'},
32 	{ 0xA6, {0xC2, 0xA6, 0x00, 0x00}, '|'},
33 	{ 0xA7, {0xC2, 0xA7, 0x00, 0x00}, '$'},
34 	{ 0xA8, {0xC2, 0xA8, 0x00, 0x00}, ' '},
35 	{ 0xA9, {0xC2, 0xA9, 0x00, 0x00}, 'c'},
36 	{ 0xAA, {0xC2, 0xAA, 0x00, 0x00}, 'a'},
37 	{ 0xAB, {0xC2, 0xAB, 0x00, 0x00}, '<'},
38 	{ 0xAC, {0xC2, 0xAC, 0x00, 0x00}, '-'},
39 	{ 0xAD, {0xC2, 0xAD, 0x00, 0x00}, '-'},
40 	{ 0xAE, {0xC2, 0xAE, 0x00, 0x00}, 'r'},
41 	{ 0xAF, {0xC2, 0xAF, 0x00, 0x00}, '-'},
42 	{ 0xB0, {0xC2, 0xB0, 0x00, 0x00}, '*'},
43 	{ 0xB1, {0xC2, 0xB1, 0x00, 0x00}, '+'},
44 	{ 0xB2, {0xC2, 0xB2, 0x00, 0x00}, '2'},
45 	{ 0xB3, {0xC2, 0xB3, 0x00, 0x00}, '3'},
46 	{ 0xB4, {0xC2, 0xB4, 0x00, 0x00}, ' '},
47 	{ 0xB5, {0xC2, 0xB5, 0x00, 0x00}, 'u'},
48 	{ 0xB6, {0xC2, 0xB6, 0x00, 0x00}, 'P'},
49 	{ 0xB7, {0xC2, 0xB7, 0x00, 0x00}, '.'},
50 	{ 0xB8, {0xC2, 0xB8, 0x00, 0x00}, ','},
51 	{ 0xB9, {0xC2, 0xB9, 0x00, 0x00}, '1'},
52 	{ 0xBA, {0xC2, 0xBA, 0x00, 0x00}, '0'},
53 	{ 0xBB, {0xC2, 0xBB, 0x00, 0x00}, '>'},
54 	{ 0xBC, {0xC2, 0xBC, 0x00, 0x00}, '4'},
55 	{ 0xBD, {0xC2, 0xBD, 0x00, 0x00}, '2'},
56 	{ 0xBE, {0xC2, 0xBE, 0x00, 0x00}, '4'},
57 	{ 0xBF, {0xC2, 0xBF, 0x00, 0x00}, '?'},
58 	{ 0xC0, {0xC3, 0x80, 0x00, 0x00}, 'A'},
59 	{ 0xC1, {0xC3, 0x81, 0x00, 0x00}, 'A'},
60 	{ 0xC2, {0xC3, 0x82, 0x00, 0x00}, 'A'},
61 	{ 0xC3, {0xC3, 0x83, 0x00, 0x00}, 'A'},
62 	{ 0xC4, {0xC3, 0x84, 0x00, 0x00}, 'A'},
63 	{ 0xC5, {0xC3, 0x85, 0x00, 0x00}, 'A'},
64 	{ 0xC6, {0xC3, 0x86, 0x00, 0x00}, 'A'},
65 	{ 0xC7, {0xC3, 0x87, 0x00, 0x00}, 'C'},
66 	{ 0xC8, {0xC3, 0x88, 0x00, 0x00}, 'E'},
67 	{ 0xC9, {0xC3, 0x89, 0x00, 0x00}, 'E'},
68 	{ 0xCA, {0xC3, 0x8A, 0x00, 0x00}, 'E'},
69 	{ 0xCB, {0xC3, 0x8B, 0x00, 0x00}, 'E'},
70 	{ 0xCC, {0xC3, 0x8C, 0x00, 0x00}, 'I'},
71 	{ 0xCD, {0xC3, 0x8D, 0x00, 0x00}, 'I'},
72 	{ 0xCE, {0xC3, 0x8E, 0x00, 0x00}, 'I'},
73 	{ 0xCF, {0xC3, 0x8F, 0x00, 0x00}, 'I'},
74 	{ 0xD0, {0xC3, 0x90, 0x00, 0x00}, 'D'},
75 	{ 0xD1, {0xC3, 0x91, 0x00, 0x00}, 'N'},
76 	{ 0xD2, {0xC3, 0x92, 0x00, 0x00}, 'O'},
77 	{ 0xD3, {0xC3, 0x93, 0x00, 0x00}, 'O'},
78 	{ 0xD4, {0xC3, 0x94, 0x00, 0x00}, 'O'},
79 	{ 0xD5, {0xC3, 0x95, 0x00, 0x00}, 'O'},
80 	{ 0xD6, {0xC3, 0x96, 0x00, 0x00}, 'O'},
81 	{ 0xD7, {0xC3, 0x97, 0x00, 0x00}, 'X'},
82 	{ 0xD8, {0xC3, 0x98, 0x00, 0x00}, 'O'},
83 	{ 0xD9, {0xC3, 0x99, 0x00, 0x00}, 'U'},
84 	{ 0xDA, {0xC3, 0x9A, 0x00, 0x00}, 'U'},
85 	{ 0xDB, {0xC3, 0x9B, 0x00, 0x00}, 'U'},
86 	{ 0xDC, {0xC3, 0x9C, 0x00, 0x00}, 'U'},
87 	{ 0xDD, {0xC3, 0x9D, 0x00, 0x00}, 'Y'},
88 	{ 0xDE, {0xC3, 0x9E, 0x00, 0x00}, 'b'},
89 	{ 0xDF, {0xC3, 0x9F, 0x00, 0x00}, 'S'},
90 	{ 0xE0, {0xC3, 0xA0, 0x00, 0x00}, 'a'},
91 	{ 0xE1, {0xC3, 0xA1, 0x00, 0x00}, 'a'},
92 	{ 0xE2, {0xC3, 0xA2, 0x00, 0x00}, 'a'},
93 	{ 0xE3, {0xC3, 0xA3, 0x00, 0x00}, 'a'},
94 	{ 0xE4, {0xC3, 0xA4, 0x00, 0x00}, 'a'},
95 	{ 0xE5, {0xC3, 0xA5, 0x00, 0x00}, 'a'},
96 	{ 0xE6, {0xC3, 0xA6, 0x00, 0x00}, 'a'},
97 	{ 0xE7, {0xC3, 0xA7, 0x00, 0x00}, 'c'},
98 	{ 0xE8, {0xC3, 0xA8, 0x00, 0x00}, 'e'},
99 	{ 0xE9, {0xC3, 0xA9, 0x00, 0x00}, 'e'},
100 	{ 0xEA, {0xC3, 0xAA, 0x00, 0x00}, 'e'},
101 	{ 0xEB, {0xC3, 0xAB, 0x00, 0x00}, 'e'},
102 	{ 0xEC, {0xC3, 0xAC, 0x00, 0x00}, 'i'},
103 	{ 0xED, {0xC3, 0xAD, 0x00, 0x00}, 'i'},
104 	{ 0xEE, {0xC3, 0xAE, 0x00, 0x00}, 'i'},
105 	{ 0xEF, {0xC3, 0xAF, 0x00, 0x00}, 'i'},
106 	{ 0xF0, {0xC3, 0xB0, 0x00, 0x00}, 'd'},
107 	{ 0xF1, {0xC3, 0xB1, 0x00, 0x00}, 'n'},
108 	{ 0xF2, {0xC3, 0xB2, 0x00, 0x00}, 'o'},
109 	{ 0xF3, {0xC3, 0xB3, 0x00, 0x00}, 'o'},
110 	{ 0xF4, {0xC3, 0xB4, 0x00, 0x00}, 'o'},
111 	{ 0xF5, {0xC3, 0xB5, 0x00, 0x00}, 'o'},
112 	{ 0xF6, {0xC3, 0xB6, 0x00, 0x00}, 'o'},
113 	{ 0xF7, {0xC3, 0xB7, 0x00, 0x00}, '/'},
114 	{ 0xF8, {0xC3, 0xB8, 0x00, 0x00}, 'o'},
115 	{ 0xF9, {0xC3, 0xB9, 0x00, 0x00}, 'u'},
116 	{ 0xFA, {0xC3, 0xBA, 0x00, 0x00}, 'u'},
117 	{ 0xFB, {0xC3, 0xBB, 0x00, 0x00}, 'u'},
118 	{ 0xFC, {0xC3, 0xBC, 0x00, 0x00}, 'u'},
119 	{ 0xFD, {0xC3, 0xBD, 0x00, 0x00}, 'y'},
120 	{ 0xFE, {0xC3, 0xBE, 0x00, 0x00}, 'b'},
121 	{ 0xFF, {0xC3, 0xBF, 0x00, 0x00}, 'y'},
122 
123 	{ 0x0100, {0xC4, 0x80, 0x00, 0x00}, 'A'},
124 	{ 0x0101, {0xC4, 0x81, 0x00, 0x00}, 'a'},
125 	{ 0x0102, {0xC4, 0x82, 0x00, 0x00}, 'A'},
126 	{ 0x0103, {0xC4, 0x83, 0x00, 0x00}, 'a'},
127 	{ 0x0104, {0xC4, 0x84, 0x00, 0x00}, 'A'},
128 	{ 0x0105, {0xC4, 0x85, 0x00, 0x00}, 'a'},
129 	{ 0x0106, {0xC4, 0x86, 0x00, 0x00}, 'C'},
130 	{ 0x0107, {0xC4, 0x87, 0x00, 0x00}, 'c'},
131 	{ 0x0108, {0xC4, 0x88, 0x00, 0x00}, 'C'},
132 	{ 0x0109, {0xC4, 0x89, 0x00, 0x00}, 'c'},
133 	{ 0x010A, {0xC4, 0x8A, 0x00, 0x00}, 'C'},
134 	{ 0x010B, {0xC4, 0x8B, 0x00, 0x00}, 'c'},
135 	{ 0x010C, {0xC4, 0x8C, 0x00, 0x00}, 'C'},
136 	{ 0x010D, {0xC4, 0x8D, 0x00, 0x00}, 'c'},
137 	{ 0x010E, {0xC4, 0x8E, 0x00, 0x00}, 'D'},
138 	{ 0x010F, {0xC4, 0x8F, 0x00, 0x00}, 'd'},
139 	{ 0x0110, {0xC4, 0x90, 0x00, 0x00}, 'D'},
140 	{ 0x0111, {0xC4, 0x91, 0x00, 0x00}, 'd'},
141 	{ 0x0112, {0xC4, 0x92, 0x00, 0x00}, 'E'},
142 	{ 0x0113, {0xC4, 0x93, 0x00, 0x00}, 'e'},
143 	{ 0x0114, {0xC4, 0x94, 0x00, 0x00}, 'E'},
144 	{ 0x0115, {0xC4, 0x95, 0x00, 0x00}, 'e'},
145 	{ 0x0116, {0xC4, 0x96, 0x00, 0x00}, 'E'},
146 	{ 0x0117, {0xC4, 0x97, 0x00, 0x00}, 'e'},
147 	{ 0x0118, {0xC4, 0x98, 0x00, 0x00}, 'E'},
148 	{ 0x0119, {0xC4, 0x99, 0x00, 0x00}, 'e'},
149 	{ 0x011A, {0xC4, 0x9A, 0x00, 0x00}, 'E'},
150 	{ 0x011B, {0xC4, 0x9B, 0x00, 0x00}, 'e'},
151 	{ 0x011C, {0xC4, 0x9C, 0x00, 0x00}, 'G'},
152 	{ 0x011D, {0xC4, 0x9D, 0x00, 0x00}, 'g'},
153 	{ 0x011E, {0xC4, 0x9E, 0x00, 0x00}, 'G'},
154 	{ 0x011F, {0xC4, 0x9F, 0x00, 0x00}, 'g'},
155 	{ 0x0120, {0xC4, 0xA0, 0x00, 0x00}, 'G'},
156 	{ 0x0121, {0xC4, 0xA1, 0x00, 0x00}, 'g'},
157 	{ 0x0122, {0xC4, 0xA2, 0x00, 0x00}, 'G'},
158 	{ 0x0123, {0xC4, 0xA3, 0x00, 0x00}, 'g'},
159 	{ 0x0124, {0xC4, 0xA4, 0x00, 0x00}, 'H'},
160 	{ 0x0125, {0xC4, 0xA5, 0x00, 0x00}, 'h'},
161 	{ 0x0126, {0xC4, 0xA6, 0x00, 0x00}, 'H'},
162 	{ 0x0127, {0xC4, 0xA7, 0x00, 0x00}, 'h'},
163 	{ 0x0128, {0xC4, 0xA8, 0x00, 0x00}, 'I'},
164 	{ 0x0129, {0xC4, 0xA9, 0x00, 0x00}, 'i'},
165 	{ 0x012A, {0xC4, 0xAA, 0x00, 0x00}, 'I'},
166 	{ 0x012B, {0xC4, 0xAB, 0x00, 0x00}, 'i'},
167 	{ 0x012C, {0xC4, 0xAC, 0x00, 0x00}, 'I'},
168 	{ 0x012D, {0xC4, 0xAD, 0x00, 0x00}, 'i'},
169 	{ 0x012E, {0xC4, 0xAE, 0x00, 0x00}, 'I'},
170 	{ 0x012F, {0xC4, 0xAF, 0x00, 0x00}, 'i'},
171 	{ 0x0130, {0xC4, 0xB0, 0x00, 0x00}, 'I'},
172 	{ 0x0131, {0xC4, 0xB1, 0x00, 0x00}, 'i'},
173 	{ 0x0132, {0xC4, 0xB2, 0x00, 0x00}, 'I'},
174 	{ 0x0133, {0xC4, 0xB3, 0x00, 0x00}, 'i'},
175 	{ 0x0134, {0xC4, 0xB4, 0x00, 0x00}, 'J'},
176 	{ 0x0135, {0xC4, 0xB5, 0x00, 0x00}, 'j'},
177 	{ 0x0136, {0xC4, 0xB6, 0x00, 0x00}, 'K'},
178 	{ 0x0137, {0xC4, 0xB7, 0x00, 0x00}, 'k'},
179 	{ 0x0138, {0xC4, 0xB8, 0x00, 0x00}, 'k'},
180 	{ 0x0139, {0xC4, 0xB9, 0x00, 0x00}, 'L'},
181 	{ 0x013A, {0xC4, 0xBA, 0x00, 0x00}, 'l'},
182 	{ 0x013B, {0xC4, 0xBB, 0x00, 0x00}, 'L'},
183 	{ 0x013C, {0xC4, 0xBC, 0x00, 0x00}, 'l'},
184 	{ 0x013D, {0xC4, 0xBD, 0x00, 0x00}, 'L'},
185 	{ 0x013E, {0xC4, 0xBE, 0x00, 0x00}, 'l'},
186 	{ 0x013F, {0xC4, 0xBF, 0x00, 0x00}, 'L'},
187 
188 	{ 0x0140, {0xC5, 0x80, 0x00, 0x00}, 'l'},
189 	{ 0x0141, {0xC5, 0x81, 0x00, 0x00}, 'L'},
190 	{ 0x0142, {0xC5, 0x82, 0x00, 0x00}, 'l'},
191 	{ 0x0143, {0xC5, 0x83, 0x00, 0x00}, 'N'},
192 	{ 0x0144, {0xC5, 0x84, 0x00, 0x00}, 'n'},
193 	{ 0x0145, {0xC5, 0x85, 0x00, 0x00}, 'N'},
194 	{ 0x0146, {0xC5, 0x86, 0x00, 0x00}, 'n'},
195 	{ 0x0147, {0xC5, 0x87, 0x00, 0x00}, 'N'},
196 	{ 0x0148, {0xC5, 0x88, 0x00, 0x00}, 'n'},
197 	{ 0x0149, {0xC5, 0x89, 0x00, 0x00}, 'n'},
198 	{ 0x014A, {0xC5, 0x8A, 0x00, 0x00}, 'N'},
199 	{ 0x014B, {0xC5, 0x8B, 0x00, 0x00}, 'n'},
200 	{ 0x014C, {0xC5, 0x8C, 0x00, 0x00}, 'O'},
201 	{ 0x014D, {0xC5, 0x8D, 0x00, 0x00}, 'o'},
202 	{ 0x014E, {0xC5, 0x8E, 0x00, 0x00}, 'O'},
203 	{ 0x014F, {0xC5, 0x8F, 0x00, 0x00}, 'o'},
204 	{ 0x0150, {0xC5, 0x90, 0x00, 0x00}, 'O'},
205 	{ 0x0151, {0xC5, 0x91, 0x00, 0x00}, 'o'},
206 	{ 0x0152, {0xC5, 0x92, 0x00, 0x00}, 'O'},
207 	{ 0x0153, {0xC5, 0x93, 0x00, 0x00}, 'o'},
208 	{ 0x0154, {0xC5, 0x94, 0x00, 0x00}, 'R'},
209 	{ 0x0155, {0xC5, 0x95, 0x00, 0x00}, 'r'},
210 	{ 0x0156, {0xC5, 0x96, 0x00, 0x00}, 'R'},
211 	{ 0x0157, {0xC5, 0x97, 0x00, 0x00}, 'r'},
212 	{ 0x0158, {0xC5, 0x98, 0x00, 0x00}, 'R'},
213 	{ 0x0159, {0xC5, 0x99, 0x00, 0x00}, 'r'},
214 	{ 0x015A, {0xC5, 0x9A, 0x00, 0x00}, 'S'},
215 	{ 0x015B, {0xC5, 0x9B, 0x00, 0x00}, 's'},
216 	{ 0x015C, {0xC5, 0x9C, 0x00, 0x00}, 'S'},
217 	{ 0x015D, {0xC5, 0x9D, 0x00, 0x00}, 's'},
218 	{ 0x015E, {0xC5, 0x9E, 0x00, 0x00}, 'S'},
219 	{ 0x015F, {0xC5, 0x9F, 0x00, 0x00}, 's'},
220 	{ 0x0160, {0xC5, 0xA0, 0x00, 0x00}, 'S'},
221 	{ 0x0161, {0xC5, 0xA1, 0x00, 0x00}, 's'},
222 	{ 0x0162, {0xC5, 0xA2, 0x00, 0x00}, 'T'},
223 	{ 0x0163, {0xC5, 0xA3, 0x00, 0x00}, 't'},
224 	{ 0x0164, {0xC5, 0xA4, 0x00, 0x00}, 'T'},
225 	{ 0x0165, {0xC5, 0xA5, 0x00, 0x00}, 't'},
226 	{ 0x0166, {0xC5, 0xA6, 0x00, 0x00}, 'T'},
227 	{ 0x0167, {0xC5, 0xA7, 0x00, 0x00}, 't'},
228 	{ 0x0168, {0xC5, 0xA8, 0x00, 0x00}, 'U'},
229 	{ 0x0169, {0xC5, 0xA9, 0x00, 0x00}, 'u'},
230 	{ 0x016A, {0xC5, 0xAA, 0x00, 0x00}, 'U'},
231 	{ 0x016B, {0xC5, 0xAB, 0x00, 0x00}, 'u'},
232 	{ 0x016C, {0xC5, 0xAC, 0x00, 0x00}, 'U'},
233 	{ 0x016D, {0xC5, 0xAD, 0x00, 0x00}, 'u'},
234 	{ 0x016E, {0xC5, 0xAE, 0x00, 0x00}, 'U'},
235 	{ 0x016F, {0xC5, 0xAF, 0x00, 0x00}, 'u'},
236 	{ 0x0170, {0xC5, 0xB0, 0x00, 0x00}, 'U'},
237 	{ 0x0171, {0xC5, 0xB1, 0x00, 0x00}, 'u'},
238 	{ 0x0172, {0xC5, 0xB2, 0x00, 0x00}, 'U'},
239 	{ 0x0173, {0xC5, 0xB3, 0x00, 0x00}, 'u'},
240 	{ 0x0174, {0xC5, 0xB4, 0x00, 0x00}, 'W'},
241 	{ 0x0175, {0xC5, 0xB5, 0x00, 0x00}, 'w'},
242 	{ 0x0176, {0xC5, 0xB6, 0x00, 0x00}, 'Y'},
243 	{ 0x0177, {0xC5, 0xB7, 0x00, 0x00}, 'y'},
244 	{ 0x0178, {0xC5, 0xB8, 0x00, 0x00}, 'Y'},
245 	{ 0x0179, {0xC5, 0xB9, 0x00, 0x00}, 'Z'},
246 	{ 0x017A, {0xC5, 0xBA, 0x00, 0x00}, 'z'},
247 	{ 0x017B, {0xC5, 0xBB, 0x00, 0x00}, 'Z'},
248 	{ 0x017C, {0xC5, 0xBC, 0x00, 0x00}, 'z'},
249 	{ 0x017D, {0xC5, 0xBD, 0x00, 0x00}, 'Z'},
250 	{ 0x017E, {0xC5, 0xBE, 0x00, 0x00}, 'z'},
251 	{ 0x017F, {0xC5, 0xBF, 0x00, 0x00}, 'S'},
252 
253 	{ 0x018F, {0xC6, 0x8F, 0x00, 0x00}, 'e'},
254 	{ 0x0192, {0xC6, 0x92, 0x00, 0x00}, 'f'},
255 	{ 0x01A0, {0xC6, 0xA0, 0x00, 0x00}, 'O'},
256 	{ 0x01A1, {0xC6, 0xA1, 0x00, 0x00}, 'o'},
257 	{ 0x01AF, {0xC6, 0xAF, 0x00, 0x00}, 'U'},
258 	{ 0x01B0, {0xC6, 0xB0, 0x00, 0x00}, 'u'},
259 
260 	{ 0x01CD, {0xC7, 0x8D, 0x00, 0x00}, 'A'},
261 	{ 0x01CE, {0xC7, 0x8E, 0x00, 0x00}, 'a'},
262 	{ 0x01CF, {0xC7, 0x8F, 0x00, 0x00}, 'I'},
263 	{ 0x01D0, {0xC7, 0x90, 0x00, 0x00}, 'i'},
264 	{ 0x01D1, {0xC7, 0x91, 0x00, 0x00}, 'O'},
265 	{ 0x01D2, {0xC7, 0x92, 0x00, 0x00}, 'o'},
266 	{ 0x01D3, {0xC7, 0x93, 0x00, 0x00}, 'U'},
267 	{ 0x01D4, {0xC7, 0x94, 0x00, 0x00}, 'u'},
268 	{ 0x01D5, {0xC7, 0x95, 0x00, 0x00}, 'U'},
269 	{ 0x01D6, {0xC7, 0x96, 0x00, 0x00}, 'u'},
270 	{ 0x01D7, {0xC7, 0x97, 0x00, 0x00}, 'U'},
271 	{ 0x01D8, {0xC7, 0x98, 0x00, 0x00}, 'u'},
272 	{ 0x01D9, {0xC7, 0x99, 0x00, 0x00}, 'U'},
273 	{ 0x01DA, {0xC7, 0x9A, 0x00, 0x00}, 'u'},
274 	{ 0x01DB, {0xC7, 0x9B, 0x00, 0x00}, 'U'},
275 	{ 0x01DC, {0xC7, 0x9C, 0x00, 0x00}, 'u'},
276 
277 	{ 0x01FA, {0xC7, 0xBA, 0x00, 0x00}, 'A'},
278 	{ 0x01FB, {0xC7, 0xBB, 0x00, 0x00}, 'a'},
279 	{ 0x01FC, {0xC7, 0xBC, 0x00, 0x00}, 'A'},
280 	{ 0x01FD, {0xC7, 0xBD, 0x00, 0x00}, 'a'},
281 	{ 0x01FE, {0xC7, 0xBE, 0x00, 0x00}, 'O'},
282 	{ 0x01FF, {0xC7, 0xBF, 0x00, 0x00}, 'o'},
283 
284 	{ 0x0259, {0xC9, 0x99, 0x00, 0x00}, 'e'},
285 
286 	{ 0x02C9, {0xCB, 0x89, 0x00, 0x00}, '-'},
287 	{ 0x02DA, {0xCB, 0x9A, 0x00, 0x00}, '*'},
288 	{ 0x02DC, {0xCB, 0x9C, 0x00, 0x00}, '\"'},
289 	{ 0x02DD, {0xCB, 0x9D, 0x00, 0x00}, '\"'},
290 
291 	{ 0x0300, {0xCC, 0x80, 0x00, 0x00}, '\''},
292 	{ 0x0301, {0xCC, 0x81, 0x00, 0x00}, '\''},
293 	{ 0x0303, {0xCC, 0x83, 0x00, 0x00}, '\"'},
294 	{ 0x0323, {0xCC, 0xA3, 0x00, 0x00}, '.'},
295 
296 	{ 0x037E, {0xCD, 0xBE, 0x00, 0x00}, ';'},
297 
298 	{ 0x0384, {0xCE, 0x84, 0x00, 0x00}, '\''},
299 	{ 0x0386, {0xCE, 0x86, 0x00, 0x00}, 'A'},
300 	{ 0x0387, {0xCE, 0x87, 0x00, 0x00}, '.'},
301 	{ 0x0388, {0xCE, 0x88, 0x00, 0x00}, 'E'},
302 	{ 0x0389, {0xCE, 0x89, 0x00, 0x00}, 'G'},
303 	{ 0x038A, {0xCE, 0x8A, 0x00, 0x00}, 'I'},
304 	{ 0x038C, {0xCE, 0x8C, 0x00, 0x00}, 'O'},
305 	{ 0x038E, {0xCE, 0x8E, 0x00, 0x00}, 'Y'},
306 	{ 0x038F, {0xCE, 0x8F, 0x00, 0x00}, 'O'},
307 	{ 0x0390, {0xCE, 0x90, 0x00, 0x00}, 'i'},
308 
309 	{ 0x0391, {0xCE, 0x91, 0x00, 0x00}, 'A'},
310 	{ 0x0392, {0xCE, 0x92, 0x00, 0x00}, 'B'},
311 	{ 0x0393, {0xCE, 0x93, 0x00, 0x00}, 'G'},
312 	{ 0x0394, {0xCE, 0x94, 0x00, 0x00}, 'D'},
313 	{ 0x0395, {0xCE, 0x95, 0x00, 0x00}, 'E'},
314 	{ 0x0396, {0xCE, 0x96, 0x00, 0x00}, 'Z'},
315 	{ 0x0397, {0xCE, 0x97, 0x00, 0x00}, 'H'},
316 	{ 0x0398, {0xCE, 0x98, 0x00, 0x00}, 'T'},
317 	{ 0x0399, {0xCE, 0x99, 0x00, 0x00}, 'I'},
318 	{ 0x039A, {0xCE, 0x9A, 0x00, 0x00}, 'K'},
319 	{ 0x039B, {0xCE, 0x9B, 0x00, 0x00}, 'L'},
320 	{ 0x039C, {0xCE, 0x9C, 0x00, 0x00}, 'M'},
321 	{ 0x039D, {0xCE, 0x9D, 0x00, 0x00}, 'N'},
322 	{ 0x039E, {0xCE, 0x9E, 0x00, 0x00}, 'X'},
323 	{ 0x039F, {0xCE, 0x9F, 0x00, 0x00}, 'O'},
324 	{ 0x03A0, {0xCE, 0xA0, 0x00, 0x00}, 'P'},
325 	{ 0x03A1, {0xCE, 0xA1, 0x00, 0x00}, 'R'},
326 	{ 0x03A3, {0xCE, 0xA3, 0x00, 0x00}, 'S'},
327 	{ 0x03A4, {0xCE, 0xA4, 0x00, 0x00}, 'T'},
328 	{ 0x03A5, {0xCE, 0xA5, 0x00, 0x00}, 'Y'},
329 	{ 0x03A6, {0xCE, 0xA6, 0x00, 0x00}, 'F'},
330 	{ 0x03A7, {0xCE, 0xA7, 0x00, 0x00}, 'C'},
331 	{ 0x03A8, {0xCE, 0xA8, 0x00, 0x00}, 'P'},
332 	{ 0x03A9, {0xCE, 0xA9, 0x00, 0x00}, 'W'},
333 	{ 0x03AA, {0xCE, 0xAA, 0x00, 0x00}, 'I'},
334 	{ 0x03AB, {0xCE, 0xAB, 0x00, 0x00}, 'Y'},
335 	{ 0x03AC, {0xCE, 0xAC, 0x00, 0x00}, 'a'},
336 	{ 0x03AD, {0xCE, 0xAD, 0x00, 0x00}, 'e'},
337 	{ 0x03AE, {0xCE, 0xAE, 0x00, 0x00}, 'h'},
338 	{ 0x03AF, {0xCE, 0xAF, 0x00, 0x00}, 'i'},
339 	{ 0x03B0, {0xCE, 0xB0, 0x00, 0x00}, 'y'},
340 	{ 0x03B1, {0xCE, 0xB1, 0x00, 0x00}, 'I'},
341 	{ 0x03B2, {0xCE, 0xB2, 0x00, 0x00}, 'a'},
342 	{ 0x03B3, {0xCE, 0xB3, 0x00, 0x00}, 'b'},
343 	{ 0x03B4, {0xCE, 0xB4, 0x00, 0x00}, 'g'},
344 	{ 0x03B5, {0xCE, 0xB5, 0x00, 0x00}, 'd'},
345 	{ 0x03B6, {0xCE, 0xB6, 0x00, 0x00}, 'e'},
346 	{ 0x03B7, {0xCE, 0xB7, 0x00, 0x00}, 'z'},
347 	{ 0x03B8, {0xCE, 0xB8, 0x00, 0x00}, 'h'},
348 	{ 0x03B9, {0xCE, 0xB9, 0x00, 0x00}, 't'},
349 	{ 0x03BA, {0xCE, 0xBA, 0x00, 0x00}, 'i'},
350 	{ 0x03BB, {0xCE, 0xBB, 0x00, 0x00}, 'k'},
351 	{ 0x03BC, {0xCE, 0xBC, 0x00, 0x00}, 'l'},
352 	{ 0x03BD, {0xCE, 0xBD, 0x00, 0x00}, 'm'},
353 	{ 0x03BE, {0xCE, 0xBE, 0x00, 0x00}, 'n'},
354 	{ 0x03BF, {0xCE, 0xBF, 0x00, 0x00}, 'x'},
355 
356 	{ 0x03C0, {0xCF, 0x80, 0x00, 0x00}, 'o'},
357 	{ 0x03C1, {0xCF, 0x81, 0x00, 0x00}, 'p'},
358 	{ 0x03C2, {0xCF, 0x82, 0x00, 0x00}, 'r'},
359 	{ 0x03C3, {0xCF, 0x83, 0x00, 0x00}, 's'},
360 	{ 0x03C4, {0xCF, 0x84, 0x00, 0x00}, 't'},
361 	{ 0x03C5, {0xCF, 0x85, 0x00, 0x00}, 'y'},
362 	{ 0x03C6, {0xCF, 0x86, 0x00, 0x00}, 'f'},
363 	{ 0x03C7, {0xCF, 0x87, 0x00, 0x00}, 'c'},
364 	{ 0x03C8, {0xCF, 0x88, 0x00, 0x00}, 'p'},
365 	{ 0x03C9, {0xCF, 0x89, 0x00, 0x00}, 'w'},
366 	{ 0x03CA, {0xCF, 0x8A, 0x00, 0x00}, 'i'},
367 	{ 0x03CB, {0xCF, 0x8B, 0x00, 0x00}, 'y'},
368 	{ 0x03CC, {0xCF, 0x8C, 0x00, 0x00}, 'o'},
369 	{ 0x03CD, {0xCF, 0x8D, 0x00, 0x00}, 'y'},
370 	{ 0x03CE, {0xCF, 0x8E, 0x00, 0x00}, 'w'},
371 
372 	// TODO: finish cyrilic
373 	{ 0x0401, {0xD0, 0x81, 0x00, 0x00}, 'E'},
374 	{ 0x0404, {0xD0, 0x84, 0x00, 0x00}, 'E'},
375 	{ 0x0405, {0xD0, 0x85, 0x00, 0x00}, 'S'},
376 	{ 0x0406, {0xD0, 0x86, 0x00, 0x00}, 'I'},
377 	{ 0x0407, {0xD0, 0x87, 0x00, 0x00}, 'I'},
378 	{ 0x0408, {0xD0, 0x88, 0x00, 0x00}, 'J'},
379 	{ 0x040C, {0xD0, 0x8C, 0x00, 0x00}, 'K'},
380 	{ 0x040E, {0xD0, 0x8E, 0x00, 0x00}, 'Y'},
381 	{ 0x0425, {0xD0, 0xA5, 0x00, 0x00}, 'X'},
382 	{ 0x042C, {0xD0, 0xAC, 0x00, 0x00}, 'b'},
383 	{ 0x0430, {0xD0, 0xB0, 0x00, 0x00}, 'a'},
384 	{ 0x0432, {0xD0, 0xB2, 0x00, 0x00}, 'v'},
385 	{ 0x0435, {0xD0, 0xB5, 0x00, 0x00}, 'e'},
386 	{ 0x043A, {0xD0, 0xBA, 0x00, 0x00}, 'k'},
387 	{ 0x043C, {0xD0, 0xBC, 0x00, 0x00}, 'm'},
388 
389 	// TODO: hebrew
390 
391 	{ 0x1E80, {0xE1, 0xBA, 0x80, 0x00}, 'W'},
392 	{ 0x1E81, {0xE1, 0xBA, 0x81, 0x00}, 'w'},
393 	{ 0x1E82, {0xE1, 0xBA, 0x82, 0x00}, 'W'},
394 	{ 0x1E83, {0xE1, 0xBA, 0x83, 0x00}, 'w'},
395 	{ 0x1E84, {0xE1, 0xBA, 0x84, 0x00}, 'W'},
396 	{ 0x1E85, {0xE1, 0xBA, 0x85, 0x00}, 'w'},
397 	{ 0x1EA0, {0xE1, 0xBA, 0xA0, 0x00}, 'A'},
398 	{ 0x1EA1, {0xE1, 0xBA, 0xA1, 0x00}, 'a'},
399 	{ 0x1EA2, {0xE1, 0xBA, 0xA2, 0x00}, 'A'},
400 	{ 0x1EA3, {0xE1, 0xBA, 0xA3, 0x00}, 'a'},
401 	{ 0x1EA4, {0xE1, 0xBA, 0xA4, 0x00}, 'A'},
402 	{ 0x1EA5, {0xE1, 0xBA, 0xA5, 0x00}, 'a'},
403 	{ 0x1EA6, {0xE1, 0xBA, 0xA6, 0x00}, 'A'},
404 	{ 0x1EA7, {0xE1, 0xBA, 0xA7, 0x00}, 'a'},
405 	{ 0x1EA8, {0xE1, 0xBA, 0xA8, 0x00}, 'A'},
406 	{ 0x1EA9, {0xE1, 0xBA, 0xA9, 0x00}, 'a'},
407 	{ 0x1EAA, {0xE1, 0xBA, 0xAA, 0x00}, 'A'},
408 	{ 0x1EAB, {0xE1, 0xBA, 0xAB, 0x00}, 'a'},
409 	{ 0x1EAC, {0xE1, 0xBA, 0xAC, 0x00}, 'A'},
410 	{ 0x1EAD, {0xE1, 0xBA, 0xAD, 0x00}, 'a'},
411 	{ 0x1EAE, {0xE1, 0xBA, 0xAE, 0x00}, 'A'},
412 	{ 0x1EAF, {0xE1, 0xBA, 0xAF, 0x00}, 'a'},
413 	{ 0x1EB0, {0xE1, 0xBA, 0xB0, 0x00}, 'A'},
414 	{ 0x1EB1, {0xE1, 0xBA, 0xB1, 0x00}, 'a'},
415 	{ 0x1EB2, {0xE1, 0xBA, 0xB2, 0x00}, 'A'},
416 	{ 0x1EB3, {0xE1, 0xBA, 0xB3, 0x00}, 'a'},
417 	{ 0x1EB4, {0xE1, 0xBA, 0xB4, 0x00}, 'A'},
418 	{ 0x1EB5, {0xE1, 0xBA, 0xB5, 0x00}, 'a'},
419 	{ 0x1EB6, {0xE1, 0xBA, 0xB6, 0x00}, 'A'},
420 	{ 0x1EB7, {0xE1, 0xBA, 0xB7, 0x00}, 'a'},
421 	{ 0x1EB8, {0xE1, 0xBA, 0xB8, 0x00}, 'E'},
422 	{ 0x1EB8, {0xE1, 0xBA, 0xB9, 0x00}, 'e'},
423 	{ 0x1EBA, {0xE1, 0xBA, 0xBA, 0x00}, 'E'},
424 	{ 0x1EBB, {0xE1, 0xBA, 0xBB, 0x00}, 'e'},
425 	{ 0x1EBC, {0xE1, 0xBA, 0xBC, 0x00}, 'E'},
426 	{ 0x1EBD, {0xE1, 0xBA, 0xBD, 0x00}, 'e'},
427 	{ 0x1EBE, {0xE1, 0xBA, 0xBE, 0x00}, 'E'},
428 	{ 0x1EBF, {0xE1, 0xBA, 0xBF, 0x00}, 'e'},
429 
430 	{ 0x1EC0, {0xE1, 0xBB, 0x80, 0x00}, 'E'},
431 	{ 0x1EC1, {0xE1, 0xBB, 0x81, 0x00}, 'e'},
432 	{ 0x1EC2, {0xE1, 0xBB, 0x82, 0x00}, 'E'},
433 	{ 0x1EC3, {0xE1, 0xBB, 0x83, 0x00}, 'e'},
434 	{ 0x1EC4, {0xE1, 0xBB, 0x84, 0x00}, 'E'},
435 	{ 0x1EC5, {0xE1, 0xBB, 0x85, 0x00}, 'e'},
436 	{ 0x1EC6, {0xE1, 0xBB, 0x86, 0x00}, 'E'},
437 	{ 0x1EC7, {0xE1, 0xBB, 0x87, 0x00}, 'e'},
438 	{ 0x1EC8, {0xE1, 0xBB, 0x88, 0x00}, 'I'},
439 	{ 0x1EC9, {0xE1, 0xBB, 0x89, 0x00}, 'i'},
440 	{ 0x1ECA, {0xE1, 0xBB, 0x8A, 0x00}, 'I'},
441 	{ 0x1ECB, {0xE1, 0xBB, 0x8B, 0x00}, 'i'},
442 	{ 0x1ECC, {0xE1, 0xBB, 0x8C, 0x00}, 'O'},
443 	{ 0x1ECD, {0xE1, 0xBB, 0x8D, 0x00}, 'o'},
444 	{ 0x1ECE, {0xE1, 0xBB, 0x8E, 0x00}, 'O'},
445 	{ 0x1ECF, {0xE1, 0xBB, 0x8F, 0x00}, 'o'},
446 	{ 0x1ED0, {0xE1, 0xBB, 0x90, 0x00}, 'O'},
447 	{ 0x1ED1, {0xE1, 0xBB, 0x91, 0x00}, 'o'},
448 	{ 0x1ED2, {0xE1, 0xBB, 0x92, 0x00}, 'O'},
449 	{ 0x1ED3, {0xE1, 0xBB, 0x93, 0x00}, 'o'},
450 	{ 0x1ED4, {0xE1, 0xBB, 0x94, 0x00}, 'O'},
451 	{ 0x1ED5, {0xE1, 0xBB, 0x95, 0x00}, 'o'},
452 	{ 0x1ED6, {0xE1, 0xBB, 0x96, 0x00}, 'O'},
453 	{ 0x1ED7, {0xE1, 0xBB, 0x97, 0x00}, 'o'},
454 	{ 0x1ED8, {0xE1, 0xBB, 0x98, 0x00}, 'O'},
455 	{ 0x1ED9, {0xE1, 0xBB, 0x99, 0x00}, 'o'},
456 	{ 0x1EDA, {0xE1, 0xBB, 0x9A, 0x00}, 'O'},
457 	{ 0x1EDB, {0xE1, 0xBB, 0x9B, 0x00}, 'o'},
458 	{ 0x1EDC, {0xE1, 0xBB, 0x9C, 0x00}, 'O'},
459 	{ 0x1EDD, {0xE1, 0xBB, 0x9D, 0x00}, 'o'},
460 	{ 0x1EDE, {0xE1, 0xBB, 0x9E, 0x00}, 'O'},
461 	{ 0x1EDF, {0xE1, 0xBB, 0x9F, 0x00}, 'o'},
462 	{ 0x1EE0, {0xE1, 0xBB, 0xA0, 0x00}, 'O'},
463 	{ 0x1EE1, {0xE1, 0xBB, 0xA1, 0x00}, 'o'},
464 	{ 0x1EE2, {0xE1, 0xBB, 0xA2, 0x00}, 'O'},
465 	{ 0x1EE3, {0xE1, 0xBB, 0xA3, 0x00}, 'o'},
466 	{ 0x1EE4, {0xE1, 0xBB, 0xA4, 0x00}, 'U'},
467 	{ 0x1EE5, {0xE1, 0xBB, 0xA5, 0x00}, 'u'},
468 	{ 0x1EE6, {0xE1, 0xBB, 0xA6, 0x00}, 'U'},
469 	{ 0x1EE7, {0xE1, 0xBB, 0xA7, 0x00}, 'u'},
470 	{ 0x1EE8, {0xE1, 0xBB, 0xA8, 0x00}, 'U'},
471 	{ 0x1EE9, {0xE1, 0xBB, 0xA9, 0x00}, 'u'},
472 	{ 0x1EEA, {0xE1, 0xBB, 0xAA, 0x00}, 'U'},
473 	{ 0x1EEB, {0xE1, 0xBB, 0xAB, 0x00}, 'u'},
474 	{ 0x1EEC, {0xE1, 0xBB, 0xAC, 0x00}, 'U'},
475 	{ 0x1EED, {0xE1, 0xBB, 0xAD, 0x00}, 'u'},
476 	{ 0x1EEE, {0xE1, 0xBB, 0xAE, 0x00}, 'U'},
477 	{ 0x1EEF, {0xE1, 0xBB, 0xAF, 0x00}, 'u'},
478 	{ 0x1EF0, {0xE1, 0xBB, 0xB0, 0x00}, 'U'},
479 	{ 0x1EF1, {0xE1, 0xBB, 0xB1, 0x00}, 'u'},
480 	{ 0x1EF2, {0xE1, 0xBB, 0xB2, 0x00}, 'Y'},
481 	{ 0x1EF3, {0xE1, 0xBB, 0xB3, 0x00}, 'y'},
482 	{ 0x1EF4, {0xE1, 0xBB, 0xB4, 0x00}, 'Y'},
483 	{ 0x1EF5, {0xE1, 0xBB, 0xB5, 0x00}, 'y'},
484 	{ 0x1EF6, {0xE1, 0xBB, 0xB6, 0x00}, 'Y'},
485 	{ 0x1EF7, {0xE1, 0xBB, 0xB7, 0x00}, 'y'},
486 	{ 0x1EF8, {0xE1, 0xBB, 0xB8, 0x00}, 'Y'},
487 	{ 0x1EF9, {0xE1, 0xBB, 0xB9, 0x00}, 'y'},
488 
489 	{ 0x2013, {0xE2, 0x80, 0x93, 0x00}, '-'},
490 	{ 0x2014, {0xE2, 0x80, 0x94, 0x00}, '-'},
491 	{ 0x2015, {0xE2, 0x80, 0x95, 0x00}, '-'},
492 	{ 0x2017, {0xE2, 0x80, 0x97, 0x00}, '_'},
493 	{ 0x2018, {0xE2, 0x80, 0x98, 0x00}, '\''},
494 	{ 0x2019, {0xE2, 0x80, 0x99, 0x00}, '\''},
495 	{ 0x201A, {0xE2, 0x80, 0x9A, 0x00}, '\''},
496 	{ 0x201B, {0xE2, 0x80, 0x9B, 0x00}, '\''},
497 	{ 0x201C, {0xE2, 0x80, 0x9C, 0x00}, '\"'},
498 	{ 0x201D, {0xE2, 0x80, 0x9D, 0x00}, '\"'},
499 	{ 0x201E, {0xE2, 0x80, 0x9E, 0x00}, '\"'},
500 	{ 0x2020, {0xE2, 0x80, 0xA0, 0x00}, '+'},
501 	{ 0x2021, {0xE2, 0x80, 0xA1, 0x00}, '+'},
502 	{ 0x2022, {0xE2, 0x80, 0xA2, 0x00}, '*'}
503 
504 	// TODO: more :)
505 };
506 
507 
508 
509 
510 // grabbed from SDL_ttf (also LGPL)
UNICODE_to_UTF8(unsigned char * utf8,UnicodeChar unicode)511 static void UNICODE_to_UTF8(unsigned char *utf8, UnicodeChar unicode)
512 {
513     int j=0;
514 
515     if (unicode < 0x80)
516     {
517         utf8[j] = unicode & 0x7F;
518     }
519     else if (unicode < 0x800)
520     {
521         utf8[j] = 0xC0 | (unicode >> 6);
522         utf8[++j] = 0x80 | (unicode & 0x3F);
523     }
524     else if (unicode < 0x10000)
525     {
526         utf8[j] = 0xE0 | (unicode >> 12);
527         utf8[++j] = 0x80 | ((unicode >> 6) & 0x3F);
528         utf8[++j] = 0x80 | (unicode & 0x3F);
529     }
530     else if (unicode < 0x200000)
531     {
532         utf8[j] = 0xF0 | (unicode >> 18);
533         utf8[++j] = 0x80 | ((unicode >> 12) & 0x3F);
534         utf8[++j] = 0x80 | ((unicode >> 6) & 0x3F);
535         utf8[++j] = 0x80 | (unicode & 0x3F);
536     }
537     else if (unicode < 0x4000000)
538     {
539         utf8[j] = 0xF8 | (unicode >> 24);
540         utf8[++j] = 0x80 | ((unicode >> 18) & 0x3F);
541         utf8[++j] = 0x80 | ((unicode >> 12) & 0x3F);
542         utf8[++j] = 0x80 | ((unicode >> 6) & 0x3F);
543         utf8[++j] = 0x80 | (unicode & 0x3F);
544     }
545     else if (unicode < 0x80000000)
546     {
547         utf8[j] = 0xFC | (unicode >> 30);
548         utf8[++j] = 0x80 | ((unicode >> 24) & 0x3F);
549         utf8[++j] = 0x80 | ((unicode >> 18) & 0x3F);
550         utf8[++j] = 0x80 | ((unicode >> 12) & 0x3F);
551         utf8[++j] = 0x80 | ((unicode >> 6) & 0x3F);
552         utf8[++j] = 0x80 | (unicode & 0x3F);
553     }
554     else
555     	utf8[j] = 0;
556 
557     utf8[++j] = 0;
558 }
559 
560 /////////////////
561 // Convert unicode to UTF8
GetUtf8FromUnicode(UnicodeChar ch)562 std::string GetUtf8FromUnicode(UnicodeChar ch) {
563 	if(ch == 0) return std::string("\0", 1);
564 	unsigned char utf8[7];
565 	UNICODE_to_UTF8(utf8, ch);
566 	return (const char*)utf8;
567 }
568 
569 
570 ////////////////////
571 // Convert UTF8 to unicode (takes iterator pointing to the first UTF8-encoded character)
GetNextUnicodeFromUtf8(std::string::const_iterator & it,const std::string::const_iterator & last,size_t & num_skipped)572 UnicodeChar GetNextUnicodeFromUtf8(std::string::const_iterator &it, const std::string::const_iterator& last, size_t& num_skipped) {
573 	num_skipped = 0;
574 	if(it == last) return 0;
575 
576 	unsigned char ch = *it;
577 	UnicodeChar res = ch;
578 	if ( ch >= 0xFC ) {
579 		res  =  (ch&0x01) << 30; it++; num_skipped++; if(it == last) return 0; ch = *it;
580 		res |=  (ch&0x3F) << 24; it++; num_skipped++; if(it == last) return 0; ch = *it;
581 		res |=  (ch&0x3F) << 18; it++; num_skipped++; if(it == last) return 0; ch = *it;
582 		res |=  (ch&0x3F) << 12; it++; num_skipped++; if(it == last) return 0; ch = *it;
583 		res |=  (ch&0x3F) << 6; it++;  num_skipped++; if(it == last) return 0; ch = *it;
584 		res |=  (ch&0x3F);
585 	} else
586 	if ( ch >= 0xF8 ) {
587 		res  =  (ch&0x03) << 24; num_skipped++; it++; if(it == last) return 0; ch = *it;
588 		res |=  (ch&0x3F) << 18; num_skipped++; it++; if(it == last) return 0; ch = *it;
589 		res |=  (ch&0x3F) << 12; num_skipped++; it++; if(it == last) return 0; ch = *it;
590 		res |=  (ch&0x3F) << 6;  num_skipped++; it++; if(it == last) return 0; ch = *it;
591 		res |=  (ch&0x3F);
592 	} else
593 	if ( ch >= 0xF0 ) {
594 		res  =  (ch&0x07) << 18; it++; num_skipped++; if(it == last) return 0; ch = *it;
595 		res |=  (ch&0x3F) << 12; it++; num_skipped++; if(it == last) return 0; ch = *it;
596 		res |=  (ch&0x3F) << 6; it++;  num_skipped++; if(it == last) return 0; ch = *it;
597 		res |=  (ch&0x3F);
598 	} else
599 	if ( ch >= 0xE0 ) {
600 		res  =  (ch&0x0F) << 12; it++; num_skipped++; if(it == last) return 0; ch = *it;
601 		res |=  (ch&0x3F) << 6; it++; num_skipped++; if(it == last) return 0; ch = *it;
602 		res |=  (ch&0x3F);
603 	} else
604 	if ( ch >= 0xC0 ) {
605 		res  =  (ch&0x1F) << 6; it++; num_skipped++; if(it == last) return 0; ch = *it;
606 		res |=  (ch&0x3F);
607 	}
608 
609 	it++; num_skipped++;
610 	return res;
611 }
612 
613 // Conversion functions
614 
615 ////////////////////////////
616 // Finds an index for the character in conversion table, returns -1 if not found
FindTableIndex(UnicodeChar c)617 int FindTableIndex(UnicodeChar c)
618 {
619 	int left, right, middle;
620 
621 	left = 0;
622 	right = sizeof(tConversionTable)/sizeof(ConversionItem) - 1;
623 
624 	// Binary search
625 	while (left <= right)  {
626 		middle = (left + right) / 2;
627 		if (tConversionTable[middle].Unicode == c)
628 			return middle;
629 
630 		if (c < tConversionTable[middle].Unicode)
631 			right = middle - 1;
632 		else
633 			left = middle + 1;
634 	}
635 
636 	return -1; // No conversion available
637 }
638 
639 /////////////////////////
640 // Converts given unicode character to ascii, according to conversion table
641 // If impossible to convert, returns 0xFF
UnicodeCharToAsciiChar(UnicodeChar c)642 char UnicodeCharToAsciiChar(UnicodeChar c)
643 {
644 	// Regular ascii, just continue
645 	if (c <= 0x80)
646 		return (char) c;
647 
648 
649 	// Unicode, try to convert
650 	int index = FindTableIndex(c);
651 	if (index == -1)  // Cannot convert
652 		return (char)0xFF;
653 	else
654 		return tConversionTable[index].Ascii;
655 
656 }
657 
658 ////////////////////////
659 // Converts a UTF-8 string to Ascii while replacing unicode characters by the closest ones from ASCII
UnicodeToAscii(const std::string & utf8str)660 std::string UnicodeToAscii(const std::string& utf8str)
661 {
662 	std::string res;
663 	res.reserve(utf8str.size());
664 
665 	for (std::string::const_iterator it = utf8str.begin(); it != utf8str.end(); )  {
666 		if ((unsigned char)*it < 0x80)  {  // Normal Ascii
667 			res += *it;
668 			++it;
669 		} else  {  // Unicode
670 			const UnicodeChar c = GetNextUnicodeFromUtf8(it, utf8str.end());
671 			const int idx = FindTableIndex(c);
672 			if (idx != -1)
673 				res += tConversionTable[idx].Ascii;
674 		}
675 	}
676 
677 	return res;
678 }
679 
680 ////////////////////////
681 // Like tolower() but for all international characters
UnicodeToLower(UnicodeChar c)682 UnicodeChar	UnicodeToLower(UnicodeChar c)
683 {
684 	// ASCII
685 	if (c < 0xC0)
686 		return (UnicodeChar)tolower(c);
687 
688 	// Who the hell invented so crazzy mappings? :S
689 
690 	// European characters
691 	if (c >= 0xC0 && c <= 0xD6)
692 		return c + 0x20;
693 
694 	if (c >= 0xD8 && c <= 0xDE)
695 		return c + 0x20;
696 
697 	if (c >= 0x100 && c <= 0x177 && !(c & 1))
698 		return c + 1;
699 
700 	if (c == 0x178)
701 		return 0xFF;
702 
703 	if (c >= 0x179 && c <= 0x17E && (c & 1))
704 		return c + 1;
705 
706 	if (c == 0x18F)
707 		return 0x259;
708 
709 	if (c >= 0x1A0 && c <= 0x1FF && !(c & 1))
710 		return c + 1;
711 
712 	// Greece alphabet
713 	if (c == 0x386)
714 		return 0x3AC;
715 
716 	if (c >= 0x388 && c <= 0x38A)
717 		return c + 0x25;
718 
719 	if (c >= 0x38C && c <= 0x38F)
720 		return c + 0x40;
721 
722 	if (c >= 0x391 && c <= 0x3AB)
723 		return c + 0x1B;
724 
725 	// Cyrilic
726 	if (c >= 0x401 && c <= 0x40F)
727 		return c + 0x50;
728 
729 	if (c >= 0x410 && c <= 0x42F)
730 		return c + 0x20;
731 
732 	if (c >= 0x490 && c <= 0x4E9 && !(c & 1))
733 		return c + 1;
734 
735 	// More European characters
736 	if (c >= 0x1E80 && c <= 0x1EF9 && !(c & 1))
737 		return c + 1;
738 
739 	// This character doesn't have lowercase
740 	return c;
741 }
742 
743 ////////////////////////
744 // Like toupper() but for all international characters
UnicodeToUpper(UnicodeChar c)745 UnicodeChar	UnicodeToUpper(UnicodeChar c)
746 {
747 	// ASCII
748 	if (c < 0xC0)
749 		return (UnicodeChar)toupper(c);
750 
751 	// Who the hell invented so crazzy mappings? :S
752 
753 	// European characters
754 	if (c >= 0xE0 && c <= 0xF6)
755 		return c - 0x20;
756 
757 	if (c >= 0xF8 && c <= 0xFE)
758 		return c - 0x20;
759 
760 	if (c == 0xFF)
761 		return 0x178;
762 
763 	if (c >= 0x100 && c <= 0x177 && (c & 1))
764 		return c - 1;
765 
766 	if (c >= 0x179 && c <= 0x17E && !(c & 1))
767 		return c - 1;
768 
769 	if (c == 0x259)
770 		return 0x18F;
771 
772 	if (c >= 0x1A0 && c <= 0x1FF && (c & 1))
773 		return c - 1;
774 
775 	// Greece alphabet
776 	if (c == 0x3AC)
777 		return 0x386;
778 
779 	if (c >= 0x3AD && c <= 0x3AF)
780 		return c - 0x25;
781 
782 	if (c >= 0x3CC && c <= 0x3CF)
783 		return c - 0x40;
784 
785 	if (c >= 0x3B1 && c <= 0x3CB)
786 		return c - 0x1B;
787 
788 	// Cyrilic
789 	if (c >= 0x451 && c <= 0x45F)
790 		return c - 0x50;
791 
792 	if (c >= 0x430 && c <= 0x44F)
793 		return c - 0x20;
794 
795 	if (c >= 0x490 && c <= 0x4E9 && (c & 1))
796 		return c - 1;
797 
798 	// More European characters
799 	if (c >= 0x1E80 && c <= 0x1EF9 && (c & 1))
800 		return c - 1;
801 
802 	// This character doesn't have uppercase
803 	return c;
804 }
805 
806 /////////////////////////
807 // Find a substring in a string (case insensitive)
808 // Handles UTF8 strings correctly
Utf8StringCaseFind(const std::string & text,const std::string & search_for)809 size_t Utf8StringCaseFind(const std::string& text, const std::string& search_for)
810 {
811 	// HINT: same as stringcasefind, only using UTF8 functions instead (a bit slower)
812 	size_t search_for_size = Utf8StringSize(search_for);
813 
814 	if (text.size() == 0 || search_for_size == 0 || search_for_size > Utf8StringSize(text))
815 		return std::string::npos;
816 
817 	std::string::const_iterator it1 = text.begin();
818 	std::string::const_iterator it2 = search_for.begin();
819 
820 	size_t number_of_same = 0;
821 	size_t number_of_same_bytes = 0;
822 	size_t result = 0;
823 
824 	// Go through the text
825 	while (it1 != text.end())  {
826 		size_t num_skipped = 0;
827 		UnicodeChar c1 = UnicodeToLower(GetNextUnicodeFromUtf8(it1, text.end(), num_skipped));
828 		UnicodeChar c2 = UnicodeToLower(GetNextUnicodeFromUtf8(it2, search_for.end()));
829 
830 		// The two characters are the same
831 		if (c1 == c2)  {
832 			number_of_same++;  // If number of same characters equals to the size of the substring, we've found it!
833 			if (number_of_same == search_for_size)
834 				return result - number_of_same_bytes;
835 			number_of_same_bytes += num_skipped;
836 		} else {
837 			number_of_same = 0;
838 			number_of_same_bytes = 0;
839 			it2 = search_for.begin();
840 		}
841 
842 		result += num_skipped;
843 	}
844 
845 	return std::string::npos; // Not found
846 }
847 
848 /////////////////////////
849 // Converts the Utf8 encoded string to format that will display correctly in old LX
OldLxCompatibleString(const std::string & Utf8String)850 std::string OldLxCompatibleString(const std::string &Utf8String)
851 {
852 	std::string result = "";
853 	std::string::const_iterator utf8_it = Utf8String.begin();
854 	std::string::const_iterator last_it = Utf8String.begin();
855 
856 	UnicodeChar current;
857 	int index;
858 	while (utf8_it != Utf8String.end())  {
859 		current = GetNextUnicodeFromUtf8(utf8_it, Utf8String.end());
860 		if (current <= 0x80)  {  // Normal ascii, don't convert in any way
861 			result += (char)current;
862 			last_it = utf8_it;
863 			continue;
864 		}
865 
866 		// Unicode character
867 		index = FindTableIndex(current);
868 		result += std::string(last_it, utf8_it); // Keep the UTF8, old LX will ignore it
869 		if (index == -1)
870 			result += UNKNOWN_CHARACTER; // For characters that cannot be converted
871 		else
872 			result += tConversionTable[index].Ascii;
873 
874 		last_it = utf8_it;
875 	}
876 
877 	return result;
878 }
879 
880 /////////////////////////
881 // Converts the string created by function above back to a normal UTF8 string
882 // WARNING: passing a normal UTF8 string in this function will result in wrong output
Utf8String(const std::string & OldLxString)883 std::string Utf8String(const std::string& OldLxString)
884 {
885 	std::string result = "";
886 	std::string::const_iterator utf8_it = OldLxString.begin();
887 	std::string::const_iterator last_it = OldLxString.begin();
888 
889 	UnicodeChar current;
890 	while (utf8_it != OldLxString.end())  {
891 		current = GetNextUnicodeFromUtf8(utf8_it, OldLxString.end());
892 		if (current <= 0x80)  {  // Normal ascii, don't convert in any way
893 			result += (char)current;
894 			last_it = utf8_it;
895 			continue;
896 		}
897 
898 		// Unicode character
899 
900 		result += std::string(last_it, utf8_it); // Keep the UTF8
901 		if(utf8_it == OldLxString.end()) break;
902 		if ((unsigned char)(*utf8_it) <= 0x80)  { // If after the unicode character comes another one, just continue
903 
904 			// Ignore if the converted character comes after UTF8 character
905 			// NOTE: the check if the character is really a valid converted UTF8 is not made because
906 			// of forward compatibility - in future versions the conversion table can slightly change
907 			// which would make it incompatible
908 			utf8_it++;
909 		}
910 
911 		last_it = utf8_it;
912 	}
913 
914 	return result;
915 }
916 
917 /////////////////////////
918 // Removes special UTF8 characters from the string
RemoveSpecialChars(const std::string & Utf8String)919 std::string RemoveSpecialChars(const std::string &Utf8String)
920 {
921 	std::string result = "";
922 	std::string::const_iterator utf8_it = Utf8String.begin();
923 
924 	UnicodeChar current;
925 	int index;
926 	while (utf8_it != Utf8String.end())  {
927 		current = GetNextUnicodeFromUtf8(utf8_it, Utf8String.end());
928 		if (current <= 0x80)  // Normal ascii, keep it
929 			result += (char)current;
930 		else  { // Replace the unicode character with an ascii equivalent (if some)
931 			index = FindTableIndex(current);
932 			if (index != -1)
933 				result += tConversionTable[index].Ascii;
934 		}
935 
936 	}
937 
938 	return result;
939 }
940 
941 
942 
943 /*
 * Functions for UTF conversions taken from encoding.c, created by W3C
945  * The license is available at the following address:
946  * http://dev.w3.org/cvsweb/~checkout~/XML/Copyright?rev=1.1&content-type=text/plain
947  * Original file: http://dev.w3.org/cvsweb/~checkout~/XML/encoding.c
948  *
949  */
950 
951 //////////////
952 // Converts UTF16 to UTF8
Utf16ToUtf8(const Utf16String & str)953 std::string Utf16ToUtf8(const Utf16String& str)
954 {
955     Uint32 c, d = 0;
956 	std::string result;
957     int bits, iters;
958 
959 	for (Utf16String::const_iterator in = str.begin(); in != str.end();)  {
960 		c = *in;
961 		in++;
962 		if ((c & 0xFC00) == 0xD800) { // surrogates
963 			if ((in != str.end()) && (((d = (unsigned char)*in) & 0xFC00) == 0xDC00)) {
964 				c &= 0x03FF;
965 				c <<= 10;
966 				c |= d & 0x03FF;
967 				c += 0x10000;
968 			} else {
969 				return result;
970 			}
971 
972 			in++;
973         }
974 
975 		// assertion: c is a single UTF-4 value
976 
977 		if (c < 0x80)  {
978 			result += (char)c;
979 			bits= 0;
980 			iters = 0;
981 		} else if (c < 0x800)  {
982 			result += (char)((c >>  6) | 0xC0);
983 			bits=  0;
984 			iters = 1;
985 		} else if (c < 0x10000)  {
986 			result += (char)((c >> 12) | 0xE0);
987 			bits=  6;
988 			iters = 2;
989 		} else {
990 			result += (char)((c >> 18) | 0xF0);
991 			bits= 12;
992 			iters = 3;
993 		}
994 
995 		for ( ; iters; --iters) {
996 			result += (char)(((c >> bits) & 0x3F) | 0x80);
997 		}
998 	}
999 
1000 	return result;
1001 }
1002 
1003 ///////////////
1004 // Converts UTF8 to UTF16
Utf8ToUtf16(const std::string & str)1005 Utf16String Utf8ToUtf16(const std::string& str)
1006 {
1007     Uint32 c, d, trailing;
1008 	Utf16String result;
1009 
1010 	for (std::string::const_iterator in = str.begin(); in != str.end();)  {
1011 		d = (unsigned char)*in;
1012 		in++;
1013 
1014 		if (d < 0x80)  {
1015 			c = d;
1016 			trailing = 0;
1017 		} else if (d < 0xC0)  {
1018 			return result; // trailing byte in leading position
1019 		} else if (d < 0xE0)  {
1020 			c = d & 0x1F;
1021 			trailing = 1;
1022 		} else if (d < 0xF0)  {
1023 			c = d & 0x0F;
1024 			trailing= 2;
1025 		} else if (d < 0xF8)  {
1026 			c = d & 0x07;
1027 			trailing= 3;
1028 		} else  {
1029 			return result; // no chance for this in UTF-16
1030 		}
1031 
1032 		for ( ; trailing; trailing--) {
1033 			if (in == str.end())
1034 				return result;
1035 			if (((d = (unsigned char)*in++) & 0xC0) != 0x80)
1036 				return result;
1037 			c <<= 6;
1038 			c |= d & 0x3F;
1039 		}
1040 
1041 		// assertion: c is a single UTF-4 value
1042 		if (c < 0x10000) {
1043 			result += (Utf16Char) c;
1044 		} else if (c < 0x110000)  {
1045 			c -= 0x10000;
1046 			result += 0xD800 | (c >> 10);
1047 			result += 0xDC00 | (c & 0x03FF);
1048 		} else {
1049 			return result;
1050 		}
1051     }
1052     return result;
1053 }
1054 
1055 
1056 //////////////////
1057 // Convert a Unicode string to UTF8
UnicodeToUtf8(const Unicode32String & str)1058 std::string UnicodeToUtf8(const Unicode32String& str)
1059 {
1060 	std::string result;
1061 	for (Unicode32String::const_iterator i = str.begin(); i != str.end(); i++)  {
1062 		result += GetUtf8FromUnicode(*i);
1063 	}
1064 
1065 	return result;
1066 }
1067 
1068 //////////////////
1069 // Convert a UTF8 string to Unicode
Utf8ToUnicode(const std::string & str)1070 Unicode32String Utf8ToUnicode(const std::string& str)
1071 {
1072 	Unicode32String result;
1073 	for (std::string::const_iterator it = str.begin(); it != str.end();)
1074 		result += GetNextUnicodeFromUtf8(it, str.end());
1075 
1076 	return result;
1077 }
1078 
1079 
1080 #ifdef WIN32
1081 
1082 #include <windows.h>
1083 
1084 //////////////////////////
1085 // Convert a UTF-8 string to system native encoding
Utf8ToSystemNative(const std::string & utf8str)1086 std::string Utf8ToSystemNative(const std::string& utf8str)
1087 {
1088 	const Utf16String& u16str = Utf8ToUtf16(utf8str);
1089 	char *buf = new char[u16str.size() + 128]; // 128 - just in case...
1090 	int len = WideCharToMultiByte(CP_ACP, WC_NO_BEST_FIT_CHARS, (LPCWSTR) u16str.c_str(), u16str.size(), buf, u16str.size() + 128, NULL, NULL);
1091 	if (len == 0 && GetLastError() == ERROR_INVALID_FLAGS)  {
1092 		len = WideCharToMultiByte(CP_ACP, 0, (LPCWSTR) u16str.c_str(), u16str.size(), buf, u16str.size() + 128, NULL, NULL);
1093 		if (len == 0)
1094 			return "";
1095 	}
1096 	buf[MIN(len, u16str.size() + 127)] = '\0';
1097 
1098 	std::string res(buf);
1099 	delete[] buf;
1100 	return res;
1101 }
1102 
1103 ///////////////////////
1104 // Convert a system-native string to UTF-8
SystemNativeToUtf8(const std::string & natstr)1105 std::string SystemNativeToUtf8(const std::string& natstr)
1106 {
1107 	if (natstr.size() == 0)
1108 		return "";
1109 
1110 	wchar_t *buf = new wchar_t[natstr.size() + 128]; // 128 for safety
1111 	int len = MultiByteToWideChar(CP_ACP, 0, natstr.c_str(), natstr.size(), buf, natstr.size() + 128);
1112 	if (len == 0)
1113 		return "";
1114 	buf[MIN(len, natstr.size() + 127)] = 0;
1115 	std::string res = Utf16ToUtf8(Utf16String((Utf16Char *)buf));
1116 	delete[] buf;
1117 	return res;
1118 }
1119 
1120 #endif
1121 
///////////////////////
// Convert an ISO 8859-1 (Latin-1) string to UTF-8
// Bytes below 0x80 are copied, every other byte becomes a two-byte sequence
std::string ISO88591ToUtf8(const std::string& isostr)
{
	std::string utf8;
	utf8.reserve(isostr.size() * 2);

	for (size_t i = 0; i < isostr.size(); ++i)  {
		const unsigned char byte = (unsigned char)isostr[i];

		if (byte < 0x80)  { // Ascii, no conversion needed
			utf8 += (char)byte;
			continue;
		}

		// 0x80-0xBF get the leading byte 0xC2, 0xC0-0xFF get 0xC3
		utf8 += (char)(byte >= 0xC0 ? 0xC3 : 0xC2);
		// The second byte carries the low 6 bits with the continuation marker
		utf8 += (char)(byte & 0xBF);
	}

	return utf8;
}
1141 
1142 
TransformRawToUtf8Pos(const std::string & text,size_t pos)1143 size_t TransformRawToUtf8Pos(const std::string& text, size_t pos) {
1144 	const_string_iterator newpos(text);
1145 	size_t count = 0;
1146 	while(newpos.pos < pos && newpos.pos < text.size()) {
1147 		IncUtf8StringIterator(newpos, const_string_iterator(text, text.size()));
1148 		count++;
1149 	}
1150 	if(newpos.pos < pos)
1151 		count += pos - newpos.pos;
1152 	return count;
1153 }
1154 
TransformUtf8PosToRaw(const std::string & text,size_t pos)1155 size_t TransformUtf8PosToRaw(const std::string& text, size_t pos) {
1156 	const_string_iterator newpos(text);
1157 	size_t count = 0;
1158 	while(count < pos && newpos.pos < text.size()) {
1159 		IncUtf8StringIterator(newpos, const_string_iterator(text, text.size()));
1160 		count++;
1161 	}
1162 	if(count < pos)
1163 		newpos.pos += pos - count;
1164 	return newpos.pos;
1165 }
1166 
1167