1 /*
2 OpenLieroX
3
4 UTF8/Unicode conversions
5
6 code under LGPL
7 created 01-05-2007
8 by Albert Zeyer and Dark Charlie
9 */
10
11 #ifdef _MSC_VER
12 #pragma warning(disable: 4786) // WARNING: identifier XXX was truncated to 255 characters in the debug info
13 #pragma warning(disable: 4503) // WARNING: decorated name length exceeded, name was truncated
14 #endif
15
16 #include "Unicode.h"
17 #include "MathLib.h" // for SIGN
18 #include "StringUtils.h"
19
20
21 // Table used for removing diacritics and other backward incompatible characters
22 ConversionItem tConversionTable[] = {
23 { 0x84, {0xE2, 0x80, 0x80, 0x00}, '"'},
24 { 0x93, {0xE2, 0x80, 0x80, 0x80}, '"'},
25 { 0x96, {0xC2, 0x96, 0x00, 0x00}, '-'},
26 { 0xA0, {0xC2, 0xA0, 0x00, 0x00}, ' '},
27 { 0xA1, {0xC2, 0xA1, 0x00, 0x00}, '!'},
28 { 0xA2, {0xC2, 0xA2, 0x00, 0x00}, 'c'},
29 { 0xA3, {0xC2, 0xA3, 0x00, 0x00}, 'L'},
30 { 0xA4, {0xC2, 0xA4, 0x00, 0x00}, 'o'},
31 { 0xA5, {0xC2, 0xA5, 0x00, 0x00}, 'Y'},
32 { 0xA6, {0xC2, 0xA6, 0x00, 0x00}, '|'},
33 { 0xA7, {0xC2, 0xA7, 0x00, 0x00}, '$'},
34 { 0xA8, {0xC2, 0xA8, 0x00, 0x00}, ' '},
35 { 0xA9, {0xC2, 0xA9, 0x00, 0x00}, 'c'},
36 { 0xAA, {0xC2, 0xAA, 0x00, 0x00}, 'a'},
37 { 0xAB, {0xC2, 0xAB, 0x00, 0x00}, '<'},
38 { 0xAC, {0xC2, 0xAC, 0x00, 0x00}, '-'},
39 { 0xAD, {0xC2, 0xAD, 0x00, 0x00}, '-'},
40 { 0xAE, {0xC2, 0xAE, 0x00, 0x00}, 'r'},
41 { 0xAF, {0xC2, 0xAF, 0x00, 0x00}, '-'},
42 { 0xB0, {0xC2, 0xB0, 0x00, 0x00}, '*'},
43 { 0xB1, {0xC2, 0xB1, 0x00, 0x00}, '+'},
44 { 0xB2, {0xC2, 0xB2, 0x00, 0x00}, '2'},
45 { 0xB3, {0xC2, 0xB3, 0x00, 0x00}, '3'},
46 { 0xB4, {0xC2, 0xB4, 0x00, 0x00}, ' '},
47 { 0xB5, {0xC2, 0xB5, 0x00, 0x00}, 'u'},
48 { 0xB6, {0xC2, 0xB6, 0x00, 0x00}, 'P'},
49 { 0xB7, {0xC2, 0xB7, 0x00, 0x00}, '.'},
50 { 0xB8, {0xC2, 0xB8, 0x00, 0x00}, ','},
51 { 0xB9, {0xC2, 0xB9, 0x00, 0x00}, '1'},
52 { 0xBA, {0xC2, 0xBA, 0x00, 0x00}, '0'},
53 { 0xBB, {0xC2, 0xBB, 0x00, 0x00}, '>'},
54 { 0xBC, {0xC2, 0xBC, 0x00, 0x00}, '4'},
55 { 0xBD, {0xC2, 0xBD, 0x00, 0x00}, '2'},
56 { 0xBE, {0xC2, 0xBE, 0x00, 0x00}, '4'},
57 { 0xBF, {0xC2, 0xBF, 0x00, 0x00}, '?'},
58 { 0xC0, {0xC3, 0x80, 0x00, 0x00}, 'A'},
59 { 0xC1, {0xC3, 0x81, 0x00, 0x00}, 'A'},
60 { 0xC2, {0xC3, 0x82, 0x00, 0x00}, 'A'},
61 { 0xC3, {0xC3, 0x83, 0x00, 0x00}, 'A'},
62 { 0xC4, {0xC3, 0x84, 0x00, 0x00}, 'A'},
63 { 0xC5, {0xC3, 0x85, 0x00, 0x00}, 'A'},
64 { 0xC6, {0xC3, 0x86, 0x00, 0x00}, 'A'},
65 { 0xC7, {0xC3, 0x87, 0x00, 0x00}, 'C'},
66 { 0xC8, {0xC3, 0x88, 0x00, 0x00}, 'E'},
67 { 0xC9, {0xC3, 0x89, 0x00, 0x00}, 'E'},
68 { 0xCA, {0xC3, 0x8A, 0x00, 0x00}, 'E'},
69 { 0xCB, {0xC3, 0x8B, 0x00, 0x00}, 'E'},
70 { 0xCC, {0xC3, 0x8C, 0x00, 0x00}, 'I'},
71 { 0xCD, {0xC3, 0x8D, 0x00, 0x00}, 'I'},
72 { 0xCE, {0xC3, 0x8E, 0x00, 0x00}, 'I'},
73 { 0xCF, {0xC3, 0x8F, 0x00, 0x00}, 'I'},
74 { 0xD0, {0xC3, 0x90, 0x00, 0x00}, 'D'},
75 { 0xD1, {0xC3, 0x91, 0x00, 0x00}, 'N'},
76 { 0xD2, {0xC3, 0x92, 0x00, 0x00}, 'O'},
77 { 0xD3, {0xC3, 0x93, 0x00, 0x00}, 'O'},
78 { 0xD4, {0xC3, 0x94, 0x00, 0x00}, 'O'},
79 { 0xD5, {0xC3, 0x95, 0x00, 0x00}, 'O'},
80 { 0xD6, {0xC3, 0x96, 0x00, 0x00}, 'O'},
81 { 0xD7, {0xC3, 0x97, 0x00, 0x00}, 'X'},
82 { 0xD8, {0xC3, 0x98, 0x00, 0x00}, 'O'},
83 { 0xD9, {0xC3, 0x99, 0x00, 0x00}, 'U'},
84 { 0xDA, {0xC3, 0x9A, 0x00, 0x00}, 'U'},
85 { 0xDB, {0xC3, 0x9B, 0x00, 0x00}, 'U'},
86 { 0xDC, {0xC3, 0x9C, 0x00, 0x00}, 'U'},
87 { 0xDD, {0xC3, 0x9D, 0x00, 0x00}, 'Y'},
88 { 0xDE, {0xC3, 0x9E, 0x00, 0x00}, 'b'},
89 { 0xDF, {0xC3, 0x9F, 0x00, 0x00}, 'S'},
90 { 0xE0, {0xC3, 0xA0, 0x00, 0x00}, 'a'},
91 { 0xE1, {0xC3, 0xA1, 0x00, 0x00}, 'a'},
92 { 0xE2, {0xC3, 0xA2, 0x00, 0x00}, 'a'},
93 { 0xE3, {0xC3, 0xA3, 0x00, 0x00}, 'a'},
94 { 0xE4, {0xC3, 0xA4, 0x00, 0x00}, 'a'},
95 { 0xE5, {0xC3, 0xA5, 0x00, 0x00}, 'a'},
96 { 0xE6, {0xC3, 0xA6, 0x00, 0x00}, 'a'},
97 { 0xE7, {0xC3, 0xA7, 0x00, 0x00}, 'c'},
98 { 0xE8, {0xC3, 0xA8, 0x00, 0x00}, 'e'},
99 { 0xE9, {0xC3, 0xA9, 0x00, 0x00}, 'e'},
100 { 0xEA, {0xC3, 0xAA, 0x00, 0x00}, 'e'},
101 { 0xEB, {0xC3, 0xAB, 0x00, 0x00}, 'e'},
102 { 0xEC, {0xC3, 0xAC, 0x00, 0x00}, 'i'},
103 { 0xED, {0xC3, 0xAD, 0x00, 0x00}, 'i'},
104 { 0xEE, {0xC3, 0xAE, 0x00, 0x00}, 'i'},
105 { 0xEF, {0xC3, 0xAF, 0x00, 0x00}, 'i'},
106 { 0xF0, {0xC3, 0xB0, 0x00, 0x00}, 'd'},
107 { 0xF1, {0xC3, 0xB1, 0x00, 0x00}, 'n'},
108 { 0xF2, {0xC3, 0xB2, 0x00, 0x00}, 'o'},
109 { 0xF3, {0xC3, 0xB3, 0x00, 0x00}, 'o'},
110 { 0xF4, {0xC3, 0xB4, 0x00, 0x00}, 'o'},
111 { 0xF5, {0xC3, 0xB5, 0x00, 0x00}, 'o'},
112 { 0xF6, {0xC3, 0xB6, 0x00, 0x00}, 'o'},
113 { 0xF7, {0xC3, 0xB7, 0x00, 0x00}, '/'},
114 { 0xF8, {0xC3, 0xB8, 0x00, 0x00}, 'o'},
115 { 0xF9, {0xC3, 0xB9, 0x00, 0x00}, 'u'},
116 { 0xFA, {0xC3, 0xBA, 0x00, 0x00}, 'u'},
117 { 0xFB, {0xC3, 0xBB, 0x00, 0x00}, 'u'},
118 { 0xFC, {0xC3, 0xBC, 0x00, 0x00}, 'u'},
119 { 0xFD, {0xC3, 0xBD, 0x00, 0x00}, 'y'},
120 { 0xFE, {0xC3, 0xBE, 0x00, 0x00}, 'b'},
121 { 0xFF, {0xC3, 0xBF, 0x00, 0x00}, 'y'},
122
123 { 0x0100, {0xC4, 0x80, 0x00, 0x00}, 'A'},
124 { 0x0101, {0xC4, 0x81, 0x00, 0x00}, 'a'},
125 { 0x0102, {0xC4, 0x82, 0x00, 0x00}, 'A'},
126 { 0x0103, {0xC4, 0x83, 0x00, 0x00}, 'a'},
127 { 0x0104, {0xC4, 0x84, 0x00, 0x00}, 'A'},
128 { 0x0105, {0xC4, 0x85, 0x00, 0x00}, 'a'},
129 { 0x0106, {0xC4, 0x86, 0x00, 0x00}, 'C'},
130 { 0x0107, {0xC4, 0x87, 0x00, 0x00}, 'c'},
131 { 0x0108, {0xC4, 0x88, 0x00, 0x00}, 'C'},
132 { 0x0109, {0xC4, 0x89, 0x00, 0x00}, 'c'},
133 { 0x010A, {0xC4, 0x8A, 0x00, 0x00}, 'C'},
134 { 0x010B, {0xC4, 0x8B, 0x00, 0x00}, 'c'},
135 { 0x010C, {0xC4, 0x8C, 0x00, 0x00}, 'C'},
136 { 0x010D, {0xC4, 0x8D, 0x00, 0x00}, 'c'},
137 { 0x010E, {0xC4, 0x8E, 0x00, 0x00}, 'D'},
138 { 0x010F, {0xC4, 0x8F, 0x00, 0x00}, 'd'},
139 { 0x0110, {0xC4, 0x90, 0x00, 0x00}, 'D'},
140 { 0x0111, {0xC4, 0x91, 0x00, 0x00}, 'd'},
141 { 0x0112, {0xC4, 0x92, 0x00, 0x00}, 'E'},
142 { 0x0113, {0xC4, 0x93, 0x00, 0x00}, 'e'},
143 { 0x0114, {0xC4, 0x94, 0x00, 0x00}, 'E'},
144 { 0x0115, {0xC4, 0x95, 0x00, 0x00}, 'e'},
145 { 0x0116, {0xC4, 0x96, 0x00, 0x00}, 'E'},
146 { 0x0117, {0xC4, 0x97, 0x00, 0x00}, 'e'},
147 { 0x0118, {0xC4, 0x98, 0x00, 0x00}, 'E'},
148 { 0x0119, {0xC4, 0x99, 0x00, 0x00}, 'e'},
149 { 0x011A, {0xC4, 0x9A, 0x00, 0x00}, 'E'},
150 { 0x011B, {0xC4, 0x9B, 0x00, 0x00}, 'e'},
151 { 0x011C, {0xC4, 0x9C, 0x00, 0x00}, 'G'},
152 { 0x011D, {0xC4, 0x9D, 0x00, 0x00}, 'g'},
153 { 0x011E, {0xC4, 0x9E, 0x00, 0x00}, 'G'},
154 { 0x011F, {0xC4, 0x9F, 0x00, 0x00}, 'g'},
155 { 0x0120, {0xC4, 0xA0, 0x00, 0x00}, 'G'},
156 { 0x0121, {0xC4, 0xA1, 0x00, 0x00}, 'g'},
157 { 0x0122, {0xC4, 0xA2, 0x00, 0x00}, 'G'},
158 { 0x0123, {0xC4, 0xA3, 0x00, 0x00}, 'g'},
159 { 0x0124, {0xC4, 0xA4, 0x00, 0x00}, 'H'},
160 { 0x0125, {0xC4, 0xA5, 0x00, 0x00}, 'h'},
161 { 0x0126, {0xC4, 0xA6, 0x00, 0x00}, 'H'},
162 { 0x0127, {0xC4, 0xA7, 0x00, 0x00}, 'h'},
163 { 0x0128, {0xC4, 0xA8, 0x00, 0x00}, 'I'},
164 { 0x0129, {0xC4, 0xA9, 0x00, 0x00}, 'i'},
165 { 0x012A, {0xC4, 0xAA, 0x00, 0x00}, 'I'},
166 { 0x012B, {0xC4, 0xAB, 0x00, 0x00}, 'i'},
167 { 0x012C, {0xC4, 0xAC, 0x00, 0x00}, 'I'},
168 { 0x012D, {0xC4, 0xAD, 0x00, 0x00}, 'i'},
169 { 0x012E, {0xC4, 0xAE, 0x00, 0x00}, 'I'},
170 { 0x012F, {0xC4, 0xAF, 0x00, 0x00}, 'i'},
171 { 0x0130, {0xC4, 0xB0, 0x00, 0x00}, 'I'},
172 { 0x0131, {0xC4, 0xB1, 0x00, 0x00}, 'i'},
173 { 0x0132, {0xC4, 0xB2, 0x00, 0x00}, 'I'},
174 { 0x0133, {0xC4, 0xB3, 0x00, 0x00}, 'i'},
175 { 0x0134, {0xC4, 0xB4, 0x00, 0x00}, 'J'},
176 { 0x0135, {0xC4, 0xB5, 0x00, 0x00}, 'j'},
177 { 0x0136, {0xC4, 0xB6, 0x00, 0x00}, 'K'},
178 { 0x0137, {0xC4, 0xB7, 0x00, 0x00}, 'k'},
179 { 0x0138, {0xC4, 0xB8, 0x00, 0x00}, 'k'},
180 { 0x0139, {0xC4, 0xB9, 0x00, 0x00}, 'L'},
181 { 0x013A, {0xC4, 0xBA, 0x00, 0x00}, 'l'},
182 { 0x013B, {0xC4, 0xBB, 0x00, 0x00}, 'L'},
183 { 0x013C, {0xC4, 0xBC, 0x00, 0x00}, 'l'},
184 { 0x013D, {0xC4, 0xBD, 0x00, 0x00}, 'L'},
185 { 0x013E, {0xC4, 0xBE, 0x00, 0x00}, 'l'},
186 { 0x013F, {0xC4, 0xBF, 0x00, 0x00}, 'L'},
187
188 { 0x0140, {0xC5, 0x80, 0x00, 0x00}, 'l'},
189 { 0x0141, {0xC5, 0x81, 0x00, 0x00}, 'L'},
190 { 0x0142, {0xC5, 0x82, 0x00, 0x00}, 'l'},
191 { 0x0143, {0xC5, 0x83, 0x00, 0x00}, 'N'},
192 { 0x0144, {0xC5, 0x84, 0x00, 0x00}, 'n'},
193 { 0x0145, {0xC5, 0x85, 0x00, 0x00}, 'N'},
194 { 0x0146, {0xC5, 0x86, 0x00, 0x00}, 'n'},
195 { 0x0147, {0xC5, 0x87, 0x00, 0x00}, 'N'},
196 { 0x0148, {0xC5, 0x88, 0x00, 0x00}, 'n'},
197 { 0x0149, {0xC5, 0x89, 0x00, 0x00}, 'n'},
198 { 0x014A, {0xC5, 0x8A, 0x00, 0x00}, 'N'},
199 { 0x014B, {0xC5, 0x8B, 0x00, 0x00}, 'n'},
200 { 0x014C, {0xC5, 0x8C, 0x00, 0x00}, 'O'},
201 { 0x014D, {0xC5, 0x8D, 0x00, 0x00}, 'o'},
202 { 0x014E, {0xC5, 0x8E, 0x00, 0x00}, 'O'},
203 { 0x014F, {0xC5, 0x8F, 0x00, 0x00}, 'o'},
204 { 0x0150, {0xC5, 0x90, 0x00, 0x00}, 'O'},
205 { 0x0151, {0xC5, 0x91, 0x00, 0x00}, 'o'},
206 { 0x0152, {0xC5, 0x92, 0x00, 0x00}, 'O'},
207 { 0x0153, {0xC5, 0x93, 0x00, 0x00}, 'o'},
208 { 0x0154, {0xC5, 0x94, 0x00, 0x00}, 'R'},
209 { 0x0155, {0xC5, 0x95, 0x00, 0x00}, 'r'},
210 { 0x0156, {0xC5, 0x96, 0x00, 0x00}, 'R'},
211 { 0x0157, {0xC5, 0x97, 0x00, 0x00}, 'r'},
212 { 0x0158, {0xC5, 0x98, 0x00, 0x00}, 'R'},
213 { 0x0159, {0xC5, 0x99, 0x00, 0x00}, 'r'},
214 { 0x015A, {0xC5, 0x9A, 0x00, 0x00}, 'S'},
215 { 0x015B, {0xC5, 0x9B, 0x00, 0x00}, 's'},
216 { 0x015C, {0xC5, 0x9C, 0x00, 0x00}, 'S'},
217 { 0x015D, {0xC5, 0x9D, 0x00, 0x00}, 's'},
218 { 0x015E, {0xC5, 0x9E, 0x00, 0x00}, 'S'},
219 { 0x015F, {0xC5, 0x9F, 0x00, 0x00}, 's'},
220 { 0x0160, {0xC5, 0xA0, 0x00, 0x00}, 'S'},
221 { 0x0161, {0xC5, 0xA1, 0x00, 0x00}, 's'},
222 { 0x0162, {0xC5, 0xA2, 0x00, 0x00}, 'T'},
223 { 0x0163, {0xC5, 0xA3, 0x00, 0x00}, 't'},
224 { 0x0164, {0xC5, 0xA4, 0x00, 0x00}, 'T'},
225 { 0x0165, {0xC5, 0xA5, 0x00, 0x00}, 't'},
226 { 0x0166, {0xC5, 0xA6, 0x00, 0x00}, 'T'},
227 { 0x0167, {0xC5, 0xA7, 0x00, 0x00}, 't'},
228 { 0x0168, {0xC5, 0xA8, 0x00, 0x00}, 'U'},
229 { 0x0169, {0xC5, 0xA9, 0x00, 0x00}, 'u'},
230 { 0x016A, {0xC5, 0xAA, 0x00, 0x00}, 'U'},
231 { 0x016B, {0xC5, 0xAB, 0x00, 0x00}, 'u'},
232 { 0x016C, {0xC5, 0xAC, 0x00, 0x00}, 'U'},
233 { 0x016D, {0xC5, 0xAD, 0x00, 0x00}, 'u'},
234 { 0x016E, {0xC5, 0xAE, 0x00, 0x00}, 'U'},
235 { 0x016F, {0xC5, 0xAF, 0x00, 0x00}, 'u'},
236 { 0x0170, {0xC5, 0xB0, 0x00, 0x00}, 'U'},
237 { 0x0171, {0xC5, 0xB1, 0x00, 0x00}, 'u'},
238 { 0x0172, {0xC5, 0xB2, 0x00, 0x00}, 'U'},
239 { 0x0173, {0xC5, 0xB3, 0x00, 0x00}, 'u'},
240 { 0x0174, {0xC5, 0xB4, 0x00, 0x00}, 'W'},
241 { 0x0175, {0xC5, 0xB5, 0x00, 0x00}, 'w'},
242 { 0x0176, {0xC5, 0xB6, 0x00, 0x00}, 'Y'},
243 { 0x0177, {0xC5, 0xB7, 0x00, 0x00}, 'y'},
244 { 0x0178, {0xC5, 0xB8, 0x00, 0x00}, 'Y'},
245 { 0x0179, {0xC5, 0xB9, 0x00, 0x00}, 'Z'},
246 { 0x017A, {0xC5, 0xBA, 0x00, 0x00}, 'z'},
247 { 0x017B, {0xC5, 0xBB, 0x00, 0x00}, 'Z'},
248 { 0x017C, {0xC5, 0xBC, 0x00, 0x00}, 'z'},
249 { 0x017D, {0xC5, 0xBD, 0x00, 0x00}, 'Z'},
250 { 0x017E, {0xC5, 0xBE, 0x00, 0x00}, 'z'},
251 { 0x017F, {0xC5, 0xBF, 0x00, 0x00}, 'S'},
252
253 { 0x018F, {0xC6, 0x8F, 0x00, 0x00}, 'e'},
254 { 0x0192, {0xC6, 0x92, 0x00, 0x00}, 'f'},
255 { 0x01A0, {0xC6, 0xA0, 0x00, 0x00}, 'O'},
256 { 0x01A1, {0xC6, 0xA1, 0x00, 0x00}, 'o'},
257 { 0x01AF, {0xC6, 0xAF, 0x00, 0x00}, 'U'},
258 { 0x01B0, {0xC6, 0xB0, 0x00, 0x00}, 'u'},
259
260 { 0x01CD, {0xC7, 0x8D, 0x00, 0x00}, 'A'},
261 { 0x01CE, {0xC7, 0x8E, 0x00, 0x00}, 'a'},
262 { 0x01CF, {0xC7, 0x8F, 0x00, 0x00}, 'I'},
263 { 0x01D0, {0xC7, 0x90, 0x00, 0x00}, 'i'},
264 { 0x01D1, {0xC7, 0x91, 0x00, 0x00}, 'O'},
265 { 0x01D2, {0xC7, 0x92, 0x00, 0x00}, 'o'},
266 { 0x01D3, {0xC7, 0x93, 0x00, 0x00}, 'U'},
267 { 0x01D4, {0xC7, 0x94, 0x00, 0x00}, 'u'},
268 { 0x01D5, {0xC7, 0x95, 0x00, 0x00}, 'U'},
269 { 0x01D6, {0xC7, 0x96, 0x00, 0x00}, 'u'},
270 { 0x01D7, {0xC7, 0x97, 0x00, 0x00}, 'U'},
271 { 0x01D8, {0xC7, 0x98, 0x00, 0x00}, 'u'},
272 { 0x01D9, {0xC7, 0x99, 0x00, 0x00}, 'U'},
273 { 0x01DA, {0xC7, 0x9A, 0x00, 0x00}, 'u'},
274 { 0x01DB, {0xC7, 0x9B, 0x00, 0x00}, 'U'},
275 { 0x01DC, {0xC7, 0x9C, 0x00, 0x00}, 'u'},
276
277 { 0x01FA, {0xC7, 0xBA, 0x00, 0x00}, 'A'},
278 { 0x01FB, {0xC7, 0xBB, 0x00, 0x00}, 'a'},
279 { 0x01FC, {0xC7, 0xBC, 0x00, 0x00}, 'A'},
280 { 0x01FD, {0xC7, 0xBD, 0x00, 0x00}, 'a'},
281 { 0x01FE, {0xC7, 0xBE, 0x00, 0x00}, 'O'},
282 { 0x01FF, {0xC7, 0xBF, 0x00, 0x00}, 'o'},
283
284 { 0x0259, {0xC9, 0x99, 0x00, 0x00}, 'e'},
285
286 { 0x02C9, {0xCB, 0x89, 0x00, 0x00}, '-'},
287 { 0x02DA, {0xCB, 0x9A, 0x00, 0x00}, '*'},
288 { 0x02DC, {0xCB, 0x9C, 0x00, 0x00}, '\"'},
289 { 0x02DD, {0xCB, 0x9D, 0x00, 0x00}, '\"'},
290
291 { 0x0300, {0xCC, 0x80, 0x00, 0x00}, '\''},
292 { 0x0301, {0xCC, 0x81, 0x00, 0x00}, '\''},
293 { 0x0303, {0xCC, 0x83, 0x00, 0x00}, '\"'},
294 { 0x0323, {0xCC, 0xA3, 0x00, 0x00}, '.'},
295
296 { 0x037E, {0xCD, 0xBE, 0x00, 0x00}, ';'},
297
298 { 0x0384, {0xCE, 0x84, 0x00, 0x00}, '\''},
299 { 0x0386, {0xCE, 0x86, 0x00, 0x00}, 'A'},
300 { 0x0387, {0xCE, 0x87, 0x00, 0x00}, '.'},
301 { 0x0388, {0xCE, 0x88, 0x00, 0x00}, 'E'},
302 { 0x0389, {0xCE, 0x89, 0x00, 0x00}, 'G'},
303 { 0x038A, {0xCE, 0x8A, 0x00, 0x00}, 'I'},
304 { 0x038C, {0xCE, 0x8C, 0x00, 0x00}, 'O'},
305 { 0x038E, {0xCE, 0x8E, 0x00, 0x00}, 'Y'},
306 { 0x038F, {0xCE, 0x8F, 0x00, 0x00}, 'O'},
307 { 0x0390, {0xCE, 0x90, 0x00, 0x00}, 'i'},
308
309 { 0x0391, {0xCE, 0x91, 0x00, 0x00}, 'A'},
310 { 0x0392, {0xCE, 0x92, 0x00, 0x00}, 'B'},
311 { 0x0393, {0xCE, 0x93, 0x00, 0x00}, 'G'},
312 { 0x0394, {0xCE, 0x94, 0x00, 0x00}, 'D'},
313 { 0x0395, {0xCE, 0x95, 0x00, 0x00}, 'E'},
314 { 0x0396, {0xCE, 0x96, 0x00, 0x00}, 'Z'},
315 { 0x0397, {0xCE, 0x97, 0x00, 0x00}, 'H'},
316 { 0x0398, {0xCE, 0x98, 0x00, 0x00}, 'T'},
317 { 0x0399, {0xCE, 0x99, 0x00, 0x00}, 'I'},
318 { 0x039A, {0xCE, 0x9A, 0x00, 0x00}, 'K'},
319 { 0x039B, {0xCE, 0x9B, 0x00, 0x00}, 'L'},
320 { 0x039C, {0xCE, 0x9C, 0x00, 0x00}, 'M'},
321 { 0x039D, {0xCE, 0x9D, 0x00, 0x00}, 'N'},
322 { 0x039E, {0xCE, 0x9E, 0x00, 0x00}, 'X'},
323 { 0x039F, {0xCE, 0x9F, 0x00, 0x00}, 'O'},
324 { 0x03A0, {0xCE, 0xA0, 0x00, 0x00}, 'P'},
325 { 0x03A1, {0xCE, 0xA1, 0x00, 0x00}, 'R'},
326 { 0x03A3, {0xCE, 0xA3, 0x00, 0x00}, 'S'},
327 { 0x03A4, {0xCE, 0xA4, 0x00, 0x00}, 'T'},
328 { 0x03A5, {0xCE, 0xA5, 0x00, 0x00}, 'Y'},
329 { 0x03A6, {0xCE, 0xA6, 0x00, 0x00}, 'F'},
330 { 0x03A7, {0xCE, 0xA7, 0x00, 0x00}, 'C'},
331 { 0x03A8, {0xCE, 0xA8, 0x00, 0x00}, 'P'},
332 { 0x03A9, {0xCE, 0xA9, 0x00, 0x00}, 'W'},
333 { 0x03AA, {0xCE, 0xAA, 0x00, 0x00}, 'I'},
334 { 0x03AB, {0xCE, 0xAB, 0x00, 0x00}, 'Y'},
335 { 0x03AC, {0xCE, 0xAC, 0x00, 0x00}, 'a'},
336 { 0x03AD, {0xCE, 0xAD, 0x00, 0x00}, 'e'},
337 { 0x03AE, {0xCE, 0xAE, 0x00, 0x00}, 'h'},
338 { 0x03AF, {0xCE, 0xAF, 0x00, 0x00}, 'i'},
339 { 0x03B0, {0xCE, 0xB0, 0x00, 0x00}, 'y'},
340 { 0x03B1, {0xCE, 0xB1, 0x00, 0x00}, 'I'},
341 { 0x03B2, {0xCE, 0xB2, 0x00, 0x00}, 'a'},
342 { 0x03B3, {0xCE, 0xB3, 0x00, 0x00}, 'b'},
343 { 0x03B4, {0xCE, 0xB4, 0x00, 0x00}, 'g'},
344 { 0x03B5, {0xCE, 0xB5, 0x00, 0x00}, 'd'},
345 { 0x03B6, {0xCE, 0xB6, 0x00, 0x00}, 'e'},
346 { 0x03B7, {0xCE, 0xB7, 0x00, 0x00}, 'z'},
347 { 0x03B8, {0xCE, 0xB8, 0x00, 0x00}, 'h'},
348 { 0x03B9, {0xCE, 0xB9, 0x00, 0x00}, 't'},
349 { 0x03BA, {0xCE, 0xBA, 0x00, 0x00}, 'i'},
350 { 0x03BB, {0xCE, 0xBB, 0x00, 0x00}, 'k'},
351 { 0x03BC, {0xCE, 0xBC, 0x00, 0x00}, 'l'},
352 { 0x03BD, {0xCE, 0xBD, 0x00, 0x00}, 'm'},
353 { 0x03BE, {0xCE, 0xBE, 0x00, 0x00}, 'n'},
354 { 0x03BF, {0xCE, 0xBF, 0x00, 0x00}, 'x'},
355
356 { 0x03C0, {0xCF, 0x80, 0x00, 0x00}, 'o'},
357 { 0x03C1, {0xCF, 0x81, 0x00, 0x00}, 'p'},
358 { 0x03C2, {0xCF, 0x82, 0x00, 0x00}, 'r'},
359 { 0x03C3, {0xCF, 0x83, 0x00, 0x00}, 's'},
360 { 0x03C4, {0xCF, 0x84, 0x00, 0x00}, 't'},
361 { 0x03C5, {0xCF, 0x85, 0x00, 0x00}, 'y'},
362 { 0x03C6, {0xCF, 0x86, 0x00, 0x00}, 'f'},
363 { 0x03C7, {0xCF, 0x87, 0x00, 0x00}, 'c'},
364 { 0x03C8, {0xCF, 0x88, 0x00, 0x00}, 'p'},
365 { 0x03C9, {0xCF, 0x89, 0x00, 0x00}, 'w'},
366 { 0x03CA, {0xCF, 0x8A, 0x00, 0x00}, 'i'},
367 { 0x03CB, {0xCF, 0x8B, 0x00, 0x00}, 'y'},
368 { 0x03CC, {0xCF, 0x8C, 0x00, 0x00}, 'o'},
369 { 0x03CD, {0xCF, 0x8D, 0x00, 0x00}, 'y'},
370 { 0x03CE, {0xCF, 0x8E, 0x00, 0x00}, 'w'},
371
372 // TODO: finish cyrilic
373 { 0x0401, {0xD0, 0x81, 0x00, 0x00}, 'E'},
374 { 0x0404, {0xD0, 0x84, 0x00, 0x00}, 'E'},
375 { 0x0405, {0xD0, 0x85, 0x00, 0x00}, 'S'},
376 { 0x0406, {0xD0, 0x86, 0x00, 0x00}, 'I'},
377 { 0x0407, {0xD0, 0x87, 0x00, 0x00}, 'I'},
378 { 0x0408, {0xD0, 0x88, 0x00, 0x00}, 'J'},
379 { 0x040C, {0xD0, 0x8C, 0x00, 0x00}, 'K'},
380 { 0x040E, {0xD0, 0x8E, 0x00, 0x00}, 'Y'},
381 { 0x0425, {0xD0, 0xA5, 0x00, 0x00}, 'X'},
382 { 0x042C, {0xD0, 0xAC, 0x00, 0x00}, 'b'},
383 { 0x0430, {0xD0, 0xB0, 0x00, 0x00}, 'a'},
384 { 0x0432, {0xD0, 0xB2, 0x00, 0x00}, 'v'},
385 { 0x0435, {0xD0, 0xB5, 0x00, 0x00}, 'e'},
386 { 0x043A, {0xD0, 0xBA, 0x00, 0x00}, 'k'},
387 { 0x043C, {0xD0, 0xBC, 0x00, 0x00}, 'm'},
388
389 // TODO: hebrew
390
391 { 0x1E80, {0xE1, 0xBA, 0x80, 0x00}, 'W'},
392 { 0x1E81, {0xE1, 0xBA, 0x81, 0x00}, 'w'},
393 { 0x1E82, {0xE1, 0xBA, 0x82, 0x00}, 'W'},
394 { 0x1E83, {0xE1, 0xBA, 0x83, 0x00}, 'w'},
395 { 0x1E84, {0xE1, 0xBA, 0x84, 0x00}, 'W'},
396 { 0x1E85, {0xE1, 0xBA, 0x85, 0x00}, 'w'},
397 { 0x1EA0, {0xE1, 0xBA, 0xA0, 0x00}, 'A'},
398 { 0x1EA1, {0xE1, 0xBA, 0xA1, 0x00}, 'a'},
399 { 0x1EA2, {0xE1, 0xBA, 0xA2, 0x00}, 'A'},
400 { 0x1EA3, {0xE1, 0xBA, 0xA3, 0x00}, 'a'},
401 { 0x1EA4, {0xE1, 0xBA, 0xA4, 0x00}, 'A'},
402 { 0x1EA5, {0xE1, 0xBA, 0xA5, 0x00}, 'a'},
403 { 0x1EA6, {0xE1, 0xBA, 0xA6, 0x00}, 'A'},
404 { 0x1EA7, {0xE1, 0xBA, 0xA7, 0x00}, 'a'},
405 { 0x1EA8, {0xE1, 0xBA, 0xA8, 0x00}, 'A'},
406 { 0x1EA9, {0xE1, 0xBA, 0xA9, 0x00}, 'a'},
407 { 0x1EAA, {0xE1, 0xBA, 0xAA, 0x00}, 'A'},
408 { 0x1EAB, {0xE1, 0xBA, 0xAB, 0x00}, 'a'},
409 { 0x1EAC, {0xE1, 0xBA, 0xAC, 0x00}, 'A'},
410 { 0x1EAD, {0xE1, 0xBA, 0xAD, 0x00}, 'a'},
411 { 0x1EAE, {0xE1, 0xBA, 0xAE, 0x00}, 'A'},
412 { 0x1EAF, {0xE1, 0xBA, 0xAF, 0x00}, 'a'},
413 { 0x1EB0, {0xE1, 0xBA, 0xB0, 0x00}, 'A'},
414 { 0x1EB1, {0xE1, 0xBA, 0xB1, 0x00}, 'a'},
415 { 0x1EB2, {0xE1, 0xBA, 0xB2, 0x00}, 'A'},
416 { 0x1EB3, {0xE1, 0xBA, 0xB3, 0x00}, 'a'},
417 { 0x1EB4, {0xE1, 0xBA, 0xB4, 0x00}, 'A'},
418 { 0x1EB5, {0xE1, 0xBA, 0xB5, 0x00}, 'a'},
419 { 0x1EB6, {0xE1, 0xBA, 0xB6, 0x00}, 'A'},
420 { 0x1EB7, {0xE1, 0xBA, 0xB7, 0x00}, 'a'},
421 { 0x1EB8, {0xE1, 0xBA, 0xB8, 0x00}, 'E'},
422 { 0x1EB8, {0xE1, 0xBA, 0xB9, 0x00}, 'e'},
423 { 0x1EBA, {0xE1, 0xBA, 0xBA, 0x00}, 'E'},
424 { 0x1EBB, {0xE1, 0xBA, 0xBB, 0x00}, 'e'},
425 { 0x1EBC, {0xE1, 0xBA, 0xBC, 0x00}, 'E'},
426 { 0x1EBD, {0xE1, 0xBA, 0xBD, 0x00}, 'e'},
427 { 0x1EBE, {0xE1, 0xBA, 0xBE, 0x00}, 'E'},
428 { 0x1EBF, {0xE1, 0xBA, 0xBF, 0x00}, 'e'},
429
430 { 0x1EC0, {0xE1, 0xBB, 0x80, 0x00}, 'E'},
431 { 0x1EC1, {0xE1, 0xBB, 0x81, 0x00}, 'e'},
432 { 0x1EC2, {0xE1, 0xBB, 0x82, 0x00}, 'E'},
433 { 0x1EC3, {0xE1, 0xBB, 0x83, 0x00}, 'e'},
434 { 0x1EC4, {0xE1, 0xBB, 0x84, 0x00}, 'E'},
435 { 0x1EC5, {0xE1, 0xBB, 0x85, 0x00}, 'e'},
436 { 0x1EC6, {0xE1, 0xBB, 0x86, 0x00}, 'E'},
437 { 0x1EC7, {0xE1, 0xBB, 0x87, 0x00}, 'e'},
438 { 0x1EC8, {0xE1, 0xBB, 0x88, 0x00}, 'I'},
439 { 0x1EC9, {0xE1, 0xBB, 0x89, 0x00}, 'i'},
440 { 0x1ECA, {0xE1, 0xBB, 0x8A, 0x00}, 'I'},
441 { 0x1ECB, {0xE1, 0xBB, 0x8B, 0x00}, 'i'},
442 { 0x1ECC, {0xE1, 0xBB, 0x8C, 0x00}, 'O'},
443 { 0x1ECD, {0xE1, 0xBB, 0x8D, 0x00}, 'o'},
444 { 0x1ECE, {0xE1, 0xBB, 0x8E, 0x00}, 'O'},
445 { 0x1ECF, {0xE1, 0xBB, 0x8F, 0x00}, 'o'},
446 { 0x1ED0, {0xE1, 0xBB, 0x90, 0x00}, 'O'},
447 { 0x1ED1, {0xE1, 0xBB, 0x91, 0x00}, 'o'},
448 { 0x1ED2, {0xE1, 0xBB, 0x92, 0x00}, 'O'},
449 { 0x1ED3, {0xE1, 0xBB, 0x93, 0x00}, 'o'},
450 { 0x1ED4, {0xE1, 0xBB, 0x94, 0x00}, 'O'},
451 { 0x1ED5, {0xE1, 0xBB, 0x95, 0x00}, 'o'},
452 { 0x1ED6, {0xE1, 0xBB, 0x96, 0x00}, 'O'},
453 { 0x1ED7, {0xE1, 0xBB, 0x97, 0x00}, 'o'},
454 { 0x1ED8, {0xE1, 0xBB, 0x98, 0x00}, 'O'},
455 { 0x1ED9, {0xE1, 0xBB, 0x99, 0x00}, 'o'},
456 { 0x1EDA, {0xE1, 0xBB, 0x9A, 0x00}, 'O'},
457 { 0x1EDB, {0xE1, 0xBB, 0x9B, 0x00}, 'o'},
458 { 0x1EDC, {0xE1, 0xBB, 0x9C, 0x00}, 'O'},
459 { 0x1EDD, {0xE1, 0xBB, 0x9D, 0x00}, 'o'},
460 { 0x1EDE, {0xE1, 0xBB, 0x9E, 0x00}, 'O'},
461 { 0x1EDF, {0xE1, 0xBB, 0x9F, 0x00}, 'o'},
462 { 0x1EE0, {0xE1, 0xBB, 0xA0, 0x00}, 'O'},
463 { 0x1EE1, {0xE1, 0xBB, 0xA1, 0x00}, 'o'},
464 { 0x1EE2, {0xE1, 0xBB, 0xA2, 0x00}, 'O'},
465 { 0x1EE3, {0xE1, 0xBB, 0xA3, 0x00}, 'o'},
466 { 0x1EE4, {0xE1, 0xBB, 0xA4, 0x00}, 'U'},
467 { 0x1EE5, {0xE1, 0xBB, 0xA5, 0x00}, 'u'},
468 { 0x1EE6, {0xE1, 0xBB, 0xA6, 0x00}, 'U'},
469 { 0x1EE7, {0xE1, 0xBB, 0xA7, 0x00}, 'u'},
470 { 0x1EE8, {0xE1, 0xBB, 0xA8, 0x00}, 'U'},
471 { 0x1EE9, {0xE1, 0xBB, 0xA9, 0x00}, 'u'},
472 { 0x1EEA, {0xE1, 0xBB, 0xAA, 0x00}, 'U'},
473 { 0x1EEB, {0xE1, 0xBB, 0xAB, 0x00}, 'u'},
474 { 0x1EEC, {0xE1, 0xBB, 0xAC, 0x00}, 'U'},
475 { 0x1EED, {0xE1, 0xBB, 0xAD, 0x00}, 'u'},
476 { 0x1EEE, {0xE1, 0xBB, 0xAE, 0x00}, 'U'},
477 { 0x1EEF, {0xE1, 0xBB, 0xAF, 0x00}, 'u'},
478 { 0x1EF0, {0xE1, 0xBB, 0xB0, 0x00}, 'U'},
479 { 0x1EF1, {0xE1, 0xBB, 0xB1, 0x00}, 'u'},
480 { 0x1EF2, {0xE1, 0xBB, 0xB2, 0x00}, 'Y'},
481 { 0x1EF3, {0xE1, 0xBB, 0xB3, 0x00}, 'y'},
482 { 0x1EF4, {0xE1, 0xBB, 0xB4, 0x00}, 'Y'},
483 { 0x1EF5, {0xE1, 0xBB, 0xB5, 0x00}, 'y'},
484 { 0x1EF6, {0xE1, 0xBB, 0xB6, 0x00}, 'Y'},
485 { 0x1EF7, {0xE1, 0xBB, 0xB7, 0x00}, 'y'},
486 { 0x1EF8, {0xE1, 0xBB, 0xB8, 0x00}, 'Y'},
487 { 0x1EF9, {0xE1, 0xBB, 0xB9, 0x00}, 'y'},
488
489 { 0x2013, {0xE2, 0x80, 0x93, 0x00}, '-'},
490 { 0x2014, {0xE2, 0x80, 0x94, 0x00}, '-'},
491 { 0x2015, {0xE2, 0x80, 0x95, 0x00}, '-'},
492 { 0x2017, {0xE2, 0x80, 0x97, 0x00}, '_'},
493 { 0x2018, {0xE2, 0x80, 0x98, 0x00}, '\''},
494 { 0x2019, {0xE2, 0x80, 0x99, 0x00}, '\''},
495 { 0x201A, {0xE2, 0x80, 0x9A, 0x00}, '\''},
496 { 0x201B, {0xE2, 0x80, 0x9B, 0x00}, '\''},
497 { 0x201C, {0xE2, 0x80, 0x9C, 0x00}, '\"'},
498 { 0x201D, {0xE2, 0x80, 0x9D, 0x00}, '\"'},
499 { 0x201E, {0xE2, 0x80, 0x9E, 0x00}, '\"'},
500 { 0x2020, {0xE2, 0x80, 0xA0, 0x00}, '+'},
501 { 0x2021, {0xE2, 0x80, 0xA1, 0x00}, '+'},
502 { 0x2022, {0xE2, 0x80, 0xA2, 0x00}, '*'}
503
504 // TODO: more :)
505 };
506
507
508
509
510 // grabbed from SDL_ttf (also LGPL)
UNICODE_to_UTF8(unsigned char * utf8,UnicodeChar unicode)511 static void UNICODE_to_UTF8(unsigned char *utf8, UnicodeChar unicode)
512 {
513 int j=0;
514
515 if (unicode < 0x80)
516 {
517 utf8[j] = unicode & 0x7F;
518 }
519 else if (unicode < 0x800)
520 {
521 utf8[j] = 0xC0 | (unicode >> 6);
522 utf8[++j] = 0x80 | (unicode & 0x3F);
523 }
524 else if (unicode < 0x10000)
525 {
526 utf8[j] = 0xE0 | (unicode >> 12);
527 utf8[++j] = 0x80 | ((unicode >> 6) & 0x3F);
528 utf8[++j] = 0x80 | (unicode & 0x3F);
529 }
530 else if (unicode < 0x200000)
531 {
532 utf8[j] = 0xF0 | (unicode >> 18);
533 utf8[++j] = 0x80 | ((unicode >> 12) & 0x3F);
534 utf8[++j] = 0x80 | ((unicode >> 6) & 0x3F);
535 utf8[++j] = 0x80 | (unicode & 0x3F);
536 }
537 else if (unicode < 0x4000000)
538 {
539 utf8[j] = 0xF8 | (unicode >> 24);
540 utf8[++j] = 0x80 | ((unicode >> 18) & 0x3F);
541 utf8[++j] = 0x80 | ((unicode >> 12) & 0x3F);
542 utf8[++j] = 0x80 | ((unicode >> 6) & 0x3F);
543 utf8[++j] = 0x80 | (unicode & 0x3F);
544 }
545 else if (unicode < 0x80000000)
546 {
547 utf8[j] = 0xFC | (unicode >> 30);
548 utf8[++j] = 0x80 | ((unicode >> 24) & 0x3F);
549 utf8[++j] = 0x80 | ((unicode >> 18) & 0x3F);
550 utf8[++j] = 0x80 | ((unicode >> 12) & 0x3F);
551 utf8[++j] = 0x80 | ((unicode >> 6) & 0x3F);
552 utf8[++j] = 0x80 | (unicode & 0x3F);
553 }
554 else
555 utf8[j] = 0;
556
557 utf8[++j] = 0;
558 }
559
560 /////////////////
561 // Convert unicode to UTF8
GetUtf8FromUnicode(UnicodeChar ch)562 std::string GetUtf8FromUnicode(UnicodeChar ch) {
563 if(ch == 0) return std::string("\0", 1);
564 unsigned char utf8[7];
565 UNICODE_to_UTF8(utf8, ch);
566 return (const char*)utf8;
567 }
568
569
570 ////////////////////
571 // Convert UTF8 to unicode (takes iterator pointing to the first UTF8-encoded character)
GetNextUnicodeFromUtf8(std::string::const_iterator & it,const std::string::const_iterator & last,size_t & num_skipped)572 UnicodeChar GetNextUnicodeFromUtf8(std::string::const_iterator &it, const std::string::const_iterator& last, size_t& num_skipped) {
573 num_skipped = 0;
574 if(it == last) return 0;
575
576 unsigned char ch = *it;
577 UnicodeChar res = ch;
578 if ( ch >= 0xFC ) {
579 res = (ch&0x01) << 30; it++; num_skipped++; if(it == last) return 0; ch = *it;
580 res |= (ch&0x3F) << 24; it++; num_skipped++; if(it == last) return 0; ch = *it;
581 res |= (ch&0x3F) << 18; it++; num_skipped++; if(it == last) return 0; ch = *it;
582 res |= (ch&0x3F) << 12; it++; num_skipped++; if(it == last) return 0; ch = *it;
583 res |= (ch&0x3F) << 6; it++; num_skipped++; if(it == last) return 0; ch = *it;
584 res |= (ch&0x3F);
585 } else
586 if ( ch >= 0xF8 ) {
587 res = (ch&0x03) << 24; num_skipped++; it++; if(it == last) return 0; ch = *it;
588 res |= (ch&0x3F) << 18; num_skipped++; it++; if(it == last) return 0; ch = *it;
589 res |= (ch&0x3F) << 12; num_skipped++; it++; if(it == last) return 0; ch = *it;
590 res |= (ch&0x3F) << 6; num_skipped++; it++; if(it == last) return 0; ch = *it;
591 res |= (ch&0x3F);
592 } else
593 if ( ch >= 0xF0 ) {
594 res = (ch&0x07) << 18; it++; num_skipped++; if(it == last) return 0; ch = *it;
595 res |= (ch&0x3F) << 12; it++; num_skipped++; if(it == last) return 0; ch = *it;
596 res |= (ch&0x3F) << 6; it++; num_skipped++; if(it == last) return 0; ch = *it;
597 res |= (ch&0x3F);
598 } else
599 if ( ch >= 0xE0 ) {
600 res = (ch&0x0F) << 12; it++; num_skipped++; if(it == last) return 0; ch = *it;
601 res |= (ch&0x3F) << 6; it++; num_skipped++; if(it == last) return 0; ch = *it;
602 res |= (ch&0x3F);
603 } else
604 if ( ch >= 0xC0 ) {
605 res = (ch&0x1F) << 6; it++; num_skipped++; if(it == last) return 0; ch = *it;
606 res |= (ch&0x3F);
607 }
608
609 it++; num_skipped++;
610 return res;
611 }
612
613 // Conversion functions
614
615 ////////////////////////////
616 // Finds an index for the character in conversion table, returns -1 if not found
FindTableIndex(UnicodeChar c)617 int FindTableIndex(UnicodeChar c)
618 {
619 int left, right, middle;
620
621 left = 0;
622 right = sizeof(tConversionTable)/sizeof(ConversionItem) - 1;
623
624 // Binary search
625 while (left <= right) {
626 middle = (left + right) / 2;
627 if (tConversionTable[middle].Unicode == c)
628 return middle;
629
630 if (c < tConversionTable[middle].Unicode)
631 right = middle - 1;
632 else
633 left = middle + 1;
634 }
635
636 return -1; // No conversion available
637 }
638
639 /////////////////////////
640 // Converts given unicode character to ascii, according to conversion table
641 // If impossible to convert, returns 0xFF
UnicodeCharToAsciiChar(UnicodeChar c)642 char UnicodeCharToAsciiChar(UnicodeChar c)
643 {
644 // Regular ascii, just continue
645 if (c <= 0x80)
646 return (char) c;
647
648
649 // Unicode, try to convert
650 int index = FindTableIndex(c);
651 if (index == -1) // Cannot convert
652 return (char)0xFF;
653 else
654 return tConversionTable[index].Ascii;
655
656 }
657
658 ////////////////////////
659 // Converts a UTF-8 string to Ascii while replacing unicode characters by the closest ones from ASCII
UnicodeToAscii(const std::string & utf8str)660 std::string UnicodeToAscii(const std::string& utf8str)
661 {
662 std::string res;
663 res.reserve(utf8str.size());
664
665 for (std::string::const_iterator it = utf8str.begin(); it != utf8str.end(); ) {
666 if ((unsigned char)*it < 0x80) { // Normal Ascii
667 res += *it;
668 ++it;
669 } else { // Unicode
670 const UnicodeChar c = GetNextUnicodeFromUtf8(it, utf8str.end());
671 const int idx = FindTableIndex(c);
672 if (idx != -1)
673 res += tConversionTable[idx].Ascii;
674 }
675 }
676
677 return res;
678 }
679
680 ////////////////////////
681 // Like tolower() but for all international characters
UnicodeToLower(UnicodeChar c)682 UnicodeChar UnicodeToLower(UnicodeChar c)
683 {
684 // ASCII
685 if (c < 0xC0)
686 return (UnicodeChar)tolower(c);
687
688 // Who the hell invented so crazzy mappings? :S
689
690 // European characters
691 if (c >= 0xC0 && c <= 0xD6)
692 return c + 0x20;
693
694 if (c >= 0xD8 && c <= 0xDE)
695 return c + 0x20;
696
697 if (c >= 0x100 && c <= 0x177 && !(c & 1))
698 return c + 1;
699
700 if (c == 0x178)
701 return 0xFF;
702
703 if (c >= 0x179 && c <= 0x17E && (c & 1))
704 return c + 1;
705
706 if (c == 0x18F)
707 return 0x259;
708
709 if (c >= 0x1A0 && c <= 0x1FF && !(c & 1))
710 return c + 1;
711
712 // Greece alphabet
713 if (c == 0x386)
714 return 0x3AC;
715
716 if (c >= 0x388 && c <= 0x38A)
717 return c + 0x25;
718
719 if (c >= 0x38C && c <= 0x38F)
720 return c + 0x40;
721
722 if (c >= 0x391 && c <= 0x3AB)
723 return c + 0x1B;
724
725 // Cyrilic
726 if (c >= 0x401 && c <= 0x40F)
727 return c + 0x50;
728
729 if (c >= 0x410 && c <= 0x42F)
730 return c + 0x20;
731
732 if (c >= 0x490 && c <= 0x4E9 && !(c & 1))
733 return c + 1;
734
735 // More European characters
736 if (c >= 0x1E80 && c <= 0x1EF9 && !(c & 1))
737 return c + 1;
738
739 // This character doesn't have lowercase
740 return c;
741 }
742
743 ////////////////////////
744 // Like toupper() but for all international characters
UnicodeToUpper(UnicodeChar c)745 UnicodeChar UnicodeToUpper(UnicodeChar c)
746 {
747 // ASCII
748 if (c < 0xC0)
749 return (UnicodeChar)toupper(c);
750
751 // Who the hell invented so crazzy mappings? :S
752
753 // European characters
754 if (c >= 0xE0 && c <= 0xF6)
755 return c - 0x20;
756
757 if (c >= 0xF8 && c <= 0xFE)
758 return c - 0x20;
759
760 if (c == 0xFF)
761 return 0x178;
762
763 if (c >= 0x100 && c <= 0x177 && (c & 1))
764 return c - 1;
765
766 if (c >= 0x179 && c <= 0x17E && !(c & 1))
767 return c - 1;
768
769 if (c == 0x259)
770 return 0x18F;
771
772 if (c >= 0x1A0 && c <= 0x1FF && (c & 1))
773 return c - 1;
774
775 // Greece alphabet
776 if (c == 0x3AC)
777 return 0x386;
778
779 if (c >= 0x3AD && c <= 0x3AF)
780 return c - 0x25;
781
782 if (c >= 0x3CC && c <= 0x3CF)
783 return c - 0x40;
784
785 if (c >= 0x3B1 && c <= 0x3CB)
786 return c - 0x1B;
787
788 // Cyrilic
789 if (c >= 0x451 && c <= 0x45F)
790 return c - 0x50;
791
792 if (c >= 0x430 && c <= 0x44F)
793 return c - 0x20;
794
795 if (c >= 0x490 && c <= 0x4E9 && (c & 1))
796 return c - 1;
797
798 // More European characters
799 if (c >= 0x1E80 && c <= 0x1EF9 && (c & 1))
800 return c - 1;
801
802 // This character doesn't have uppercase
803 return c;
804 }
805
806 /////////////////////////
807 // Find a substring in a string (case insensitive)
808 // Handles UTF8 strings correctly
Utf8StringCaseFind(const std::string & text,const std::string & search_for)809 size_t Utf8StringCaseFind(const std::string& text, const std::string& search_for)
810 {
811 // HINT: same as stringcasefind, only using UTF8 functions instead (a bit slower)
812 size_t search_for_size = Utf8StringSize(search_for);
813
814 if (text.size() == 0 || search_for_size == 0 || search_for_size > Utf8StringSize(text))
815 return std::string::npos;
816
817 std::string::const_iterator it1 = text.begin();
818 std::string::const_iterator it2 = search_for.begin();
819
820 size_t number_of_same = 0;
821 size_t number_of_same_bytes = 0;
822 size_t result = 0;
823
824 // Go through the text
825 while (it1 != text.end()) {
826 size_t num_skipped = 0;
827 UnicodeChar c1 = UnicodeToLower(GetNextUnicodeFromUtf8(it1, text.end(), num_skipped));
828 UnicodeChar c2 = UnicodeToLower(GetNextUnicodeFromUtf8(it2, search_for.end()));
829
830 // The two characters are the same
831 if (c1 == c2) {
832 number_of_same++; // If number of same characters equals to the size of the substring, we've found it!
833 if (number_of_same == search_for_size)
834 return result - number_of_same_bytes;
835 number_of_same_bytes += num_skipped;
836 } else {
837 number_of_same = 0;
838 number_of_same_bytes = 0;
839 it2 = search_for.begin();
840 }
841
842 result += num_skipped;
843 }
844
845 return std::string::npos; // Not found
846 }
847
848 /////////////////////////
849 // Converts the Utf8 encoded string to format that will display correctly in old LX
OldLxCompatibleString(const std::string & Utf8String)850 std::string OldLxCompatibleString(const std::string &Utf8String)
851 {
852 std::string result = "";
853 std::string::const_iterator utf8_it = Utf8String.begin();
854 std::string::const_iterator last_it = Utf8String.begin();
855
856 UnicodeChar current;
857 int index;
858 while (utf8_it != Utf8String.end()) {
859 current = GetNextUnicodeFromUtf8(utf8_it, Utf8String.end());
860 if (current <= 0x80) { // Normal ascii, don't convert in any way
861 result += (char)current;
862 last_it = utf8_it;
863 continue;
864 }
865
866 // Unicode character
867 index = FindTableIndex(current);
868 result += std::string(last_it, utf8_it); // Keep the UTF8, old LX will ignore it
869 if (index == -1)
870 result += UNKNOWN_CHARACTER; // For characters that cannot be converted
871 else
872 result += tConversionTable[index].Ascii;
873
874 last_it = utf8_it;
875 }
876
877 return result;
878 }
879
880 /////////////////////////
881 // Converts the string created by function above back to a normal UTF8 string
882 // WARNING: passing a normal UTF8 string in this function will result in wrong output
Utf8String(const std::string & OldLxString)883 std::string Utf8String(const std::string& OldLxString)
884 {
885 std::string result = "";
886 std::string::const_iterator utf8_it = OldLxString.begin();
887 std::string::const_iterator last_it = OldLxString.begin();
888
889 UnicodeChar current;
890 while (utf8_it != OldLxString.end()) {
891 current = GetNextUnicodeFromUtf8(utf8_it, OldLxString.end());
892 if (current <= 0x80) { // Normal ascii, don't convert in any way
893 result += (char)current;
894 last_it = utf8_it;
895 continue;
896 }
897
898 // Unicode character
899
900 result += std::string(last_it, utf8_it); // Keep the UTF8
901 if(utf8_it == OldLxString.end()) break;
902 if ((unsigned char)(*utf8_it) <= 0x80) { // If after the unicode character comes another one, just continue
903
904 // Ignore if the converted character comes after UTF8 character
905 // NOTE: the check if the character is really a valid converted UTF8 is not made because
906 // of forward compatibility - in future versions the conversion table can slightly change
907 // which would make it incompatible
908 utf8_it++;
909 }
910
911 last_it = utf8_it;
912 }
913
914 return result;
915 }
916
917 /////////////////////////
918 // Removes special UTF8 characters from the string
RemoveSpecialChars(const std::string & Utf8String)919 std::string RemoveSpecialChars(const std::string &Utf8String)
920 {
921 std::string result = "";
922 std::string::const_iterator utf8_it = Utf8String.begin();
923
924 UnicodeChar current;
925 int index;
926 while (utf8_it != Utf8String.end()) {
927 current = GetNextUnicodeFromUtf8(utf8_it, Utf8String.end());
928 if (current <= 0x80) // Normal ascii, keep it
929 result += (char)current;
930 else { // Replace the unicode character with an ascii equivalent (if some)
931 index = FindTableIndex(current);
932 if (index != -1)
933 result += tConversionTable[index].Ascii;
934 }
935
936 }
937
938 return result;
939 }
940
941
942
943 /*
944 * Functions for UTF conversions taken from encoding.c, created by W3C
945 * The license is available at the following address:
946 * http://dev.w3.org/cvsweb/~checkout~/XML/Copyright?rev=1.1&content-type=text/plain
947 * Original file: http://dev.w3.org/cvsweb/~checkout~/XML/encoding.c
948 *
949 */
950
951 //////////////
952 // Converts UTF16 to UTF8
Utf16ToUtf8(const Utf16String & str)953 std::string Utf16ToUtf8(const Utf16String& str)
954 {
955 Uint32 c, d = 0;
956 std::string result;
957 int bits, iters;
958
959 for (Utf16String::const_iterator in = str.begin(); in != str.end();) {
960 c = *in;
961 in++;
962 if ((c & 0xFC00) == 0xD800) { // surrogates
963 if ((in != str.end()) && (((d = (unsigned char)*in) & 0xFC00) == 0xDC00)) {
964 c &= 0x03FF;
965 c <<= 10;
966 c |= d & 0x03FF;
967 c += 0x10000;
968 } else {
969 return result;
970 }
971
972 in++;
973 }
974
975 // assertion: c is a single UTF-4 value
976
977 if (c < 0x80) {
978 result += (char)c;
979 bits= 0;
980 iters = 0;
981 } else if (c < 0x800) {
982 result += (char)((c >> 6) | 0xC0);
983 bits= 0;
984 iters = 1;
985 } else if (c < 0x10000) {
986 result += (char)((c >> 12) | 0xE0);
987 bits= 6;
988 iters = 2;
989 } else {
990 result += (char)((c >> 18) | 0xF0);
991 bits= 12;
992 iters = 3;
993 }
994
995 for ( ; iters; --iters) {
996 result += (char)(((c >> bits) & 0x3F) | 0x80);
997 }
998 }
999
1000 return result;
1001 }
1002
1003 ///////////////
1004 // Converts UTF8 to UTF16
Utf8ToUtf16(const std::string & str)1005 Utf16String Utf8ToUtf16(const std::string& str)
1006 {
1007 Uint32 c, d, trailing;
1008 Utf16String result;
1009
1010 for (std::string::const_iterator in = str.begin(); in != str.end();) {
1011 d = (unsigned char)*in;
1012 in++;
1013
1014 if (d < 0x80) {
1015 c = d;
1016 trailing = 0;
1017 } else if (d < 0xC0) {
1018 return result; // trailing byte in leading position
1019 } else if (d < 0xE0) {
1020 c = d & 0x1F;
1021 trailing = 1;
1022 } else if (d < 0xF0) {
1023 c = d & 0x0F;
1024 trailing= 2;
1025 } else if (d < 0xF8) {
1026 c = d & 0x07;
1027 trailing= 3;
1028 } else {
1029 return result; // no chance for this in UTF-16
1030 }
1031
1032 for ( ; trailing; trailing--) {
1033 if (in == str.end())
1034 return result;
1035 if (((d = (unsigned char)*in++) & 0xC0) != 0x80)
1036 return result;
1037 c <<= 6;
1038 c |= d & 0x3F;
1039 }
1040
1041 // assertion: c is a single UTF-4 value
1042 if (c < 0x10000) {
1043 result += (Utf16Char) c;
1044 } else if (c < 0x110000) {
1045 c -= 0x10000;
1046 result += 0xD800 | (c >> 10);
1047 result += 0xDC00 | (c & 0x03FF);
1048 } else {
1049 return result;
1050 }
1051 }
1052 return result;
1053 }
1054
1055
1056 //////////////////
1057 // Convert a Unicode string to UTF8
UnicodeToUtf8(const Unicode32String & str)1058 std::string UnicodeToUtf8(const Unicode32String& str)
1059 {
1060 std::string result;
1061 for (Unicode32String::const_iterator i = str.begin(); i != str.end(); i++) {
1062 result += GetUtf8FromUnicode(*i);
1063 }
1064
1065 return result;
1066 }
1067
1068 //////////////////
1069 // Convert a UTF8 string to Unicode
Utf8ToUnicode(const std::string & str)1070 Unicode32String Utf8ToUnicode(const std::string& str)
1071 {
1072 Unicode32String result;
1073 for (std::string::const_iterator it = str.begin(); it != str.end();)
1074 result += GetNextUnicodeFromUtf8(it, str.end());
1075
1076 return result;
1077 }
1078
1079
1080 #ifdef WIN32
1081
1082 #include <windows.h>
1083
1084 //////////////////////////
1085 // Convert a UTF-8 string to system native encoding
Utf8ToSystemNative(const std::string & utf8str)1086 std::string Utf8ToSystemNative(const std::string& utf8str)
1087 {
1088 const Utf16String& u16str = Utf8ToUtf16(utf8str);
1089 char *buf = new char[u16str.size() + 128]; // 128 - just in case...
1090 int len = WideCharToMultiByte(CP_ACP, WC_NO_BEST_FIT_CHARS, (LPCWSTR) u16str.c_str(), u16str.size(), buf, u16str.size() + 128, NULL, NULL);
1091 if (len == 0 && GetLastError() == ERROR_INVALID_FLAGS) {
1092 len = WideCharToMultiByte(CP_ACP, 0, (LPCWSTR) u16str.c_str(), u16str.size(), buf, u16str.size() + 128, NULL, NULL);
1093 if (len == 0)
1094 return "";
1095 }
1096 buf[MIN(len, u16str.size() + 127)] = '\0';
1097
1098 std::string res(buf);
1099 delete[] buf;
1100 return res;
1101 }
1102
1103 ///////////////////////
1104 // Convert a system-native string to UTF-8
SystemNativeToUtf8(const std::string & natstr)1105 std::string SystemNativeToUtf8(const std::string& natstr)
1106 {
1107 if (natstr.size() == 0)
1108 return "";
1109
1110 wchar_t *buf = new wchar_t[natstr.size() + 128]; // 128 for safety
1111 int len = MultiByteToWideChar(CP_ACP, 0, natstr.c_str(), natstr.size(), buf, natstr.size() + 128);
1112 if (len == 0)
1113 return "";
1114 buf[MIN(len, natstr.size() + 127)] = 0;
1115 std::string res = Utf16ToUtf8(Utf16String((Utf16Char *)buf));
1116 delete[] buf;
1117 return res;
1118 }
1119
1120 #endif
1121
///////////////////////
// Convert an ISO 8859-1 string to UTF-8
//
// Bytes below 0x80 are copied unchanged. Every other byte becomes two bytes:
// the leading byte 0xC2 (for 0x80-0xBF) or 0xC3 (for 0xC0-0xFF)
// and a trailing byte that holds the low six bits.
std::string ISO88591ToUtf8(const std::string& isostr)
{
	std::string utf8;
	utf8.reserve(isostr.size() * 2);

	for (size_t i = 0; i < isostr.size(); ++i) {
		const unsigned char byte = static_cast<unsigned char>(isostr[i]);

		if (byte >= 0x80) {
			// The two highest bits of the byte go to the leading byte
			utf8 += static_cast<char>(0xC0 | (byte >> 6));
			utf8 += static_cast<char>(0x80 | (byte & 0x3F));
		} else {
			utf8 += static_cast<char>(byte);
		}
	}

	return utf8;
}
1141
1142
TransformRawToUtf8Pos(const std::string & text,size_t pos)1143 size_t TransformRawToUtf8Pos(const std::string& text, size_t pos) {
1144 const_string_iterator newpos(text);
1145 size_t count = 0;
1146 while(newpos.pos < pos && newpos.pos < text.size()) {
1147 IncUtf8StringIterator(newpos, const_string_iterator(text, text.size()));
1148 count++;
1149 }
1150 if(newpos.pos < pos)
1151 count += pos - newpos.pos;
1152 return count;
1153 }
1154
TransformUtf8PosToRaw(const std::string & text,size_t pos)1155 size_t TransformUtf8PosToRaw(const std::string& text, size_t pos) {
1156 const_string_iterator newpos(text);
1157 size_t count = 0;
1158 while(count < pos && newpos.pos < text.size()) {
1159 IncUtf8StringIterator(newpos, const_string_iterator(text, text.size()));
1160 count++;
1161 }
1162 if(count < pos)
1163 newpos.pos += pos - count;
1164 return newpos.pos;
1165 }
1166
1167