1 /*******************************************************
2
3 CoolReader Engine
4
   Text encoding support: 8-bit code page to Unicode tables and encoding lookup
6
7 (c) Vadim Lopatin, 2000-2006
8 This source code is distributed under the terms of
9 GNU General Public License
10 See LICENSE file for details
11
12 *******************************************************/
13
14 #include "../include/crtxtenc.h"
15 #include "../include/lvstring.h"
16 #include "../include/cp_stats.h"
17 #include "../include/crlog.h"
18 #include <string.h>
19 #include <stdio.h>
20
21 static const lChar32 __cp737[128] = {
22 /* 0x80 */
23 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x0398,
24 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, 0x03a0,
25 /* 0x90 */
26 0x03a1, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, 0x03a8, 0x03a9,
27 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7, 0x03b8,
28 /* 0xa0 */
29 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf, 0x03c0,
30 0x03c1, 0x03c3, 0x03c2, 0x03c4, 0x03c5, 0x03c6, 0x03c7, 0x03c8,
31 /* 0xb0 */
32 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
33 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510,
34 /* 0xc0 */
35 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f,
36 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567,
37 /* 0xd0 */
38 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b,
39 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,
40 /* 0xe0 */
41 0x03c9, 0x03ac, 0x03ad, 0x03ae, 0x03ca, 0x03af, 0x03cc, 0x03cd,
42 0x03cb, 0x03ce, 0x0386, 0x0388, 0x0389, 0x038a, 0x038c, 0x038e,
43 /* 0xf0 */
44 0x038f, 0x00b1, 0x2265, 0x2264, 0x03aa, 0x03ab, 0x00f7, 0x2248,
45 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0,
46 };
47
48 static const lChar32 __cp1253[128] = {
49 /* 0x80 */
50 0x20ac, 0xfffd, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
51 0xfffd, 0x2030, 0xfffd, 0x2039, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
52 /* 0x90 */
53 0xfffd, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
54 0xfffd, 0x2122, 0xfffd, 0x203a, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
55 /* 0xa0 */
56 0x00a0, 0x0385, 0x0386, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
57 0x00a8, 0x00a9, 0xfffd, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x2015,
58 /* 0xb0 */
59 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x0384, 0x00b5, 0x00b6, 0x00b7,
60 0x0388, 0x0389, 0x038a, 0x00bb, 0x038c, 0x00bd, 0x038e, 0x038f,
61 /* 0xc0 */
62 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
63 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f,
64 /* 0xd0 */
65 0x03a0, 0x03a1, 0xfffd, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7,
66 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x03ac, 0x03ad, 0x03ae, 0x03af,
67 /* 0xe0 */
68 0x03b0, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7,
69 0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf,
70 /* 0xf0 */
71 0x03c0, 0x03c1, 0x03c2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7,
72 0x03c8, 0x03c9, 0x03ca, 0x03cb, 0x03cc, 0x03cd, 0x03ce, 0xfffd,
73 };
74
75 static const lChar32 __cp775[128] = {
76 /* 0x80 */
77 0x0106, 0x00fc, 0x00e9, 0x0101, 0x00e4, 0x0123, 0x00e5, 0x0107,
78 0x0142, 0x0113, 0x0156, 0x0157, 0x012b, 0x0179, 0x00c4, 0x00c5,
79 /* 0x90 */
80 0x00c9, 0x00e6, 0x00c6, 0x014d, 0x00f6, 0x0122, 0x00a2, 0x015a,
81 0x015b, 0x00d6, 0x00dc, 0x00f8, 0x00a3, 0x00d8, 0x00d7, 0x00a4,
82 /* 0xa0 */
83 0x0100, 0x012a, 0x00f3, 0x017b, 0x017c, 0x017a, 0x201d, 0x00a6,
84 0x00a9, 0x00ae, 0x00ac, 0x00bd, 0x00bc, 0x0141, 0x00ab, 0x00bb,
85 /* 0xb0 */
86 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x0104, 0x010c, 0x0118,
87 0x0116, 0x2563, 0x2551, 0x2557, 0x255d, 0x012e, 0x0160, 0x2510,
88 /* 0xc0 */
89 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x0172, 0x016a,
90 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x017d,
91 /* 0xd0 */
92 0x0105, 0x010d, 0x0119, 0x0117, 0x012f, 0x0161, 0x0173, 0x016b,
93 0x017e, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,
94 /* 0xe0 */
95 0x00d3, 0x00df, 0x014c, 0x0143, 0x00f5, 0x00d5, 0x00b5, 0x0144,
96 0x0136, 0x0137, 0x013b, 0x013c, 0x0146, 0x0112, 0x0145, 0x2019,
97 /* 0xf0 */
98 0x00ad, 0x00b1, 0x201c, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x201e,
99 0x00b0, 0x2219, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0,
100 };
101
102 /*
103 * CP852
104 */
105
106 static const lChar32 __cp852[128] = {
107 /* 0x80 */
108 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x016f, 0x0107, 0x00e7,
109 0x0142, 0x00eb, 0x0150, 0x0151, 0x00ee, 0x0179, 0x00c4, 0x0106,
110 /* 0x90 */
111 0x00c9, 0x0139, 0x013a, 0x00f4, 0x00f6, 0x013d, 0x013e, 0x015a,
112 0x015b, 0x00d6, 0x00dc, 0x0164, 0x0165, 0x0141, 0x00d7, 0x010d,
113 /* 0xa0 */
114 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x0104, 0x0105, 0x017d, 0x017e,
115 0x0118, 0x0119, 0x00ac, 0x017a, 0x010c, 0x015f, 0x00ab, 0x00bb,
116 /* 0xb0 */
117 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, 0x00c2, 0x011a,
118 0x015e, 0x2563, 0x2551, 0x2557, 0x255d, 0x017b, 0x017c, 0x2510,
119 /* 0xc0 */
120 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x0102, 0x0103,
121 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
122 /* 0xd0 */
123 0x0111, 0x0110, 0x010e, 0x00cb, 0x010f, 0x0147, 0x00cd, 0x00ce,
124 0x011b, 0x2518, 0x250c, 0x2588, 0x2584, 0x0162, 0x016e, 0x2580,
125 /* 0xe0 */
126 0x00d3, 0x00df, 0x00d4, 0x0143, 0x0144, 0x0148, 0x0160, 0x0161,
127 0x0154, 0x00da, 0x0155, 0x0170, 0x00fd, 0x00dd, 0x0163, 0x00b4,
128 /* 0xf0 */
129 0x00ad, 0x02dd, 0x02db, 0x02c7, 0x02d8, 0x00a7, 0x00f7, 0x00b8,
130 0x00b0, 0x00a8, 0x02d9, 0x0171, 0x0158, 0x0159, 0x25a0, 0x00a0,
131 };
132
133 /*
134 * ISO-8859-2
135 */
136
137 static const lChar32 __iso8859_2[128] = {
138 /* 0x80*/
139 0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021,
140 0x20ac, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040b, 0x040f,
141 /* 0x90*/
142 0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
143 0x0000, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f,
144 /* 0xa0 */
145 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7,
146 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,
147 /* 0xb0 */
148 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7,
149 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,
150 /* 0xc0 */
151 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
152 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
153 /* 0xd0 */
154 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
155 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
156 /* 0xe0 */
157 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
158 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
159 /* 0xf0 */
160 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
161 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9,
162 };
163
164 /*
165 * ISO-8859-16
166 */
167
168 static const lChar32 __iso8859_16[128] = {
169 /* 0x80*/
170 0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021,
171 0x20ac, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040b, 0x040f,
172 /* 0x90*/
173 0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
174 0x0000, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f,
175 /* 0xa0 */
176 0x00a0, 0x0104, 0x0105, 0x0141, 0x20ac, 0x201e, 0x0160, 0x00a7,
177 0x0161, 0x00a9, 0x0218, 0x00ab, 0x0179, 0x00ad, 0x017a, 0x017b,
178 /* 0xb0 */
179 0x00b0, 0x00b1, 0x010c, 0x0142, 0x017d, 0x201d, 0x00b6, 0x00b7,
180 0x017e, 0x010d, 0x0219, 0x00bb, 0x0152, 0x0153, 0x0178, 0x017c,
181 /* 0xc0 */
182 0x00c0, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0106, 0x00c6, 0x00c7,
183 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
184 /* 0xd0 */
185 0x0110, 0x0143, 0x00d2, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x015a,
186 0x0170, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0118, 0x021a, 0x00df,
187 /* 0xe0 */
188 0x00e0, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x0107, 0x00e6, 0x00e7,
189 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
190 /* 0xf0 */
191 0x0111, 0x0144, 0x00f2, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x015b,
192 0x0171, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0119, 0x021b, 0x00ff,
193 };
194
195 static const lChar32 __cp1257[128] = {
196 /* 0x80 */
197 0x20ac, 0xfffd, 0x201a, 0xfffd, 0x201e, 0x2026, 0x2020, 0x2021,
198 0xfffd, 0x2030, 0xfffd, 0x2039, 0xfffd, 0x00a8, 0x02c7, 0x00b8,
199 /* 0x90 */
200 0xfffd, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
201 0xfffd, 0x2122, 0xfffd, 0x203a, 0xfffd, 0x00af, 0x02db, 0xfffd,
202 /* 0xa0 */
203 0x00a0, 0xfffd, 0x00a2, 0x00a3, 0x00a4, 0xfffd, 0x00a6, 0x00a7,
204 0x00d8, 0x00a9, 0x0156, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00c6,
205 /* 0xb0 */
206 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
207 0x00f8, 0x00b9, 0x0157, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00e6,
208 /* 0xc0 */
209 0x0104, 0x012e, 0x0100, 0x0106, 0x00c4, 0x00c5, 0x0118, 0x0112,
210 0x010c, 0x00c9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012a, 0x013b,
211 /* 0xd0 */
212 0x0160, 0x0143, 0x0145, 0x00d3, 0x014c, 0x00d5, 0x00d6, 0x00d7,
213 0x0172, 0x0141, 0x015a, 0x016a, 0x00dc, 0x017b, 0x017d, 0x00df,
214 /* 0xe0 */
215 0x0105, 0x012f, 0x0101, 0x0107, 0x00e4, 0x00e5, 0x0119, 0x0113,
216 0x010d, 0x00e9, 0x017a, 0x0117, 0x0123, 0x0137, 0x012b, 0x013c,
217 /* 0xf0 */
218 0x0161, 0x0144, 0x0146, 0x00f3, 0x014d, 0x00f5, 0x00f6, 0x00f7,
219 0x0173, 0x0142, 0x015b, 0x016b, 0x00fc, 0x017c, 0x017e, 0x02d9,
220 };
221
222 static const lChar32 __cp1251[128] = {
223 /* 0x80*/
224 0x0402, 0x0403, 0x201a, 0x0453,
225 0x201e, 0x2026, 0x2020, 0x2021,
226 0x20ac, 0x2030, 0x0409, 0x2039,
227 0x040a, 0x040c, 0x040b, 0x040f,
228 /* 0x90*/
229 0x0452, 0x2018, 0x2019, 0x201c,
230 0x201d, 0x2022, 0x2013, 0x2014,
231 0x0000, 0x2122, 0x0459, 0x203a,
232 0x045a, 0x045c, 0x045b, 0x045f,
233 /* 0xa0*/
234 0x00a0, 0x040e, 0x045e, 0x0408,
235 0x00a4, 0x0490, 0x00a6, 0x00a7,
236 0x0401, 0x00a9, 0x0404, 0x00ab,
237 0x00ac, 0x00ad, 0x00ae, 0x0407,
238 /* 0xb0*/
239 0x00b0, 0x00b1, 0x0406, 0x0456,
240 0x0491, 0x00b5, 0x00b6, 0x00b7,
241 0x0451, 0x2116, 0x0454, 0x00bb,
242 0x0458, 0x0405, 0x0455, 0x0457,
243 /* 0xc0*/
244 0x0410, 0x0411, 0x0412, 0x0413,
245 0x0414, 0x0415, 0x0416, 0x0417,
246 0x0418, 0x0419, 0x041a, 0x041b,
247 0x041c, 0x041d, 0x041e, 0x041f,
248 /* 0xd0*/
249 0x0420, 0x0421, 0x0422, 0x0423,
250 0x0424, 0x0425, 0x0426, 0x0427,
251 0x0428, 0x0429, 0x042a, 0x042b,
252 0x042c, 0x042d, 0x042e, 0x042f,
253 /* 0xe0*/
254 0x0430, 0x0431, 0x0432, 0x0433,
255 0x0434, 0x0435, 0x0436, 0x0437,
256 0x0438, 0x0439, 0x043a, 0x043b,
257 0x043c, 0x043d, 0x043e, 0x043f,
258 /* 0xf0*/
259 0x0440, 0x0441, 0x0442, 0x0443,
260 0x0444, 0x0445, 0x0446, 0x0447,
261 0x0448, 0x0449, 0x044a, 0x044b,
262 0x044c, 0x044d, 0x044e, 0x044f,
263 };
264
265 static const lChar32 __cp1252[128] = {
266 /* 0x80*/
267 0x0402, 0x0403, 0x201a, 0x0453,
268 0x201e, 0x2026, 0x2020, 0x2021,
269 0x20ac, 0x2030, 0x0409, 0x2039,
270 0x040a, 0x040c, 0x040b, 0x040f,
271 /* 0x90*/
272 0x0452, 0x2018, 0x2019, 0x201c,
273 0x201d, 0x2022, 0x2013, 0x2014,
274 0x0000, 0x2122, 0x0459, 0x203a,
275 0x045a, 0x045c, 0x045b, 0x045f,
276 /* 0xa0*/
277 0x00a0, 0x00a1, 0x00a2, 0x00a3,
278 0x00a4, 0x00a5, 0x00a6, 0x00a7,
279 0x00a8, 0x00a9, 0x00aa, 0x00ab,
280 0x00ac, 0x00ad, 0x00ae, 0x00af,
281 /* 0xb0*/
282 0x00b0, 0x00b1, 0x00b2, 0x00b3,
283 0x00b4, 0x00b5, 0x00b6, 0x00b7,
284 0x00b8, 0x00b9, 0x00ba, 0x00bb,
285 0x00bc, 0x00bd, 0x00be, 0x00bf,
286 /* 0xc0*/
287 0x00c0, 0x00c1, 0x00c2, 0x00c3,
288 0x00c4, 0x00c5, 0x00c6, 0x00c7,
289 0x00c8, 0x00c9, 0x00ca, 0x00cb,
290 0x00cc, 0x00cd, 0x00ce, 0x00cf,
291 /* 0xd0*/
292 0x00d0, 0x00d1, 0x00d2, 0x00d3,
293 0x00d4, 0x00d5, 0x00d6, 0x00d7,
294 0x00d8, 0x00d9, 0x00da, 0x00db,
295 0x00dc, 0x00dd, 0x00de, 0x00df,
296 /* 0xe0*/
297 0x00e0, 0x00e1, 0x00e2, 0x00e3,
298 0x00e4, 0x00e5, 0x00e6, 0x00e7,
299 0x00e8, 0x00e9, 0x00ea, 0x00eb,
300 0x00ec, 0x00ed, 0x00ee, 0x00ef,
301 /* 0xf0*/
302 0x00f0, 0x00f1, 0x00f2, 0x00f3,
303 0x00f4, 0x00f5, 0x00f6, 0x00f7,
304 0x00f8, 0x00f9, 0x00fa, 0x00fb,
305 0x00fc, 0x00fd, 0x00fe, 0x00ff,
306 };
307
308 static const lChar32 __cp1254[128] = {
309 /* 0x80 */
310 0x20ac, 0xfffd, 0x201a, 0x0192,
311 0x201e, 0x2026, 0x2020, 0x2021,
312 0x02c6, 0x2030, 0x0160, 0x2039,
313 0x0152, 0xfffd, 0xfffd, 0xfffd,
314 /* 0x90 */
315 0xfffd, 0x2018, 0x2019, 0x201c,
316 0x201d, 0x2022, 0x2013, 0x2014,
317 0x02dc, 0x2122, 0x0161, 0x203a,
318 0x0153, 0xfffd, 0xfffd, 0x0178,
319 /* 0xa0*/
320 0x00a0, 0x00a1, 0x00a2, 0x00a3,
321 0x00a4, 0x00a5, 0x00a6, 0x00a7,
322 0x00a8, 0x00a9, 0x00aa, 0x00ab,
323 0x00ac, 0x00ad, 0x00ae, 0x00af,
324 /* 0xb0*/
325 0x00b0, 0x00b1, 0x00b2, 0x00b3,
326 0x00b4, 0x00b5, 0x00b6, 0x00b7,
327 0x00b8, 0x00b9, 0x00ba, 0x00bb,
328 0x00bc, 0x00bd, 0x00be, 0x00bf,
329 /* 0xc0*/
330 0x00c0, 0x00c1, 0x00c2, 0x00c3,
331 0x00c4, 0x00c5, 0x00c6, 0x00c7,
332 0x00c8, 0x00c9, 0x00ca, 0x00cb,
333 0x00cc, 0x00cd, 0x00ce, 0x00cf,
334 /* 0xd0 */
335 0x011e, 0x00d1, 0x00d2, 0x00d3,
336 0x00d4, 0x00d5, 0x00d6, 0x00d7,
337 0x00d8, 0x00d9, 0x00da, 0x00db,
338 0x00dc, 0x0130, 0x015e, 0x00df,
339 /* 0xe0*/
340 0x00e0, 0x00e1, 0x00e2, 0x00e3,
341 0x00e4, 0x00e5, 0x00e6, 0x00e7,
342 0x00e8, 0x00e9, 0x00ea, 0x00eb,
343 0x00ec, 0x00ed, 0x00ee, 0x00ef,
344 /* 0xf0 */
345 0x011f, 0x00f1, 0x00f2, 0x00f3,
346 0x00f4, 0x00f5, 0x00f6, 0x00f7,
347 0x00f8, 0x00f9, 0x00fa, 0x00fb,
348 0x00fc, 0x0131, 0x015f, 0x00ff,
349 };
350
351 static const lChar32 __cp866[128] = {
352 /* 0x80*/
353 0x0410, 0x0411, 0x0412, 0x0413,
354 0x0414, 0x0415, 0x0416, 0x0417,
355 0x0418, 0x0419, 0x041a, 0x041b,
356 0x041c, 0x041d, 0x041e, 0x041f,
357 /* 0x90*/
358 0x0420, 0x0421, 0x0422, 0x0423,
359 0x0424, 0x0425, 0x0426, 0x0427,
360 0x0428, 0x0429, 0x042a, 0x042b,
361 0x042c, 0x042d, 0x042e, 0x042f,
362 /* 0xa0*/
363 0x0430, 0x0431, 0x0432, 0x0433,
364 0x0434, 0x0435, 0x0436, 0x0437,
365 0x0438, 0x0439, 0x043a, 0x043b,
366 0x043c, 0x043d, 0x043e, 0x043f,
367 /* 0xb0*/
368 0x2591, 0x2592, 0x2593, 0x2502,
369 0x2524, 0x2561, 0x2562, 0x2556,
370 0x2555, 0x2563, 0x2551, 0x2557,
371 0x255d, 0x255c, 0x255b, 0x2510,
372 /* 0xc0*/
373 0x2514, 0x2534, 0x252c, 0x251c,
374 0x2500, 0x253c, 0x255e, 0x255f,
375 0x255a, 0x2554, 0x2569, 0x2566,
376 0x2560, 0x2550, 0x256c, 0x2567,
377 /* 0xd0*/
378 0x2568, 0x2564, 0x2565, 0x2559,
379 0x2558, 0x2552, 0x2553, 0x256b,
380 0x256a, 0x2518, 0x250c, 0x2588,
381 0x2584, 0x258c, 0x2590, 0x2580,
382 /* 0xe0*/
383 0x0440, 0x0441, 0x0442, 0x0443,
384 0x0444, 0x0445, 0x0446, 0x0447,
385 0x0448, 0x0449, 0x044a, 0x044b,
386 0x044c, 0x044d, 0x044e, 0x044f,
387 /* 0xf0*/
388 0x0401, 0x0451, 0x0404, 0x0454,
389 0x0407, 0x0457, 0x040e, 0x045e,
390 0x00b0, 0x2219, 0x00b7, 0x221a,
391 0x2116, 0x00a4, 0x25a0, 0x00a0,
392 };
393
394 static const lChar32 __koi8r[128] = {
395 /* 0x80*/
396 0x2500, 0x2502, 0x250c, 0x2510,
397 0x2514, 0x2518, 0x251c, 0x2524,
398 0x252c, 0x2534, 0x253c, 0x2580,
399 0x2584, 0x2588, 0x258c, 0x2590,
400 /* 0x90*/
401 0x2591, 0x2592, 0x2593, 0x2320,
402 0x25a0, 0x2219, 0x221a, 0x2248,
403 0x2264, 0x2265, 0x00a0, 0x2321,
404 0x00b0, 0x00b2, 0x00b7, 0x00f7,
405 /* 0xa0*/
406 0x2550, 0x2551, 0x2552, 0x0451,
407 0x2553, 0x2554, 0x2555, 0x2556,
408 0x2557, 0x2558, 0x2559, 0x255a,
409 0x255b, 0x255c, 0x255d, 0x255e,
410 /* 0xb0*/
411 0x255f, 0x2560, 0x2561, 0x0401,
412 0x2562, 0x2563, 0x2564, 0x2565,
413 0x2566, 0x2567, 0x2568, 0x2569,
414 0x256a, 0x256b, 0x256c, 0x00a9,
415 /* 0xc0*/
416 0x044e, 0x0430, 0x0431, 0x0446,
417 0x0434, 0x0435, 0x0444, 0x0433,
418 0x0445, 0x0438, 0x0439, 0x043a,
419 0x043b, 0x043c, 0x043d, 0x043e,
420 /* 0xd0*/
421 0x043f, 0x044f, 0x0440, 0x0441,
422 0x0442, 0x0443, 0x0436, 0x0432,
423 0x044c, 0x044b, 0x0437, 0x0448,
424 0x044d, 0x0449, 0x0447, 0x044a,
425 /* 0xe0*/
426 0x042e, 0x0410, 0x0411, 0x0426,
427 0x0414, 0x0415, 0x0424, 0x0413,
428 0x0425, 0x0418, 0x0419, 0x041a,
429 0x041b, 0x041c, 0x041d, 0x041e,
430 /* 0xf0*/
431 0x041f, 0x042f, 0x0420, 0x0421,
432 0x0422, 0x0423, 0x0416, 0x0412,
433 0x042c, 0x042b, 0x0417, 0x0428,
434 0x042d, 0x0429, 0x0427, 0x042a,
435 };
436
437 static const lChar32 __cp1250[128] = {
438 /* 0x80*/
439 0x20ac, 0x0000, 0x201a, 0x0000,
440 0x201e, 0x2026, 0x2020, 0x2021,
441 0x0000, 0x2030, 0x0160, 0x2039,
442 0x015a, 0x0164, 0x017d, 0x0179,
443 /* 0x90*/
444 0x0000, 0x2018, 0x2019, 0x201c,
445 0x201d, 0x2022, 0x2013, 0x2014,
446 0x0000, 0x2122, 0x0161, 0x203a,
447 0x015b, 0x0165, 0x017e, 0x017a,
448 /* 0xa0*/
449 0x00a0, 0x02c7, 0x02d8, 0x0141,
450 0x00a4, 0x0104, 0x00a6, 0x00a7,
451 0x00a8, 0x00a9, 0x015e, 0x00ab,
452 0x00ac, 0x00ad, 0x00ae, 0x017b,
453 /* 0xb0*/
454 0x00b0, 0x00b1, 0x02db, 0x0142,
455 0x00b4, 0x00b5, 0x00b6, 0x00b7,
456 0x00b8, 0x0105, 0x015f, 0x00bb,
457 0x013d, 0x02dd, 0x013e, 0x017c,
458 /* 0xc0*/
459 0x0154, 0x00c1, 0x00c2, 0x0102,
460 0x00c4, 0x0139, 0x0106, 0x00c7,
461 0x010c, 0x00c9, 0x0118, 0x00cb,
462 0x011a, 0x00cd, 0x00ce, 0x010e,
463 /* 0xd0*/
464 0x0110, 0x0143, 0x0147, 0x00d3,
465 0x00d4, 0x0150, 0x00d6, 0x00d7,
466 0x0158, 0x016e, 0x00da, 0x0170,
467 0x00dc, 0x00dd, 0x0162, 0x00df,
468 /* 0xe0*/
469 0x0155, 0x00e1, 0x00e2, 0x0103,
470 0x00e4, 0x013a, 0x0107, 0x00e7,
471 0x010d, 0x00e9, 0x0119, 0x00eb,
472 0x011b, 0x00ed, 0x00ee, 0x010f,
473 /* 0xf0*/
474 0x0111, 0x0144, 0x0148, 0x00f3,
475 0x00f4, 0x0151, 0x00f6, 0x00f7,
476 0x0159, 0x016f, 0x00fa, 0x0171,
477 0x00fc, 0x00fd, 0x0163, 0x02d9,
478 };
479
480 static const lChar32 __cp850[128] = {
481 /* 0x80*/
482 0x00c7, 0x00fc, 0x00e9, 0x00e2,
483 0x00e4, 0x00e0, 0x00e5, 0x00e7,
484 0x00ea, 0x00eb, 0x00e8, 0x00ef,
485 0x00ee, 0x00ec, 0x00c4, 0x00c5,
486 /* 0x90*/
487 0x00c9, 0x00e6, 0x00c6, 0x00f4,
488 0x00f6, 0x00f2, 0x00fb, 0x00f9,
489 0x00ff, 0x00d6, 0x00dc, 0x00f8,
490 0x00a3, 0x00d8, 0x00d7, 0x0192,
491 /* 0xa0*/
492 0x00e1, 0x00ed, 0x00f3, 0x00fa,
493 0x00f1, 0x00d1, 0x00aa, 0x00ba,
494 0x00bf, 0x00ae, 0x00ac, 0x00bd,
495 0x00bc, 0x00a1, 0x00ab, 0x00bb,
496 /* 0xb0*/
497 0x2591, 0x2592, 0x2593, 0x2502,
498 0x2524, 0x00c1, 0x00c2, 0x00c0,
499 0x00a9, 0x2563, 0x2551, 0x2557,
500 0x255d, 0x00a2, 0x00a5, 0x2510,
501 /* 0xc0*/
502 0x2514, 0x2534, 0x252c, 0x251c,
503 0x2500, 0x253c, 0x00e3, 0x00c3,
504 0x255a, 0x2554, 0x2569, 0x2566,
505 0x2560, 0x2550, 0x256c, 0x00a4,
506 /* 0xd0*/
507 0x00f0, 0x00d0, 0x00ca, 0x00cb,
508 0x00c8, 0x0131, 0x00cd, 0x00ce,
509 0x00cf, 0x2518, 0x250c, 0x2588,
510 0x2584, 0x00a6, 0x00cc, 0x2580,
511 /* 0xe0*/
512 0x00d3, 0x00df, 0x00d4, 0x00d2,
513 0x00f5, 0x00d5, 0x00b5, 0x00fe,
514 0x00de, 0x00da, 0x00db, 0x00d9,
515 0x00fd, 0x00dd, 0x00af, 0x00b4,
516 /* 0xf0*/
517 0x00ad, 0x00b1, 0x2017, 0x00be,
518 0x00b6, 0x00a7, 0x00f7, 0x00b8,
519 0x00b0, 0x00a8, 0x00b7, 0x00b9,
520 0x00b3, 0x00b2, 0x25a0, 0x00a0,
521 };
522
523 #define CRENC_ID_CP1250 (CRENC_ID_8BIT_START+1)
524 #define CRENC_ID_CP1251 (CRENC_ID_8BIT_START+2)
525 #define CRENC_ID_CP1252 (CRENC_ID_8BIT_START+3)
526 #define CRENC_ID_CP1253 (CRENC_ID_8BIT_START+4)
527 #define CRENC_ID_CP1257 (CRENC_ID_8BIT_START+5)
528 #define CRENC_ID_CP775 (CRENC_ID_8BIT_START+6)
529 #define CRENC_ID_CP737 (CRENC_ID_8BIT_START+7)
530 #define CRENC_ID_CP866 (CRENC_ID_8BIT_START+8)
531 #define CRENC_ID_CP850 (CRENC_ID_8BIT_START+9)
532 #define CRENC_ID_KOI8R (CRENC_ID_8BIT_START+10)
533 #define CRENC_ID_ISO8859_2 (CRENC_ID_8BIT_START+11)
534 #define CRENC_ID_CP1254 (CRENC_ID_8BIT_START+12)
535 #define CRENC_ID_CP852 (CRENC_ID_8BIT_START+13)
536 #define CRENC_ID_ISO8859_16 (CRENC_ID_8BIT_START+14)
537
538
539 /// add other encodings here
540 static struct {
541 const char * name;
542 const lChar32 * table;
543 int id;
544 } _enc_table[] = {
545 {"windows-1250", __cp1250, CRENC_ID_CP1250},
546 {"windows-1251", __cp1251, CRENC_ID_CP1251},
547 {"windows-1252", __cp1252, CRENC_ID_CP1252},
548 {"windows-1253", __cp1253, CRENC_ID_CP1253},
549 {"windows-1254", __cp1254, CRENC_ID_CP1254},
550 {"windows-1257", __cp1257, CRENC_ID_CP1257},
551 {"cp775", __cp775, CRENC_ID_CP775},
552 {"cp737", __cp737, CRENC_ID_CP737},
553 {"cp1250", __cp1250, CRENC_ID_CP1250},
554 {"cp1251", __cp1251, CRENC_ID_CP1251},
555 {"cp1254", __cp1254, CRENC_ID_CP1254},
556 {"iso-8859-5", __cp1251, CRENC_ID_CP1251},
557 {"iso_8859-5", __cp1251, CRENC_ID_CP1251},
558 {"iso8859-5", __cp1251, CRENC_ID_CP1251},
559 {"cp1252", __cp1252, CRENC_ID_CP1252},
560 {"iso-8859-1", __cp1252, CRENC_ID_CP1252},
561 {"iso_8859-1", __cp1252, CRENC_ID_CP1252},
562 {"iso8859-1", __cp1252, CRENC_ID_CP1252},
563 {"latin-1", __cp1252, CRENC_ID_CP1252},
564 {"cp1253", __cp1253, CRENC_ID_CP1253},
565 {"cp1257", __cp1257, CRENC_ID_CP1257},
566 {"cp866", __cp866, CRENC_ID_CP866},
567 {"cp850", __cp850, CRENC_ID_CP850},
568 {"cp852", __cp852, CRENC_ID_CP852},
569 {"windows-866", __cp866, CRENC_ID_CP866},
570 {"windows-850", __cp850, CRENC_ID_CP850},
571 {"windows-852", __cp852, CRENC_ID_CP852},
572 {"koi-8r", __koi8r, CRENC_ID_KOI8R},
573 {"koi8r", __koi8r, CRENC_ID_KOI8R},
574 {"koi8-r", __koi8r, CRENC_ID_KOI8R},
575 {"iso8859-2", __iso8859_2, CRENC_ID_ISO8859_2},
576 {"iso-8859-2", __iso8859_2, CRENC_ID_ISO8859_2},
577 {"iso8859_2", __iso8859_2, CRENC_ID_ISO8859_2},
578 {"latin-2", __iso8859_2, CRENC_ID_ISO8859_2},
579 {"latin-5", __iso8859_2, CRENC_ID_ISO8859_2},
580 {"iso8859-16", __iso8859_16, CRENC_ID_ISO8859_16},
581 {"iso-8859-16", __iso8859_16, CRENC_ID_ISO8859_16},
582 {"iso8859_16", __iso8859_16, CRENC_ID_ISO8859_16},
583 {NULL, NULL, 0}
584 };
585
CREncodingNameToId(const lChar32 * enc_name)586 int CREncodingNameToId( const lChar32 * enc_name )
587 {
588 lString32 s( enc_name );
589 s.lowercase();
590 const lChar32 * encoding_name = s.c_str();
591 if ( !lStr_cmp(encoding_name, "utf-8") )
592 return CRENC_ID_UTF8;
593 else if ( !lStr_cmp(encoding_name, "utf-16") )
594 return CRENC_ID_UTF16_LE;
595 else if ( !lStr_cmp(encoding_name, "utf-16le") )
596 return CRENC_ID_UTF16_LE;
597 else if ( !lStr_cmp(encoding_name, "utf-16be") )
598 return CRENC_ID_UTF16_BE;
599 else if ( !lStr_cmp(encoding_name, "utf-32") )
600 return CRENC_ID_UTF16_LE;
601 else if ( !lStr_cmp(encoding_name, "utf-32le") )
602 return CRENC_ID_UTF16_LE;
603 else if ( !lStr_cmp(encoding_name, "utf-32be") )
604 return CRENC_ID_UTF16_BE;
605 for (int i=0; _enc_table[i].name!=NULL; i++)
606 {
607 if ( !lStr_cmp(encoding_name, _enc_table[i].name) )
608 {
609 return _enc_table[i].id;
610 }
611 }
612 return CRENC_ID_UNKNOWN; // not found
613 }
614
CREncodingIdToName(int id)615 const char * CREncodingIdToName( int id )
616 {
617 switch ( id ) {
618 case CRENC_ID_UTF8:
619 return "utf-8";
620 case CRENC_ID_UTF16_LE:
621 return "utf-16le";
622 case CRENC_ID_UTF16_BE:
623 return "utf-16be";
624 case CRENC_ID_UTF32_LE:
625 return "utf-32be";
626 case CRENC_ID_UTF32_BE:
627 return "utf-32be";
628 }
629 for (int i=0; _enc_table[i].name!=NULL; i++)
630 {
631 if ( id == _enc_table[i].id )
632 {
633 return _enc_table[i].name;
634 }
635 }
636 return NULL; // not found
637 }
638
/// Finds the byte-to-Unicode table of an 8-bit encoding by name.
///
/// The name is lowercased first, so matching against the lowercase names in
/// _enc_table is case-insensitive.
///
/// @param enc_name encoding name or alias, e.g. "cp1251"
/// @return the table registered under that name, or NULL if there is none
const lChar32 * GetCharsetByte2UnicodeTable( const lChar32 * enc_name )
{
    lString32 lowered( enc_name );
    lowered.lowercase();
    const lChar32 * wanted = lowered.c_str();
    int idx = 0;
    while ( _enc_table[idx].name != NULL )
    {
        if ( lStr_cmp(wanted, _enc_table[idx].name) == 0 )
            return _enc_table[idx].table;
        ++idx;
    }
    return NULL; // no such encoding registered
}
653
GetCharsetByte2UnicodeTableById(int id)654 const lChar32 * GetCharsetByte2UnicodeTableById( int id )
655 {
656 for (int i=0; _enc_table[i].name!=NULL; i++)
657 {
658 if ( id==_enc_table[i].id )
659 {
660 return _enc_table[i].table;
661 }
662 }
663 return NULL; // not found
664 }
665
/// Maps a Windows language identifier (LCID) to the number of the code page
/// used for 8-bit text in that language.
///
/// Cases are grouped by resulting code page. Identifiers not listed fall
/// through to the default, 1251.
///
/// The Central European Latin languages that Windows assigns to code page
/// 1250 (Albanian, Croatian incl. Bosnia Herzegovina, Hungarian, Romanian,
/// Serbian Latin, Slovak, Slovenian) now return 1250 like Czech and Polish;
/// they previously returned 1252, which lacks several of their letters.
///
/// NOTE(review): several languages whose scripts are not covered by
/// Windows-1252 (e.g. Thai, Korean, Vietnamese, Farsi, Urdu) still return
/// 1252, as before; confirm against the callers before changing them.
///
/// @param lang Windows language identifier
/// @return code page number (e.g. 1250, 1251, 1252, 932, 950)
int langToCodepage( int lang )
{
    switch ( lang )
    {
    // ---- 1250: Central European ----
    case 0x041c : // Albanian
    case 0x101a : // Bosnia Herzegovina
    case 0x041a : // Croatian
    case 0x0405 : // Czech
    case 0x040e : // Hungarian
    case 0x0415 : // Polish
    case 0x0418 : // Romanian
    case 0x0818 : // Romanian Moldova
    case 0x081a : // Serbian Latin
    case 0x041b : // Slovak
    case 0x0424 : // Slovenian
        return 1250;

    // ---- 1251: Cyrillic ----
    case 0x082c : // Azeri Cyrillic
    case 0x0402 : // Bulgarian
    case 0x0423 : // Byelorussian
    case 0x043f : // Kazakh
    case 0x0419 : // Russian
    case 0x0819 : // Russian Moldova
    case 0x0c1a : // Serbian Cyrillic
    case 0x0444 : // Tatar
    case 0x0442 : // Turkmen
    case 0x0422 : // Ukrainian
    case 0x0843 : // Uzbek Cyrillic
        return 1251;

    // ---- 1253: Greek ----
    case 0x0408 : // Greek
        return 1253;

    // ---- 1254: Turkish ----
    case 0x041f : // Turkish
        return 1254;

    // ---- 1255: Hebrew ----
    case 0x040d : // Hebrew
        return 1255;

    // ---- 1256: Arabic ----
    case 0x0401 : // Arabic
    case 0x1401 : // Arabic Algeria
    case 0x3c01 : // Arabic Bahrain
    case 0x0c01 : // Arabic Egypt
    case 0x0001 : // Arabic General
    case 0x0801 : // Arabic Iraq
    case 0x2c01 : // Arabic Jordan
    case 0x3401 : // Arabic Kuwait
    case 0x3001 : // Arabic Lebanon
    case 0x1001 : // Arabic Libya
    case 0x1801 : // Arabic Morocco
    case 0x2001 : // Arabic Oman
    case 0x4001 : // Arabic Qatar
    case 0x2801 : // Arabic Syria
    case 0x1c01 : // Arabic Tunisia
    case 0x3801 : // Arabic U.A.E.
    case 0x2401 : // Arabic Yemen
        return 1256;

    // ---- 1257: Baltic ----
    case 0x0425 : // Estonian
    case 0x0426 : // Latvian
    case 0x0427 : // Lithuanian
    case 0x0827 : // Lithuanian Classic
        return 1257;

    // ---- 932: Japanese ----
    case 0x0411 : // Japanese
        return 932;

    // ---- 950: Chinese ----
    case 0x0804 : // Chinese China
    case 0x0004 : // Chinese General
    case 0x0c04 : // Chinese Hong Kong (also listed as Chinese Macao)
    case 0x1004 : // Chinese Singapore
    case 0x0404 : // Chinese Taiwan
        return 950;

    // ---- 1252: Western European and everything else listed ----
    case 0x0436 : // Afrikaans
    case 0x042b : // Armenian
    case 0x044d : // Assamese
    case 0x042c : // Azeri Latin
    case 0x042d : // Basque
    case 0x0445 : // Bengali
    case 0x0455 : // Burmese
    case 0x0403 : // Catalan
    case 0x0406 : // Danish
    case 0x0813 : // Dutch Belgium
    case 0x0413 : // Dutch Standard
    case 0x0c09 : // English Australia
    case 0x2809 : // English Belize
    case 0x0809 : // English British
    case 0x1009 : // English Canada
    case 0x2409 : // English Caribbean
    case 0x0009 : // English General
    case 0x1809 : // English Ireland
    case 0x2009 : // English Jamaica
    case 0x1409 : // English New Zealand
    case 0x3409 : // English Philippines
    case 0x1c09 : // English South Africa
    case 0x2c09 : // English Trinidad
    case 0x0409 : // English United States (also listed as English Zimbabwe)
    case 0x0438 : // Faeroese
    case 0x0429 : // Farsi
    case 0x040b : // Finnish
    case 0x040c : // French
    case 0x080c : // French Belgium
    case 0x2c0c : // French Cameroon
    case 0x0c0c : // French Canada
    case 0x300c : // French Cote d'Ivoire
    case 0x140c : // French Luxemburg
    case 0x340c : // French Mali
    case 0x180c : // French Monaco
    case 0x200c : // French Reunion
    case 0x280c : // French Senegal
    case 0x100c : // French Swiss
    case 0x1c0c : // French West Indies
    case 0x240c : // French Zaire
    case 0x0462 : // Frisian
    case 0x043c : // Gaelic
    case 0x083c : // Gaelic Ireland
    case 0x0456 : // Galician
    case 0x0437 : // Georgian
    case 0x0407 : // German
    case 0x0c07 : // German Austrian
    case 0x1407 : // German Liechtenstein
    case 0x1007 : // German Luxemburg
    case 0x0807 : // German Switzerland
    case 0x0447 : // Gujarati
    case 0x0439 : // Hindi
    case 0x040f : // Icelandic
    case 0x0421 : // Indonesian
    case 0x0410 : // Italian
    case 0x0810 : // Italian Switzerland
    case 0x044b : // Kannada
    case 0x0460 : // Kashmiri
    case 0x0860 : // Kashmiri India
    case 0x0453 : // Khmer
    case 0x0440 : // Kirghiz
    case 0x0457 : // Konkani
    case 0x0412 : // Korean
    case 0x0812 : // Korean Johab
    case 0x0454 : // Lao
    case 0x043e : // Macedonian (also listed as Malay)
    case 0x083e : // Malay Brunei Darussalam
    case 0x044c : // Malayalam
    case 0x043a : // Maltese
    case 0x0458 : // Manipuri
    case 0x044e : // Marathi
    case 0x0450 : // Mongolian
    case 0x0461 : // Nepali
    case 0x0861 : // Nepali India
    case 0x0414 : // Norwegian Bokmal
    case 0x0814 : // Norwegian Nynorsk
    case 0x0448 : // Oriya
    case 0x0416 : // Portuguese Brazil
    case 0x0816 : // Portuguese Iberian
    case 0x0446 : // Punjabi
    case 0x0417 : // Rhaeto-Romanic
    case 0x043b : // Sami Lappish
    case 0x044f : // Sanskrit
    case 0x0459 : // Sindhi
    case 0x042e : // Sorbian
    case 0x2c0a : // Spanish Argentina
    case 0x400a : // Spanish Bolivia
    case 0x340a : // Spanish Chile
    case 0x240a : // Spanish Colombia
    case 0x140a : // Spanish Costa Rica
    case 0x1c0a : // Spanish Dominican Republic
    case 0x300a : // Spanish Ecuador
    case 0x440a : // Spanish El Salvador
    case 0x100a : // Spanish Guatemala
    case 0x480a : // Spanish Honduras
    case 0x080a : // Spanish Mexico
    case 0x0c0a : // Spanish Modern
    case 0x4c0a : // Spanish Nicaragua
    case 0x180a : // Spanish Panama
    case 0x3c0a : // Spanish Paraguay
    case 0x280a : // Spanish Peru
    case 0x500a : // Spanish Puerto Rico
    case 0x040a : // Spanish Traditional
    case 0x380a : // Spanish Uruguay
    case 0x200a : // Spanish Venezuela
    case 0x0430 : // Sutu
    case 0x0441 : // Swahili
    case 0x041d : // Swedish
    case 0x081d : // Swedish Finland
    case 0x0428 : // Tajik
    case 0x0449 : // Tamil
    case 0x044a : // Telugu
    case 0x041e : // Thai
    case 0x0451 : // Tibetan
    case 0x0431 : // Tsonga
    case 0x0432 : // Tswana
    case 0x0420 : // Urdu
    case 0x0820 : // Urdu India
    case 0x0443 : // Uzbek Latin
    case 0x0433 : // Venda
    case 0x042a : // Vietnamese
    case 0x0452 : // Welsh
    case 0x0434 : // Xhosa
    case 0x043d : // Yiddish
    case 0x0435 : // Zulu
        return 1252;

    default:
        return 1251;
    }
}
945
/// Translate a Windows language identifier (LCID) into a language code.
///
/// @param lang  numeric language identifier
/// @return pointer to a static language code string (ISO 639 style, e.g. "en"),
///         or NULL when the identifier is not in the table
const char* langToLanguage( int lang )
{
    switch ( lang )
    {
    case 0x0436 : // Afrikaans
        return "af";
    case 0x041c : // Albanian
        return "sq";
    case 0x0401 : // Arabic
    case 0x1401 : // Arabic Algeria
    case 0x3c01 : // Arabic Bahrain
    case 0x0c01 : // Arabic Egypt
    case 0x0001 : // Arabic General
    case 0x0801 : // Arabic Iraq
    case 0x2c01 : // Arabic Jordan
    case 0x3401 : // Arabic Kuwait
    case 0x3001 : // Arabic Lebanon
    case 0x1001 : // Arabic Libya
    case 0x1801 : // Arabic Morocco
    case 0x2001 : // Arabic Oman
    case 0x4001 : // Arabic Qatar
    case 0x2801 : // Arabic Syria
    case 0x1c01 : // Arabic Tunisia
    case 0x3801 : // Arabic U.A.E.
    case 0x2401 : // Arabic Yemen
        return "ar";
    case 0x042b : // Armenian
        return "hy";
    case 0x044d : // Assamese
        return "as";
    case 0x082c : // Azeri Cyrillic
    case 0x042c : // Azeri Latin
        return "az";
    case 0x042d : // Basque
        return "eu";
    case 0x0445 : // Bengali
        return "bn";
    case 0x101a : // Bosnia Herzegovina
        return "hr";
    case 0x0402 : // Bulgarian
        return "bg";
    case 0x0455 : // Burmese
        return "my";
    case 0x0423 : // Byelorussian
        return "be";
    case 0x0403 : // Catalan
        return "ca";
    case 0x0804 : // Chinese China
    case 0x0004 : // Chinese General
    case 0x0c04 : // Chinese Hong Kong
    //case 0x0c04 : // Chinese Macao
    case 0x1004 : // Chinese Singapore
    case 0x0404 : // Chinese Taiwan
        return "zh";
    case 0x041a : // Croatian
        return "hr";
    case 0x0405 : // Czech
        return "cs";
    case 0x0406 : // Danish
        return "da";
    case 0x0813 : // Dutch Belgium
    case 0x0413 : // Dutch Standard
        return "nl";
    case 0x0c09 : // English Australia
    case 0x2809 : // English Belize
    case 0x0809 : // English British
    case 0x1009 : // English Canada
    case 0x2409 : // English Caribbean
    case 0x0009 : // English General
    case 0x1809 : // English Ireland
    case 0x2009 : // English Jamaica
    case 0x1409 : // English New Zealand
    case 0x3409 : // English Philippines
    case 0x1c09 : // English South Africa
    case 0x2c09 : // English Trinidad
    case 0x0409 : // English United States
    //case 0x0409 : // English Zimbabwe
        return "en";
    case 0x0425 : // Estonian
        return "et";
    case 0x0438 : // Faeroese
        return "fo";
    case 0x0429 : // Farsi
        return "fa";
    case 0x040b : // Finnish
        return "fi";
    case 0x040c : // French
    case 0x080c : // French Belgium
    case 0x2c0c : // French Cameroon
    case 0x0c0c : // French Canada
    case 0x300c : // French Cote d'Ivoire
    case 0x140c : // French Luxemburg
    case 0x340c : // French Mali
    case 0x180c : // French Monaco
    case 0x200c : // French Reunion
    case 0x280c : // French Senegal
    case 0x100c : // French Swiss
    case 0x1c0c : // French West Indies
    case 0x240c : // French Zaire
        return "fr";
    case 0x0462 : // Frisian
        return "fy";
    case 0x043c : // Gaelic
    case 0x083c : // Gaelic Ireland
        return "ga";
    case 0x0456 : // Galician
        return "gl";
    case 0x0437 : // Georgian
        return "ka";
    case 0x0407 : // German
    case 0x0c07 : // German Austrian
    case 0x1407 : // German Liechtenstein
    case 0x1007 : // German Luxemburg
    case 0x0807 : // German Switzerland
        return "de";
    case 0x0408 : // Greek
        return "el";
    case 0x0447 : // Gujarati
        return "gu";
    case 0x040d : // Hebrew
        return "he";
    case 0x0439 : // Hindi
        return "hi";
    case 0x040e : // Hungarian
        return "hu";
    case 0x040f : // Icelandic
        return "is";
    case 0x0421 : // Indonesian
        return "id";
    case 0x0410 : // Italian
    case 0x0810 : // Italian Switzerland
        return "it";
    case 0x0411 : // Japanese
        return "ja";
    case 0x044b : // Kannada
        return "kn";
    case 0x0460 : // Kashmiri
    case 0x0860 : // Kashmiri India
        return "ks";
    case 0x043f : // Kazakh
        return "kk";
    case 0x0453 : // Khmer
        return "km";
    case 0x0440 : // Kirghiz
        return "ky";
    case 0x0457 : // Konkani
        return "kok";
    case 0x0412 : // Korean
    case 0x0812 : // Korean Johab
        return "ko";
    case 0x0454 : // Lao
        return "lo";
    case 0x0426 : // Latvian
        return "lv";
    case 0x0427 : // Lithuanian
    case 0x0827 : // Lithuanian Classic
        return "lt";
    case 0x042f : // Macedonian
        return "mk";
    case 0x043e : // Malay (this id was previously mislabelled as Macedonian)
    case 0x083e : // Malay Brunei Darussalam
        return "ms";
    case 0x044c : // Malayalam
        return "ml";
    case 0x043a : // Maltese
        return "mt";
    case 0x0458 : // Manipuri
        return "mni";
    case 0x044e : // Marathi
        return "mr";
    case 0x0450 : // Mongolian
        return "mn";
    case 0x0461 : // Nepali
    case 0x0861 : // Nepali India
        return "ne";
    case 0x0414 : // Norwegian Bokmal
        return "nb";
    case 0x0814 : // Norwegian Nynorsk
        return "nn";
    case 0x0448 : // Oriya
        return "or";
    case 0x0415 : // Polish
        return "pl";
    case 0x0416 : // Portuguese Brazil
    case 0x0816 : // Portuguese Iberian
        return "pt";
    case 0x0446 : // Punjabi
        return "pa";
    case 0x0417 : // Rhaeto-Romanic
        return "rm";
    case 0x0418 : // Romanian
    case 0x0818 : // Romanian Moldova
        return "ro";
    case 0x0419 : // Russian
    case 0x0819 : // Russian Moldova
        return "ru";
    case 0x043b : // Sami Lappish
        return "se";
    case 0x044f : // Sanskrit
        return "sa";
    case 0x0c1a : // Serbian Cyrillic
    case 0x081a : // Serbian Latin
        return "sr";
    case 0x0459 : // Sindhi
        return "sd";
    case 0x041b : // Slovak
        return "sk";
    case 0x0424 : // Slovenian
        return "sl";
    case 0x042e : // Sorbian
        return "hsb";
    case 0x2c0a : // Spanish Argentina
    case 0x400a : // Spanish Bolivia
    case 0x340a : // Spanish Chile
    case 0x240a : // Spanish Colombia
    case 0x140a : // Spanish Costa Rica
    case 0x1c0a : // Spanish Dominican Republic
    case 0x300a : // Spanish Ecuador
    case 0x440a : // Spanish El Salvador
    case 0x100a : // Spanish Guatemala
    case 0x480a : // Spanish Honduras
    case 0x080a : // Spanish Mexico
    case 0x0c0a : // Spanish Modern
    case 0x4c0a : // Spanish Nicaragua
    case 0x180a : // Spanish Panama
    case 0x3c0a : // Spanish Paraguay
    case 0x280a : // Spanish Peru
    case 0x500a : // Spanish Puerto Rico
    case 0x040a : // Spanish Traditional
    case 0x380a : // Spanish Uruguay
    case 0x200a : // Spanish Venezuela
        return "es";
    case 0x0430 : // Sutu
        return "st";
    case 0x0441 : // Swahili
        return "sw";
    case 0x041d : // Swedish
    case 0x081d : // Swedish Finland
        return "sv";
    case 0x0428 : // Tajik
        return "tg";
    case 0x0449 : // Tamil
        return "ta";
    case 0x0444 : // Tatar
        return "tt";
    case 0x044a : // Telugu
        return "te";
    case 0x041e : // Thai
        return "th";
    case 0x0451 : // Tibetan
        return "bo";
    case 0x0431 : // Tsonga
        return "ts";
    case 0x0432 : // Tswana
        return "tn";
    case 0x041f : // Turkish
        return "tr";
    case 0x0442 : // Turkmen
        return "tk";
    case 0x0422 : // Ukrainian
        return "uk";
    case 0x0420 : // Urdu
    case 0x0820 : // Urdu India
        return "ur";
    case 0x0843 : // Uzbek Cyrillic
    case 0x0443 : // Uzbek Latin
        return "uz";
    case 0x0433 : // Venda
        return "ve";
    case 0x042a : // Vietnamese
        return "vi";
    case 0x0452 : // Welsh
        return "cy";
    case 0x0434 : // Xhosa
        return "xh";
    case 0x043d : // Yiddish
        return "yi";
    case 0x0435 : // Zulu
        return "zu";
    default:
        return NULL;
    }
}
1227
GetCharsetByte2UnicodeTable(int codepage)1228 const lChar32 * GetCharsetByte2UnicodeTable( int codepage )
1229 {
1230 switch ( codepage )
1231 {
1232 case 1251:
1233 return __cp1251;
1234 case 1257:
1235 return __cp1257;
1236 case 204:
1237 return __cp1251;
1238 case 1252:
1239 return __cp1252;
1240 case 1253:
1241 return __cp1253;
1242 case 1254:
1243 return __cp1254;
1244 case 737:
1245 return __cp737;
1246 case 1250: return __cp1250;
1247 case 866: return __cp866;
1248 case 850: return __cp850;
1249 default: return __cp1252;
1250 }
1251 }
1252
GetCharsetName(int codepage)1253 const lChar32 * GetCharsetName( int codepage )
1254 {
1255 switch ( codepage )
1256 {
1257 case 1251:
1258 return U"cp1251";
1259 case 1257:
1260 return U"cp1257";
1261 case 204:
1262 return U"cp1251";
1263 case 1252:
1264 return U"cp1252";
1265 case 1253:
1266 return U"cp1253";
1267 case 737:
1268 return U"cp737";
1269 case 1250: return U"cp1250";
1270 case 866: return U"cp866";
1271 case 850: return U"cp850";
1272 default: return U"cp1252";
1273 }
1274 }
1275
// Unicode page 0x00 (U+0000..U+00FF) -> cp1252 byte.
// Identity everywhere except U+0080..U+009F, which have no entry (0x00).
static unsigned char cp1252_page00[256] = {
    /* 0x00 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    /* 0x10 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    /* 0x20 */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
    /* 0x30 */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    /* 0x40 */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
    /* 0x50 */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    /* 0x60 */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    /* 0x70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,

    /* 0x80 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    /* 0x90 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    /* 0xa0 */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    /* 0xb0 */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    /* 0xc0 */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    /* 0xd0 */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    /* 0xe0 */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    /* 0xf0 */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
};
1311
1312 static unsigned char *cp1252_page_uni2charset[256] = {
1313 cp1252_page00, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1314 };
1315
// Unicode page 0x00 (U+0000..U+00FF) -> cp1251 byte; 0x00 means "no mapping".
// ASCII is identity. In the upper half only the code points that cp1251
// actually contains are mapped. Rows 0xc0-0xff are empty: they used to carry a
// copy-pasted 0xbb that turned U+00C3, U+00CB, ... U+00FB into '»'.
static unsigned char cp1251_page00[256] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
    0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
    0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
    0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
    0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
    0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
    0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
    0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */
    0x00, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */
    0xb0, 0xb1, 0x00, 0x00, 0x00, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
    0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
};
1351
// Unicode page 0x04 (U+0400..U+04FF, Cyrillic) -> cp1251 byte; 0x00 means "no mapping".
// The 0x90 row maps U+0490/U+0491 to 0xa5/0xb4. It was previously shifted by
// one position, so it encoded U+0491/U+0492 instead.
static unsigned char cp1251_page04[256] = {
    0x00, 0xa8, 0x80, 0x81, 0xaa, 0xbd, 0xb2, 0xaf, /* 0x00-0x07 */
    0xa3, 0x8a, 0x8c, 0x8e, 0x8d, 0x00, 0xa1, 0x8f, /* 0x08-0x0f */
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x10-0x17 */
    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x18-0x1f */
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x20-0x27 */
    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0x28-0x2f */
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x30-0x37 */
    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x38-0x3f */
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x40-0x47 */
    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0x48-0x4f */
    0x00, 0xb8, 0x90, 0x83, 0xba, 0xbe, 0xb3, 0xbf, /* 0x50-0x57 */
    0xbc, 0x9a, 0x9c, 0x9e, 0x9d, 0x00, 0xa2, 0x9f, /* 0x58-0x5f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */

    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
    0xa5, 0xb4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
};
1374
// Unicode page 0x20 (U+2000..U+20FF, punctuation and currency) -> cp1251 byte;
// 0x00 means "no mapping". Includes U+20AC (euro sign) -> 0x88, which was
// missing from the table.
static unsigned char cp1251_page20[256] = {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
    0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */
    0x91, 0x92, 0x82, 0x00, 0x93, 0x94, 0x84, 0x00, /* 0x18-0x1f */
    0x86, 0x87, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
    0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
    0x00, 0x8b, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */

    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
    0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
};
1385
// Unicode page 0x21 (U+2100..U+21FF) -> cp1251 byte; 0x00 means "no mapping".
// Only two entries are populated: U+2116 -> 0xb9 and U+2122 -> 0x99.
static unsigned char cp1251_page21[256] = {
    /* 0x00 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    /* 0x10 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    /* 0x20 */ 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00,
};
1393
1394 static unsigned char *cp1251_page_uni2charset[256] = {
1395 cp1251_page00, NULL, NULL, NULL, cp1251_page04, NULL, NULL, NULL,
1396 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1397 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1398 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1399 cp1251_page20, cp1251_page21, NULL, NULL, NULL, NULL, NULL, NULL,
1400 };
1401
/// Encoding name -> Unicode-to-byte page table registry.
/// Names are stored in lowercase; the list ends with a {NULL, NULL} sentinel.
/// Add other encodings here.
static struct {
    const char * name;       // encoding name to match against
    unsigned char ** table;  // per-page lookup (array of page tables)
} _uni2byte_enc_table[] = {
    {"windows-1251", cp1251_page_uni2charset},
    {"cp1251", cp1251_page_uni2charset},
    {"windows-1252", cp1252_page_uni2charset},
    {"cp1252", cp1252_page_uni2charset},
    {NULL, NULL} // end-of-table marker
};
1413
GetCharsetUnicode2ByteTable(const lChar32 * enc_name)1414 const lChar8 ** GetCharsetUnicode2ByteTable( const lChar32 * enc_name )
1415 {
1416 lString32 s( enc_name );
1417 s.lowercase();
1418 const lChar32 * encoding_name = s.c_str();
1419 for (int i=0; _uni2byte_enc_table[i].name!=NULL; i++)
1420 {
1421 if ( !lStr_cmp(encoding_name, _uni2byte_enc_table[i].name) )
1422 {
1423 return (const lChar8 **)_uni2byte_enc_table[i].table;
1424 }
1425 }
1426 return NULL; // not found
1427 }
1428
1429
1430
1431 // AUTODETECT ENCODINGS feature
1432 #define DBL_CHAR_STAT_SIZE 256
1433
1434 class CDoubleCharStat
1435 {
1436
1437 struct CDblCharNode
1438 {
1439 unsigned char ch1;
1440 unsigned char ch2;
1441 unsigned int count;
1442 unsigned int index;
1443 CDblCharNode * left;
1444 CDblCharNode * right;
1445 CDblCharNode * sleft;
1446 CDblCharNode * sright;
CDblCharNodeCDoubleCharStat::CDblCharNode1447 CDblCharNode( unsigned char c1, unsigned char c2 ) :
1448 ch1(c1), ch2(c2), count(1), index(0), left(NULL), right(NULL),
1449 sleft(NULL), sright(NULL)
1450 {
1451 }
~CDblCharNodeCDoubleCharStat::CDblCharNode1452 ~CDblCharNode()
1453 {
1454 if (left)
1455 delete left;
1456 if (right)
1457 delete right;
1458 }
operator <CDoubleCharStat::CDblCharNode1459 bool operator < (const CDblCharNode & node )
1460 {
1461 return (ch1<node.ch2) || (ch1==node.ch1 && ch2<node.ch2);
1462 }
operator ==CDoubleCharStat::CDblCharNode1463 bool operator == (const CDblCharNode & node )
1464 {
1465 return (ch1==node.ch1) && (ch2=node.ch2);
1466 }
AddCDoubleCharStat::CDblCharNode1467 static inline void Add( CDblCharNode * & pnode, unsigned char c1, unsigned char c2 )
1468 {
1469 if (pnode)
1470 pnode->Add( c1, c2 );
1471 else
1472 pnode = new CDblCharNode( c1, c2 );
1473 }
AddCDoubleCharStat::CDblCharNode1474 void Add( unsigned char c1, unsigned char c2 )
1475 {
1476 if (c1==ch1 && c2==ch2) {
1477 count++; // found
1478 } else if (c1<ch1 || (c1==ch1 && c2<ch2) ) {
1479 Add(left, c1, c2 );
1480 } else {
1481 Add(right, c1, c2 );
1482 }
1483 }
AddSortedCDoubleCharStat::CDblCharNode1484 void AddSorted( CDblCharNode * & sroot )
1485 {
1486 if (!sroot)
1487 sroot = this;
1488 else if (count>sroot->count)
1489 AddSorted( sroot->sleft );
1490 else
1491 AddSorted( sroot->sright );
1492 }
SortCDoubleCharStat::CDblCharNode1493 void Sort( CDblCharNode * & sroot )
1494 {
1495 if (left)
1496 left->Sort( sroot );
1497 AddSorted( sroot );
1498 if (right)
1499 right->Sort( sroot );
1500 }
RenumberCDoubleCharStat::CDblCharNode1501 void Renumber( int & curr_index )
1502 {
1503 if (sleft)
1504 sleft->Renumber( curr_index );
1505 index = curr_index++;
1506 if (sright)
1507 sright->Renumber( curr_index );
1508 }
Renumber1CDoubleCharStat::CDblCharNode1509 void Renumber1( int & curr_index )
1510 {
1511 if (left)
1512 left->Renumber1( curr_index );
1513 index = curr_index++;
1514 if (right)
1515 right->Renumber1( curr_index );
1516 }
GetDataCDoubleCharStat::CDblCharNode1517 void GetData( dbl_char_stat_long_t * & pData, int & len, unsigned int maxindex )
1518 {
1519 if (len<=0)
1520 return;
1521 if (left)
1522 left->GetData( pData, len, maxindex );
1523 if (len<=0)
1524 return;
1525 if (index<maxindex)
1526 {
1527 pData->ch1 = ch1;
1528 pData->ch2 = ch2;
1529 pData->count = count;
1530 pData++;
1531 len--;
1532 }
1533 if (len<=0)
1534 return;
1535 if (right)
1536 right->GetData( pData, len, maxindex );
1537 }
1538 };
1539
1540 CDblCharNode * nodes;
1541 int total;
1542 public:
CDoubleCharStat()1543 CDoubleCharStat() : nodes(NULL), total(0)
1544 {
1545 }
Add(unsigned char c1,unsigned char c2)1546 void Add( unsigned char c1, unsigned char c2 )
1547 {
1548 /* if ( !(c1>127 || c1>='a' && c1<='z' || c1>='A' && c1<='Z' || c1=='\'')
1549 && !(c2>127 || c2>='a' && c2<='z' || c2>='A' && c2<='Z' || c2=='\'') )
1550 {
1551 return;
1552 }
1553 */
1554 if (c1==' ' && c2==' ')
1555 return;
1556 total++;
1557 CDblCharNode::Add( nodes, c1, c2 );
1558 }
GetData(dbl_char_stat_t * pData,int len)1559 void GetData( dbl_char_stat_t * pData, int len )
1560 {
1561 dbl_char_stat_long_t data[DBL_CHAR_STAT_SIZE];
1562 dbl_char_stat_long_t * pData2 = data;
1563 int len2 = len;
1564 int idx = 0;
1565 if (nodes && total)
1566 {
1567 nodes->Renumber1( idx );
1568 idx = 0;
1569 if (nodes->left)
1570 nodes->left->Sort(nodes);
1571 if (nodes->right)
1572 nodes->right->Sort(nodes);
1573 //nodes->Sort( nodes );
1574 nodes->Renumber( idx );
1575 nodes->GetData( pData2, len2, len2 );
1576 }
1577 // fill rest of array
1578 for ( ; len2>0; len2--, pData2++ ) {
1579 pData2->ch1 = 0;
1580 pData2->ch2 = 0;
1581 pData2->count = 0;
1582 }
1583 // scale by total
1584 if (total) {
1585 for (int i=0; i<len; i++) {
1586 if ( data[i].count<0 ) {
1587 data[i].count = -data[i].count;
1588 }
1589 data[i].count = (int)(data[i].count * (lInt64)0x7000 / total);
1590 }
1591 }
1592 for ( int i=0; i<len; i++ ) {
1593 pData[i].ch1 = data[i].ch1;
1594 pData[i].ch2 = data[i].ch2;
1595 pData[i].count = data[i].count;
1596 }
1597 Close();
1598 }
Close()1599 void Close()
1600 {
1601 if (nodes)
1602 delete nodes;
1603 nodes = NULL;
1604 total = 0;
1605 }
~CDoubleCharStat()1606 virtual ~CDoubleCharStat()
1607 {
1608 Close();
1609 }
1610 };
1611
sort_dblstats_by_count(const void * p1,const void * p2)1612 int sort_dblstats_by_count( const void * p1, const void * p2 )
1613 {
1614 int n1 = static_cast<const dbl_char_stat_long_t*>(p1)->count;
1615 int n2 = static_cast<const dbl_char_stat_long_t*>(p2)->count;
1616 if ( n1>n2 )
1617 return -1;
1618 else if ( n2>n1 )
1619 return 1;
1620 else
1621 return 0;
1622 }
1623
sort_dblstats_by_ch(const void * p1,const void * p2)1624 int sort_dblstats_by_ch( const void * p1, const void * p2 )
1625 {
1626 const dbl_char_stat_long_t* n1 = static_cast<const dbl_char_stat_long_t*>(p1);
1627 const dbl_char_stat_long_t* n2 = static_cast<const dbl_char_stat_long_t*>(p2);
1628 if ( n1->ch1>n2->ch1 )
1629 return 1;
1630 else if ( n1->ch1<n2->ch1 )
1631 return -1;
1632 if ( n1->ch2>n2->ch2 )
1633 return 1;
1634 else if ( n1->ch2<n2->ch2 )
1635 return -1;
1636 else
1637 return 0;
1638 }
1639
1640 class CDoubleCharStat2
1641 {
1642 private:
1643 lUInt16 * * stats;
1644 int total;
1645 int items;
1646 public:
CDoubleCharStat2()1647 CDoubleCharStat2() : stats(NULL), total(0), items(0)
1648 {
1649 }
Add(unsigned char c1,unsigned char c2)1650 void Add( unsigned char c1, unsigned char c2 )
1651 {
1652 if ( !stats ) {
1653 stats = new lUInt16* [256]();
1654 }
1655 if (c1==' ' && c2==' ')
1656 return;
1657 total++;
1658 if ( stats[c1]==NULL ) {
1659 stats[c1] = new lUInt16[256]();
1660 }
1661 if ( stats[c1][c2]++ == 0)
1662 items++;
1663 }
GetData(dbl_char_stat_t * pData,int len)1664 void GetData( dbl_char_stat_t * pData, int len )
1665 {
1666 int count = 0;
1667 dbl_char_stat_long_t * pdata = new dbl_char_stat_long_t[items];
1668 if ( total ) {
1669 for ( int i=0; i<256; i++ ) {
1670 if ( stats[i] ) {
1671 for ( int j=0; j<256; j++ ) {
1672 if ( stats[i][j]> 0 ) {
1673 pdata[count].ch1 = i;
1674 pdata[count].ch2 = j;
1675 int n = stats[i][j];
1676 n = (int)(n * (lInt64)0x7000 / total);
1677 pdata[count].count = n;
1678 count++;
1679 }
1680 }
1681 }
1682 }
1683 qsort(pdata, count, sizeof(dbl_char_stat_long_t), sort_dblstats_by_count);
1684 int nsort = count;
1685 if ( nsort>len )
1686 nsort = len;
1687 qsort(pdata, nsort, sizeof(dbl_char_stat_long_t), sort_dblstats_by_ch);
1688 }
1689 // copy data to destination
1690 for ( int k=0; k<len; k++ ) {
1691 if ( k<count ) {
1692 pData[k].ch1 = pdata[k].ch1;
1693 pData[k].ch2 = pdata[k].ch2;
1694 pData[k].count = pdata[k].count;
1695 } else {
1696 pData[k].ch1 = 0;
1697 pData[k].ch2 = 0;
1698 pData[k].count = 0;
1699 }
1700 }
1701 delete[] pdata;
1702 Close();
1703 }
1704
Close()1705 void Close()
1706 {
1707 if ( stats ) {
1708 for ( int i=0; i<256; i++ )
1709 if ( stats[i] )
1710 delete[] stats[i];
1711 delete[] stats;
1712 stats = NULL;
1713 }
1714 total = 0;
1715 }
1716
~CDoubleCharStat2()1717 virtual ~CDoubleCharStat2()
1718 {
1719 Close();
1720 }
1721 };
1722
isValidUtf8Data(const unsigned char * buf,int buf_size)1723 bool isValidUtf8Data( const unsigned char * buf, int buf_size )
1724 {
1725 const unsigned char * start = buf;
1726 const unsigned char * end_buf = buf + buf_size - 5;
1727 while ( buf < end_buf ) {
1728 lUInt8 ch = *buf++;
1729 if ( (ch & 0x80) == 0 ) {
1730 } else if ( (ch & 0xC0) == 0x80 ) {
1731 CRLog::trace("unexpected char %02x at position %x, str=%s", ch, (buf-1-start), lString8((const char *)(buf-1), 32).c_str());
1732 return false;
1733 } else if ( (ch & 0xE0) == 0xC0 ) {
1734 ch = *buf++;
1735 if ( (ch & 0xC0) != 0x80 ) {
1736 CRLog::trace("unexpected char %02x at position %x, str=%s", ch, (buf-1-start), lString8((const char *)(buf-1), 32).c_str());
1737 return false;
1738 }
1739 } else if ( (ch & 0xF0) == 0xE0 ) {
1740 ch = *buf++;
1741 if ( (ch & 0xC0) != 0x80 )
1742 return false;
1743 ch = *buf++;
1744 if ( (ch & 0xC0) != 0x80 )
1745 return false;
1746 } else if ( (ch & 0xF8) == 0xF0 ) {
1747 ch = *buf++;
1748 if ( (ch & 0xC0) != 0x80 )
1749 return false;
1750 ch = *buf++;
1751 if ( (ch & 0xC0) != 0x80 )
1752 return false;
1753 ch = *buf++;
1754 if ( (ch & 0xC0) != 0x80 )
1755 return false;
1756 } else {
1757 return false;
1758 }
1759 }
1760 return true;
1761 }
1762
MakeDblCharStat(const unsigned char * buf,int buf_size,dbl_char_stat_t * stat,int stat_len,bool skipHtml)1763 void MakeDblCharStat(const unsigned char * buf, int buf_size, dbl_char_stat_t * stat, int stat_len, bool skipHtml)
1764 {
1765 CDoubleCharStat2 maker;
1766 unsigned char ch1=' ';
1767 unsigned char ch2=' ';
1768 bool insideTag = false;
1769 for ( int i=1; i<buf_size; i++) {
1770 lChar8 ch = buf[i];
1771 if (skipHtml) {
1772 if (ch == '<') {
1773 insideTag = true;
1774 continue;
1775 } else if (ch == '>') {
1776 insideTag = false;
1777 ch = ' ';
1778 }
1779 }
1780 if (insideTag)
1781 continue;
1782 ch1 = ch2;
1783 ch2 = ch;
1784 if ( ch2<128 && ch2!='\'' && !( (ch2>='a' && ch2<='z') || (ch2>='A' && ch2<='Z')) )
1785 ch2 = ' ';
1786 //if (i>0)
1787 maker.Add( ch1, ch2 );
1788 }
1789 maker.GetData( stat, stat_len );
1790 }
1791
/// Build single-byte frequency statistics for a buffer.
///
/// Only bytes above 127, ASCII letters and the apostrophe are counted. Each
/// entry of stat_table receives the byte's share of the counted bytes scaled to
/// 0x7000. When nothing was counted the whole table is zeroed, so the caller
/// never reads uninitialized entries.
///
/// @param buf         input data
/// @param buf_size    number of bytes in buf
/// @param stat_table  receives 256 scaled frequencies
/// @param skipHtml    ignore markup between '<' and '>'
void MakeCharStat(const unsigned char * buf, int buf_size, short stat_table[256], bool skipHtml)
{
    int stat[256] = { 0 };
    int total=0;
    bool insideTag = false;
    for (int i=0; i<buf_size; i++) {
        unsigned char ch = buf[i];
        if (skipHtml) {
            if (ch == '<') {
                insideTag = true;
                continue;
            }
            if (ch == '>') {
                insideTag = false;
                continue;
            }
            if (insideTag)
                continue;
        }
        if ( ch>127 || (ch>='a' && ch<='z') || (ch>='A' && ch<='Z') || ch=='\'') {
            stat[ch]++;
            total++;
        }
    }
    for (int i=0; i<256; i++) {
        // 64-bit intermediate: stat[i] * 0x7000 can exceed the int range
        stat_table[i] = total ? (short)(stat[i] * (long long)0x7000 / total) : (short)0;
    }
}
1823
/// Compare two 256-entry single-byte statistics tables.
///
/// @param stat1  first table (256 entries)
/// @param stat2  second table (256 entries)
/// @param k1     receives the sum of products of corresponding entries,
///               each product divided by 0x7000 twice
/// @param k2     same sum restricted to entries 128..255
/// @return sum of absolute differences, divided by 0x7000 and then by 256
double CompareCharStats( const short * stat1, const short * stat2, double &k1, double &k2 )
{
    double absDiff = 0;
    double dotAll = 0;
    double dotHigh = 0;
    for (int i=0; i<256; i++) {
        const double product = (double)stat1[i] * stat2[i] / 0x7000 / 0x7000;
        dotAll += product;
        if (i>=128)
            dotHigh += product;
        const int delta = stat1[i] - stat2[i];
        absDiff += (delta<0) ? -delta : delta;
    }
    absDiff /= 0x7000;
    k1 = dotAll;
    k2 = dotHigh;
    return absDiff / 256;
}
1843
CompareDblCharStats(const dbl_char_stat_t * stat1,const dbl_char_stat_t * stat2,int stat_len,double & k1,double & k2)1844 double CompareDblCharStats( const dbl_char_stat_t * stat1, const dbl_char_stat_t * stat2, int stat_len, double &k1, double &k2 )
1845 {
1846 double sum = 0;
1847 int len1 = stat_len;
1848 int len2 = stat_len;
1849 double psum = 0;
1850 double psum2 = 0;
1851 while (len1 && len2) {
1852 //
1853 if (stat1->ch1==stat2->ch1 && stat1->ch2==stat2->ch2) {
1854 if (stat1->ch1 != ' ' || stat1->ch2 != ' ') {
1855 // add stat
1856 int delta = (stat1->count - stat2->count);
1857 if (delta<0)
1858 delta = -delta;
1859 sum += delta;
1860 psum += ( (double)stat1->count * stat2->count / 0x7000 / 0x7000);
1861 if (stat1->ch1>=128 || stat1->ch2>=128)
1862 psum2 += ( (double)stat1->count * stat2->count / 0x7000 / 0x7000);
1863 }
1864 // move both
1865 stat1++;
1866 len1--;
1867 stat2++;
1868 len2--;
1869 } else if ( stat1->ch1<stat2->ch1 || (stat1->ch1==stat2->ch1 && stat1->ch2<stat2->ch2) ) {
1870 // add stat
1871 //int delta = (stat1->count);
1872 sum += stat1->count;
1873 // move 1st
1874 stat1++;
1875 len1--;
1876 } else {
1877 // add stat
1878 //int delta = (stat2->count);
1879 sum += stat2->count;
1880 stat2++;
1881 len2--;
1882 }
1883 }
1884 sum /= 0x7000;
1885 k1 = psum;
1886 k2 = psum2;
1887 return sum / stat_len;
1888 }
1889
1890
1891 //==========================================
1892 // Stats
/// One reference entry of the codepage statistics table.
typedef struct {
    const short * ch_stat; // single-byte statistics table (original note: 256 entries)
    const dbl_char_stat_t * dbl_ch_stat; // byte-pair statistics table
    char * cp_name;    // codepage name
    char * lang_name;  // language name
} cp_stat_t;
1899 // EXTERNAL DEFINE
1900 extern cp_stat_t cp_stat_table[];
1901
AutodetectCodePageUtf(const unsigned char * buf,int buf_size,char * cp_name,char * lang_name)1902 int AutodetectCodePageUtf( const unsigned char * buf, int buf_size, char * cp_name, char * lang_name )
1903 {
1904 // checking byte order signatures
1905 if ( buf[0]==0xEF && buf[1]==0xBB && buf[2]==0xBF ) {
1906 strcpy( cp_name, "utf-8" ); // NOLINT: strcpy is fine with hardcoded string with len < 32
1907 strcpy( lang_name, "en" ); // NOLINT
1908 return 1;
1909 } else if ( buf[0]==0 && buf[1]==0 && buf[2]==0xFE && buf[3]==0xFF ) {
1910 strcpy( cp_name, "utf-32be" ); // NOLINT
1911 strcpy( lang_name, "en" ); // NOLINT
1912 return 1;
1913 } else if ( buf[0]==0xFE && buf[1]==0xFF ) {
1914 strcpy( cp_name, "utf-16be" ); // NOLINT
1915 strcpy( lang_name, "en" ); // NOLINT
1916 return 1;
1917 } else if ( buf[0]==0xFF && buf[1]==0xFE && buf[2]==0 && buf[3]==0 ) {
1918 strcpy( cp_name, "utf-32le" ); // NOLINT
1919 strcpy( lang_name, "en" ); // NOLINT
1920 return 1;
1921 } else if ( buf[0]==0xFF && buf[1]==0xFE ) {
1922 strcpy( cp_name, "utf-16le" ); // NOLINT
1923 strcpy( lang_name, "en" ); // NOLINT
1924 return 1;
1925 }
1926 if ( isValidUtf8Data( buf, buf_size ) ) {
1927 strcpy( cp_name, "utf-8" ); // NOLINT
1928 strcpy( lang_name, "en" ); // NOLINT
1929 return 1;
1930 }
1931 return 0;
1932 }
1933
/// Case-insensitive (ASCII only) comparison of buf against pattern.
///
/// At most len bytes are compared and the comparison stops at the end of
/// pattern, so a pattern shorter than len acts as a prefix test. A NUL byte in
/// buf is an ordinary byte that compares lower than any pattern character; it
/// no longer ends the comparison with an "equal" result.
///
/// @param buf      data to compare; must provide min(len, strlen(pattern)) bytes
///                 or be NUL-terminated earlier
/// @param pattern  NUL-terminated pattern
/// @param len      maximum number of bytes to compare
/// @return -1, 0 or 1 when buf is lower than, equal to or greater than pattern
int strincmp(const unsigned char * buf, const char * pattern, int len)
{
    for (int i=0; i<len && pattern[i]; i++) {
        int ch = buf[i];
        if (ch >= 'A' && ch<='Z')
            ch += 'a' - 'A';
        // compare as unsigned bytes so that values >= 0x80 can match
        int ch2 = (unsigned char)pattern[i];
        if (ch2 >= 'A' && ch2<='Z')
            ch2 += 'a' - 'A';
        if (ch < ch2)
            return -1;
        if (ch > ch2)
            return 1;
    }
    return 0;
}
1950
strnstr(const unsigned char * buf,int buf_len,const char * pattern)1951 int strnstr(const unsigned char * buf, int buf_len, const char * pattern)
1952 {
1953 int plen = (int)strlen(pattern);
1954 for (int i=0; i<=buf_len - plen; i++) {
1955 if (!strincmp(buf + i, pattern, plen)) {
1956 return i;
1957 }
1958 }
1959 return -1;
1960 }
1961
rstrnstr(const unsigned char * buf,int buf_len,const char * pattern)1962 int rstrnstr(const unsigned char * buf, int buf_len, const char * pattern)
1963 {
1964 int plen = (int)strlen(pattern);
1965 for (int i=buf_len - plen; i>=0; i--) {
1966 if (!strincmp(buf + i, pattern, plen)) {
1967 return i;
1968 }
1969 }
1970 return -1;
1971 }
1972
detectXmlHtmlEncoding(const unsigned char * buf,int buf_len,char * html_enc_name)1973 bool detectXmlHtmlEncoding(const unsigned char * buf, int buf_len, char * html_enc_name)
1974 {
1975 int xml_p = strnstr(buf, buf_len, "<?xml");
1976 int xml_end_p = strnstr(buf, buf_len, "?>");
1977 if (xml_p >= 0 && xml_end_p > xml_p) {
1978 // XML
1979 int enc_p = strnstr(buf, buf_len, "encoding=\"");
1980 if (enc_p < xml_p || enc_p > xml_end_p)
1981 return false;
1982 enc_p += 10;
1983 int enc_end_p = strnstr(buf + enc_p, xml_end_p - enc_p, "\"");
1984 if (enc_end_p < 0 || enc_end_p > 20)
1985 return false;
1986 strncpy(html_enc_name, (char *)(buf + enc_p), enc_end_p);
1987 html_enc_name[enc_end_p] = 0;
1988 CRLog::debug("XML header encoding detected: %s", html_enc_name);
1989 return true;
1990 }
1991 int content_type_p = strnstr(buf, buf_len, "http-equiv=\"Content-Type\"");
1992 if (content_type_p >= 0) {
1993 int meta_p = rstrnstr(buf, content_type_p, "<meta");
1994 if (meta_p < 0)
1995 return false;
1996 int meta_end_p = strnstr(buf + meta_p, buf_len - meta_p, ">");
1997 if (meta_end_p < 0)
1998 return false;
1999 int charset_p = strnstr(buf + meta_p, meta_end_p, "charset=");
2000 if (charset_p < 0)
2001 return false;
2002 charset_p += 8;
2003 int charset_end_p = strnstr(buf + meta_p + charset_p, meta_end_p - charset_p, "\"");
2004 if (charset_end_p < 0)
2005 return false;
2006 strncpy(html_enc_name, (char *)(buf + meta_p + charset_p), charset_end_p);
2007 html_enc_name[charset_end_p] = 0;
2008 CRLog::debug("HTML header meta encoding detected: %s", html_enc_name);
2009 return true;
2010 }
2011 return false;
2012 }
2013
AutodetectCodePage(const unsigned char * buf,int buf_size,char * cp_name,char * lang_name,bool skipHtml)2014 int AutodetectCodePage(const unsigned char * buf, int buf_size, char * cp_name, char * lang_name, bool skipHtml)
2015 {
2016 int res = AutodetectCodePageUtf( buf, buf_size, cp_name, lang_name );
2017 if ( res )
2018 return res;
2019 // use character statistics
2020 short char_stat[256];
2021 dbl_char_stat_t dbl_char_stat[DBL_CHAR_STAT_SIZE];
2022 MakeCharStat(buf, buf_size, char_stat, skipHtml);
2023 MakeDblCharStat(buf, buf_size, dbl_char_stat, DBL_CHAR_STAT_SIZE, skipHtml);
2024 int bestn = 0;
2025 double bestq = 0; //1000000;
2026 for (int i=0; cp_stat_table[i].ch_stat; i++) {
2027 double q12, q11;
2028 double q22, q21;
2029 double q1 = CompareCharStats( cp_stat_table[i].ch_stat, char_stat, q11, q12 );
2030 double q2 = CompareDblCharStats( cp_stat_table[i].dbl_ch_stat, dbl_char_stat, DBL_CHAR_STAT_SIZE, q21, q22 );
2031 // double q_1 = q11 + 3*q12;
2032 // double q_2 = q21 + 5*q22;
2033 // double q_ = q_1 * q_2;
2034 if (q1 < 0.00001)
2035 q1 = 0.00001;
2036 if (q2 < 0.00001)
2037 q2 = 0.00001;
2038 double q = q11 * 0 + q12 * 2 + q21 * 0 + q22 * 6; //(q_>0) ? (q1*2+q2*7) / (q_) : 1000000;
2039 q = q / (q1 + q2);
2040 //CRLog::debug("%d %10s %4s : %lf %lf %lf - %lf %lf %lf : %lf", i, cp_stat_table[i].cp_name, cp_stat_table[i].lang_name, q1, q11, q12, q2, q21, q22, q);
2041 if (q > bestq) {
2042 bestn = i;
2043 bestq = q;
2044 }
2045 }
2046 strcpy(cp_name, cp_stat_table[bestn].cp_name); // NOLINT: strcpy is fine, all strings are len < 32
2047 strcpy(lang_name, cp_stat_table[bestn].lang_name); // NOLINT
2048 CRLog::debug("Detected codepage:%s lang:%s index:%d %s", cp_name, lang_name, bestn, skipHtml ? "(skipHtml)" : "");
2049 if (skipHtml) {
2050 if (detectXmlHtmlEncoding(buf, buf_size, cp_name)) {
2051 CRLog::debug("Encoding parsed from XML/HTML: %s", cp_name);
2052 }
2053 }
2054 return 1;
2055 }
2056
/**
 * Heuristic check for XML/HTML markup in a buffer.
 *
 * Counts '<' and '>' bytes. The buffer is considered to contain tags when
 * each of the two occurs more than twice and their totals differ by at
 * most one.
 *
 * @param buf   bytes to scan
 * @param size  number of bytes in buf
 * @return true when the bracket counts look like markup
 */
bool hasXmlTags(const lUInt8 * buf, int size) {
    int lt_total = 0;
    int gt_total = 0;
    for (int pos = 0; pos < size; pos++) {
        switch (buf[pos]) {
        case '<':
            lt_total++;
            break;
        case '>':
            gt_total++;
            break;
        default:
            break;
        }
    }
    // Too few brackets of either kind to call it markup.
    if (lt_total <= 2 || gt_total <= 2)
        return false;
    int imbalance = lt_total - gt_total;
    if (imbalance < 0)
        imbalance = -imbalance;
    return imbalance < 2;
}
2075
MakeStatsForFile(const char * fname,const char * cp_name,const char * lang_name,int index,FILE * f,lString8 & list)2076 void MakeStatsForFile( const char * fname, const char * cp_name, const char * lang_name, int index, FILE * f, lString8 & list )
2077 {
2078 FILE * in = fopen( fname, "rb" );
2079 if (!in)
2080 return;
2081 fseek( in, 0, SEEK_END );
2082 int buf_size = ftell(in);
2083 fseek( in, 0, SEEK_SET );
2084 unsigned char * buf = new unsigned char[buf_size];
2085 fread(buf, 1, buf_size, in);
2086 short char_stat[256] = { 0 };
2087 dbl_char_stat_t dbl_char_stat[DBL_CHAR_STAT_SIZE];
2088 bool skipHtml = hasXmlTags(buf, buf_size);
2089 MakeCharStat(buf, buf_size, char_stat, skipHtml);
2090 MakeDblCharStat(buf, buf_size, dbl_char_stat, DBL_CHAR_STAT_SIZE, skipHtml);
2091 fprintf(f, "\n\nstatic const short ch_stat_%s_%s%d[256]={\n", cp_name, lang_name, index);
2092 int i;
2093 for (i=0; i<16; i++)
2094 {
2095 for (int j=0; j<16; j++)
2096 {
2097 fprintf(f, "0x%04x,", (unsigned int)char_stat[i*16+j] );
2098 }
2099 fprintf(f, "// %d..%d\n", i*16, i*16+15 );
2100 }
2101 fprintf(f, "};\n\n" );
2102 fprintf(f, "static const dbl_char_stat_t dbl_ch_stat_%s_%s%d[%d] = {\n", cp_name, lang_name, index, DBL_CHAR_STAT_SIZE );
2103 for (i=0; i<DBL_CHAR_STAT_SIZE/16; i++)
2104 {
2105 for (int j=0; j<16; j++)
2106 {
2107 fprintf(f, "{0x%02x,0x%02x,0x%04x}, ", (unsigned int)dbl_char_stat[i*16+j].ch1, (unsigned int)dbl_char_stat[i*16+j].ch2, (unsigned int)((lUInt16)dbl_char_stat[i*16+j].count) );
2108 }
2109 fprintf(f, "// %d..%d\n", i*16, i*16+15 );
2110 }
2111 char str[100];
2112 sprintf(str, "{ch_stat_%s_%s%d,dbl_ch_stat_%s_%s%d,\"%s\",\"%s\"}, \n", cp_name, lang_name, index, cp_name, lang_name, index, cp_name, lang_name );
2113 list += str;
2114 fprintf(f, "};\n\n" );
2115 delete [] buf;
2116 fclose(in);
2117 }
2118