1 /*******************************************************
2 
3    CoolReader Engine
4 
5    lvxml.cpp:  XML parser implementation
6 
7    (c) Vadim Lopatin, 2000-2006
8    This source code is distributed under the terms of
9    GNU General Public License
10    See LICENSE file for details
11 
12 *******************************************************/
13 
14 #include "../include/crtxtenc.h"
15 #include "../include/lvstring.h"
16 #include "../include/cp_stats.h"
17 #include "../include/crlog.h"
18 #include <string.h>
19 #include <stdio.h>
20 
21 static const lChar32 __cp737[128] = {
22   /* 0x80 */
23   0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x0398,
24   0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, 0x03a0,
25   /* 0x90 */
26   0x03a1, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, 0x03a8, 0x03a9,
27   0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7, 0x03b8,
28   /* 0xa0 */
29   0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf, 0x03c0,
30   0x03c1, 0x03c3, 0x03c2, 0x03c4, 0x03c5, 0x03c6, 0x03c7, 0x03c8,
31   /* 0xb0 */
32   0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
33   0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510,
34   /* 0xc0 */
35   0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f,
36   0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567,
37   /* 0xd0 */
38   0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b,
39   0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,
40   /* 0xe0 */
41   0x03c9, 0x03ac, 0x03ad, 0x03ae, 0x03ca, 0x03af, 0x03cc, 0x03cd,
42   0x03cb, 0x03ce, 0x0386, 0x0388, 0x0389, 0x038a, 0x038c, 0x038e,
43   /* 0xf0 */
44   0x038f, 0x00b1, 0x2265, 0x2264, 0x03aa, 0x03ab, 0x00f7, 0x2248,
45   0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0,
46 };
47 
48 static const lChar32 __cp1253[128] = {
49   /* 0x80 */
50   0x20ac, 0xfffd, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
51   0xfffd, 0x2030, 0xfffd, 0x2039, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
52   /* 0x90 */
53   0xfffd, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
54   0xfffd, 0x2122, 0xfffd, 0x203a, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
55   /* 0xa0 */
56   0x00a0, 0x0385, 0x0386, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
57   0x00a8, 0x00a9, 0xfffd, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x2015,
58   /* 0xb0 */
59   0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x0384, 0x00b5, 0x00b6, 0x00b7,
60   0x0388, 0x0389, 0x038a, 0x00bb, 0x038c, 0x00bd, 0x038e, 0x038f,
61   /* 0xc0 */
62   0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
63   0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f,
64   /* 0xd0 */
65   0x03a0, 0x03a1, 0xfffd, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7,
66   0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x03ac, 0x03ad, 0x03ae, 0x03af,
67   /* 0xe0 */
68   0x03b0, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7,
69   0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf,
70   /* 0xf0 */
71   0x03c0, 0x03c1, 0x03c2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7,
72   0x03c8, 0x03c9, 0x03ca, 0x03cb, 0x03cc, 0x03cd, 0x03ce, 0xfffd,
73 };
74 
75 static const lChar32 __cp775[128] = {
76   /* 0x80 */
77   0x0106, 0x00fc, 0x00e9, 0x0101, 0x00e4, 0x0123, 0x00e5, 0x0107,
78   0x0142, 0x0113, 0x0156, 0x0157, 0x012b, 0x0179, 0x00c4, 0x00c5,
79   /* 0x90 */
80   0x00c9, 0x00e6, 0x00c6, 0x014d, 0x00f6, 0x0122, 0x00a2, 0x015a,
81   0x015b, 0x00d6, 0x00dc, 0x00f8, 0x00a3, 0x00d8, 0x00d7, 0x00a4,
82   /* 0xa0 */
83   0x0100, 0x012a, 0x00f3, 0x017b, 0x017c, 0x017a, 0x201d, 0x00a6,
84   0x00a9, 0x00ae, 0x00ac, 0x00bd, 0x00bc, 0x0141, 0x00ab, 0x00bb,
85   /* 0xb0 */
86   0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x0104, 0x010c, 0x0118,
87   0x0116, 0x2563, 0x2551, 0x2557, 0x255d, 0x012e, 0x0160, 0x2510,
88   /* 0xc0 */
89   0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x0172, 0x016a,
90   0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x017d,
91   /* 0xd0 */
92   0x0105, 0x010d, 0x0119, 0x0117, 0x012f, 0x0161, 0x0173, 0x016b,
93   0x017e, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,
94   /* 0xe0 */
95   0x00d3, 0x00df, 0x014c, 0x0143, 0x00f5, 0x00d5, 0x00b5, 0x0144,
96   0x0136, 0x0137, 0x013b, 0x013c, 0x0146, 0x0112, 0x0145, 0x2019,
97   /* 0xf0 */
98   0x00ad, 0x00b1, 0x201c, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x201e,
99   0x00b0, 0x2219, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0,
100 };
101 
102 /*
103  * CP852
104  */
105 
106 static const lChar32 __cp852[128] = {
107   /* 0x80 */
108   0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x016f, 0x0107, 0x00e7,
109   0x0142, 0x00eb, 0x0150, 0x0151, 0x00ee, 0x0179, 0x00c4, 0x0106,
110   /* 0x90 */
111   0x00c9, 0x0139, 0x013a, 0x00f4, 0x00f6, 0x013d, 0x013e, 0x015a,
112   0x015b, 0x00d6, 0x00dc, 0x0164, 0x0165, 0x0141, 0x00d7, 0x010d,
113   /* 0xa0 */
114   0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x0104, 0x0105, 0x017d, 0x017e,
115   0x0118, 0x0119, 0x00ac, 0x017a, 0x010c, 0x015f, 0x00ab, 0x00bb,
116   /* 0xb0 */
117   0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, 0x00c2, 0x011a,
118   0x015e, 0x2563, 0x2551, 0x2557, 0x255d, 0x017b, 0x017c, 0x2510,
119   /* 0xc0 */
120   0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x0102, 0x0103,
121   0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
122   /* 0xd0 */
123   0x0111, 0x0110, 0x010e, 0x00cb, 0x010f, 0x0147, 0x00cd, 0x00ce,
124   0x011b, 0x2518, 0x250c, 0x2588, 0x2584, 0x0162, 0x016e, 0x2580,
125   /* 0xe0 */
126   0x00d3, 0x00df, 0x00d4, 0x0143, 0x0144, 0x0148, 0x0160, 0x0161,
127   0x0154, 0x00da, 0x0155, 0x0170, 0x00fd, 0x00dd, 0x0163, 0x00b4,
128   /* 0xf0 */
129   0x00ad, 0x02dd, 0x02db, 0x02c7, 0x02d8, 0x00a7, 0x00f7, 0x00b8,
130   0x00b0, 0x00a8, 0x02d9, 0x0171, 0x0158, 0x0159, 0x25a0, 0x00a0,
131 };
132 
133 /*
134  * ISO-8859-2
135  */
136 
137 static const lChar32 __iso8859_2[128] = {
138   /* 0x80*/
139   0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021,
140   0x20ac, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040b, 0x040f,
141   /* 0x90*/
142   0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
143   0x0000, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f,
144   /* 0xa0 */
145   0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7,
146   0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,
147   /* 0xb0 */
148   0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7,
149   0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,
150   /* 0xc0 */
151   0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
152   0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
153   /* 0xd0 */
154   0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
155   0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
156   /* 0xe0 */
157   0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
158   0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
159   /* 0xf0 */
160   0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
161   0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9,
162 };
163 
164 /*
165  * ISO-8859-16
166  */
167 
168 static const lChar32 __iso8859_16[128] = {
169     /* 0x80*/
170     0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021,
171     0x20ac, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040b, 0x040f,
172     /* 0x90*/
173     0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
174     0x0000, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f,
175     /* 0xa0 */
176     0x00a0, 0x0104, 0x0105, 0x0141, 0x20ac, 0x201e, 0x0160, 0x00a7,
177     0x0161, 0x00a9, 0x0218, 0x00ab, 0x0179, 0x00ad, 0x017a, 0x017b,
178     /* 0xb0 */
179     0x00b0, 0x00b1, 0x010c, 0x0142, 0x017d, 0x201d, 0x00b6, 0x00b7,
180     0x017e, 0x010d, 0x0219, 0x00bb, 0x0152, 0x0153, 0x0178, 0x017c,
181     /* 0xc0 */
182     0x00c0, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0106, 0x00c6, 0x00c7,
183     0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
184     /* 0xd0 */
185     0x0110, 0x0143, 0x00d2, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x015a,
186     0x0170, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0118, 0x021a, 0x00df,
187     /* 0xe0 */
188     0x00e0, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x0107, 0x00e6, 0x00e7,
189     0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
190     /* 0xf0 */
191     0x0111, 0x0144, 0x00f2, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x015b,
192     0x0171, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0119, 0x021b, 0x00ff,
193 };
194 
195 static const lChar32 __cp1257[128] = {
196   /* 0x80 */
197   0x20ac, 0xfffd, 0x201a, 0xfffd, 0x201e, 0x2026, 0x2020, 0x2021,
198   0xfffd, 0x2030, 0xfffd, 0x2039, 0xfffd, 0x00a8, 0x02c7, 0x00b8,
199   /* 0x90 */
200   0xfffd, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
201   0xfffd, 0x2122, 0xfffd, 0x203a, 0xfffd, 0x00af, 0x02db, 0xfffd,
202   /* 0xa0 */
203   0x00a0, 0xfffd, 0x00a2, 0x00a3, 0x00a4, 0xfffd, 0x00a6, 0x00a7,
204   0x00d8, 0x00a9, 0x0156, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00c6,
205   /* 0xb0 */
206   0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
207   0x00f8, 0x00b9, 0x0157, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00e6,
208   /* 0xc0 */
209   0x0104, 0x012e, 0x0100, 0x0106, 0x00c4, 0x00c5, 0x0118, 0x0112,
210   0x010c, 0x00c9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012a, 0x013b,
211   /* 0xd0 */
212   0x0160, 0x0143, 0x0145, 0x00d3, 0x014c, 0x00d5, 0x00d6, 0x00d7,
213   0x0172, 0x0141, 0x015a, 0x016a, 0x00dc, 0x017b, 0x017d, 0x00df,
214   /* 0xe0 */
215   0x0105, 0x012f, 0x0101, 0x0107, 0x00e4, 0x00e5, 0x0119, 0x0113,
216   0x010d, 0x00e9, 0x017a, 0x0117, 0x0123, 0x0137, 0x012b, 0x013c,
217   /* 0xf0 */
218   0x0161, 0x0144, 0x0146, 0x00f3, 0x014d, 0x00f5, 0x00f6, 0x00f7,
219   0x0173, 0x0142, 0x015b, 0x016b, 0x00fc, 0x017c, 0x017e, 0x02d9,
220 };
221 
222 static const lChar32 __cp1251[128] = {
223     /* 0x80*/
224     0x0402, 0x0403, 0x201a, 0x0453,
225     0x201e, 0x2026, 0x2020, 0x2021,
226     0x20ac, 0x2030, 0x0409, 0x2039,
227     0x040a, 0x040c, 0x040b, 0x040f,
228     /* 0x90*/
229     0x0452, 0x2018, 0x2019, 0x201c,
230     0x201d, 0x2022, 0x2013, 0x2014,
231     0x0000, 0x2122, 0x0459, 0x203a,
232     0x045a, 0x045c, 0x045b, 0x045f,
233     /* 0xa0*/
234     0x00a0, 0x040e, 0x045e, 0x0408,
235     0x00a4, 0x0490, 0x00a6, 0x00a7,
236     0x0401, 0x00a9, 0x0404, 0x00ab,
237     0x00ac, 0x00ad, 0x00ae, 0x0407,
238     /* 0xb0*/
239     0x00b0, 0x00b1, 0x0406, 0x0456,
240     0x0491, 0x00b5, 0x00b6, 0x00b7,
241     0x0451, 0x2116, 0x0454, 0x00bb,
242     0x0458, 0x0405, 0x0455, 0x0457,
243     /* 0xc0*/
244     0x0410, 0x0411, 0x0412, 0x0413,
245     0x0414, 0x0415, 0x0416, 0x0417,
246     0x0418, 0x0419, 0x041a, 0x041b,
247     0x041c, 0x041d, 0x041e, 0x041f,
248     /* 0xd0*/
249     0x0420, 0x0421, 0x0422, 0x0423,
250     0x0424, 0x0425, 0x0426, 0x0427,
251     0x0428, 0x0429, 0x042a, 0x042b,
252     0x042c, 0x042d, 0x042e, 0x042f,
253     /* 0xe0*/
254     0x0430, 0x0431, 0x0432, 0x0433,
255     0x0434, 0x0435, 0x0436, 0x0437,
256     0x0438, 0x0439, 0x043a, 0x043b,
257     0x043c, 0x043d, 0x043e, 0x043f,
258     /* 0xf0*/
259     0x0440, 0x0441, 0x0442, 0x0443,
260     0x0444, 0x0445, 0x0446, 0x0447,
261     0x0448, 0x0449, 0x044a, 0x044b,
262     0x044c, 0x044d, 0x044e, 0x044f,
263 };
264 
265 static const lChar32 __cp1252[128] = {
266     /* 0x80*/
267     0x0402, 0x0403, 0x201a, 0x0453,
268     0x201e, 0x2026, 0x2020, 0x2021,
269     0x20ac, 0x2030, 0x0409, 0x2039,
270     0x040a, 0x040c, 0x040b, 0x040f,
271     /* 0x90*/
272     0x0452, 0x2018, 0x2019, 0x201c,
273     0x201d, 0x2022, 0x2013, 0x2014,
274     0x0000, 0x2122, 0x0459, 0x203a,
275     0x045a, 0x045c, 0x045b, 0x045f,
276     /* 0xa0*/
277     0x00a0, 0x00a1, 0x00a2, 0x00a3,
278     0x00a4, 0x00a5, 0x00a6, 0x00a7,
279     0x00a8, 0x00a9, 0x00aa, 0x00ab,
280     0x00ac, 0x00ad, 0x00ae, 0x00af,
281     /* 0xb0*/
282     0x00b0, 0x00b1, 0x00b2, 0x00b3,
283     0x00b4, 0x00b5, 0x00b6, 0x00b7,
284     0x00b8, 0x00b9, 0x00ba, 0x00bb,
285     0x00bc, 0x00bd, 0x00be, 0x00bf,
286     /* 0xc0*/
287     0x00c0, 0x00c1, 0x00c2, 0x00c3,
288     0x00c4, 0x00c5, 0x00c6, 0x00c7,
289     0x00c8, 0x00c9, 0x00ca, 0x00cb,
290     0x00cc, 0x00cd, 0x00ce, 0x00cf,
291     /* 0xd0*/
292     0x00d0, 0x00d1, 0x00d2, 0x00d3,
293     0x00d4, 0x00d5, 0x00d6, 0x00d7,
294     0x00d8, 0x00d9, 0x00da, 0x00db,
295     0x00dc, 0x00dd, 0x00de, 0x00df,
296     /* 0xe0*/
297     0x00e0, 0x00e1, 0x00e2, 0x00e3,
298     0x00e4, 0x00e5, 0x00e6, 0x00e7,
299     0x00e8, 0x00e9, 0x00ea, 0x00eb,
300     0x00ec, 0x00ed, 0x00ee, 0x00ef,
301     /* 0xf0*/
302     0x00f0, 0x00f1, 0x00f2, 0x00f3,
303     0x00f4, 0x00f5, 0x00f6, 0x00f7,
304     0x00f8, 0x00f9, 0x00fa, 0x00fb,
305     0x00fc, 0x00fd, 0x00fe, 0x00ff,
306 };
307 
308 static const lChar32 __cp1254[128] = {
309     /* 0x80 */
310     0x20ac, 0xfffd, 0x201a, 0x0192,
311     0x201e, 0x2026, 0x2020, 0x2021,
312     0x02c6, 0x2030, 0x0160, 0x2039,
313     0x0152, 0xfffd, 0xfffd, 0xfffd,
314     /* 0x90 */
315     0xfffd, 0x2018, 0x2019, 0x201c,
316     0x201d, 0x2022, 0x2013, 0x2014,
317     0x02dc, 0x2122, 0x0161, 0x203a,
318     0x0153, 0xfffd, 0xfffd, 0x0178,
319     /* 0xa0*/
320     0x00a0, 0x00a1, 0x00a2, 0x00a3,
321     0x00a4, 0x00a5, 0x00a6, 0x00a7,
322     0x00a8, 0x00a9, 0x00aa, 0x00ab,
323     0x00ac, 0x00ad, 0x00ae, 0x00af,
324     /* 0xb0*/
325     0x00b0, 0x00b1, 0x00b2, 0x00b3,
326     0x00b4, 0x00b5, 0x00b6, 0x00b7,
327     0x00b8, 0x00b9, 0x00ba, 0x00bb,
328     0x00bc, 0x00bd, 0x00be, 0x00bf,
329     /* 0xc0*/
330     0x00c0, 0x00c1, 0x00c2, 0x00c3,
331     0x00c4, 0x00c5, 0x00c6, 0x00c7,
332     0x00c8, 0x00c9, 0x00ca, 0x00cb,
333     0x00cc, 0x00cd, 0x00ce, 0x00cf,
334     /* 0xd0 */
335     0x011e, 0x00d1, 0x00d2, 0x00d3,
336     0x00d4, 0x00d5, 0x00d6, 0x00d7,
337     0x00d8, 0x00d9, 0x00da, 0x00db,
338     0x00dc, 0x0130, 0x015e, 0x00df,
339     /* 0xe0*/
340     0x00e0, 0x00e1, 0x00e2, 0x00e3,
341     0x00e4, 0x00e5, 0x00e6, 0x00e7,
342     0x00e8, 0x00e9, 0x00ea, 0x00eb,
343     0x00ec, 0x00ed, 0x00ee, 0x00ef,
344     /* 0xf0 */
345     0x011f, 0x00f1, 0x00f2, 0x00f3,
346     0x00f4, 0x00f5, 0x00f6, 0x00f7,
347     0x00f8, 0x00f9, 0x00fa, 0x00fb,
348     0x00fc, 0x0131, 0x015f, 0x00ff,
349 };
350 
351 static const lChar32 __cp866[128] = {
352     /* 0x80*/
353     0x0410, 0x0411, 0x0412, 0x0413,
354     0x0414, 0x0415, 0x0416, 0x0417,
355     0x0418, 0x0419, 0x041a, 0x041b,
356     0x041c, 0x041d, 0x041e, 0x041f,
357     /* 0x90*/
358     0x0420, 0x0421, 0x0422, 0x0423,
359     0x0424, 0x0425, 0x0426, 0x0427,
360     0x0428, 0x0429, 0x042a, 0x042b,
361     0x042c, 0x042d, 0x042e, 0x042f,
362     /* 0xa0*/
363     0x0430, 0x0431, 0x0432, 0x0433,
364     0x0434, 0x0435, 0x0436, 0x0437,
365     0x0438, 0x0439, 0x043a, 0x043b,
366     0x043c, 0x043d, 0x043e, 0x043f,
367     /* 0xb0*/
368     0x2591, 0x2592, 0x2593, 0x2502,
369     0x2524, 0x2561, 0x2562, 0x2556,
370     0x2555, 0x2563, 0x2551, 0x2557,
371     0x255d, 0x255c, 0x255b, 0x2510,
372     /* 0xc0*/
373     0x2514, 0x2534, 0x252c, 0x251c,
374     0x2500, 0x253c, 0x255e, 0x255f,
375     0x255a, 0x2554, 0x2569, 0x2566,
376     0x2560, 0x2550, 0x256c, 0x2567,
377     /* 0xd0*/
378     0x2568, 0x2564, 0x2565, 0x2559,
379     0x2558, 0x2552, 0x2553, 0x256b,
380     0x256a, 0x2518, 0x250c, 0x2588,
381     0x2584, 0x258c, 0x2590, 0x2580,
382     /* 0xe0*/
383     0x0440, 0x0441, 0x0442, 0x0443,
384     0x0444, 0x0445, 0x0446, 0x0447,
385     0x0448, 0x0449, 0x044a, 0x044b,
386     0x044c, 0x044d, 0x044e, 0x044f,
387     /* 0xf0*/
388     0x0401, 0x0451, 0x0404, 0x0454,
389     0x0407, 0x0457, 0x040e, 0x045e,
390     0x00b0, 0x2219, 0x00b7, 0x221a,
391     0x2116, 0x00a4, 0x25a0, 0x00a0,
392 };
393 
394 static const lChar32 __koi8r[128] = {
395     /* 0x80*/
396     0x2500, 0x2502, 0x250c, 0x2510,
397     0x2514, 0x2518, 0x251c, 0x2524,
398     0x252c, 0x2534, 0x253c, 0x2580,
399     0x2584, 0x2588, 0x258c, 0x2590,
400     /* 0x90*/
401     0x2591, 0x2592, 0x2593, 0x2320,
402     0x25a0, 0x2219, 0x221a, 0x2248,
403     0x2264, 0x2265, 0x00a0, 0x2321,
404     0x00b0, 0x00b2, 0x00b7, 0x00f7,
405     /* 0xa0*/
406     0x2550, 0x2551, 0x2552, 0x0451,
407     0x2553, 0x2554, 0x2555, 0x2556,
408     0x2557, 0x2558, 0x2559, 0x255a,
409     0x255b, 0x255c, 0x255d, 0x255e,
410     /* 0xb0*/
411     0x255f, 0x2560, 0x2561, 0x0401,
412     0x2562, 0x2563, 0x2564, 0x2565,
413     0x2566, 0x2567, 0x2568, 0x2569,
414     0x256a, 0x256b, 0x256c, 0x00a9,
415     /* 0xc0*/
416     0x044e, 0x0430, 0x0431, 0x0446,
417     0x0434, 0x0435, 0x0444, 0x0433,
418     0x0445, 0x0438, 0x0439, 0x043a,
419     0x043b, 0x043c, 0x043d, 0x043e,
420     /* 0xd0*/
421     0x043f, 0x044f, 0x0440, 0x0441,
422     0x0442, 0x0443, 0x0436, 0x0432,
423     0x044c, 0x044b, 0x0437, 0x0448,
424     0x044d, 0x0449, 0x0447, 0x044a,
425     /* 0xe0*/
426     0x042e, 0x0410, 0x0411, 0x0426,
427     0x0414, 0x0415, 0x0424, 0x0413,
428     0x0425, 0x0418, 0x0419, 0x041a,
429     0x041b, 0x041c, 0x041d, 0x041e,
430     /* 0xf0*/
431     0x041f, 0x042f, 0x0420, 0x0421,
432     0x0422, 0x0423, 0x0416, 0x0412,
433     0x042c, 0x042b, 0x0417, 0x0428,
434     0x042d, 0x0429, 0x0427, 0x042a,
435 };
436 
437 static const lChar32 __cp1250[128] = {
438     /* 0x80*/
439     0x20ac, 0x0000, 0x201a, 0x0000,
440     0x201e, 0x2026, 0x2020, 0x2021,
441     0x0000, 0x2030, 0x0160, 0x2039,
442     0x015a, 0x0164, 0x017d, 0x0179,
443     /* 0x90*/
444     0x0000, 0x2018, 0x2019, 0x201c,
445     0x201d, 0x2022, 0x2013, 0x2014,
446     0x0000, 0x2122, 0x0161, 0x203a,
447     0x015b, 0x0165, 0x017e, 0x017a,
448     /* 0xa0*/
449     0x00a0, 0x02c7, 0x02d8, 0x0141,
450     0x00a4, 0x0104, 0x00a6, 0x00a7,
451     0x00a8, 0x00a9, 0x015e, 0x00ab,
452     0x00ac, 0x00ad, 0x00ae, 0x017b,
453     /* 0xb0*/
454     0x00b0, 0x00b1, 0x02db, 0x0142,
455     0x00b4, 0x00b5, 0x00b6, 0x00b7,
456     0x00b8, 0x0105, 0x015f, 0x00bb,
457     0x013d, 0x02dd, 0x013e, 0x017c,
458     /* 0xc0*/
459     0x0154, 0x00c1, 0x00c2, 0x0102,
460     0x00c4, 0x0139, 0x0106, 0x00c7,
461     0x010c, 0x00c9, 0x0118, 0x00cb,
462     0x011a, 0x00cd, 0x00ce, 0x010e,
463     /* 0xd0*/
464     0x0110, 0x0143, 0x0147, 0x00d3,
465     0x00d4, 0x0150, 0x00d6, 0x00d7,
466     0x0158, 0x016e, 0x00da, 0x0170,
467     0x00dc, 0x00dd, 0x0162, 0x00df,
468     /* 0xe0*/
469     0x0155, 0x00e1, 0x00e2, 0x0103,
470     0x00e4, 0x013a, 0x0107, 0x00e7,
471     0x010d, 0x00e9, 0x0119, 0x00eb,
472     0x011b, 0x00ed, 0x00ee, 0x010f,
473     /* 0xf0*/
474     0x0111, 0x0144, 0x0148, 0x00f3,
475     0x00f4, 0x0151, 0x00f6, 0x00f7,
476     0x0159, 0x016f, 0x00fa, 0x0171,
477     0x00fc, 0x00fd, 0x0163, 0x02d9,
478 };
479 
480 static const lChar32 __cp850[128] = {
481     /* 0x80*/
482     0x00c7, 0x00fc, 0x00e9, 0x00e2,
483     0x00e4, 0x00e0, 0x00e5, 0x00e7,
484     0x00ea, 0x00eb, 0x00e8, 0x00ef,
485     0x00ee, 0x00ec, 0x00c4, 0x00c5,
486     /* 0x90*/
487     0x00c9, 0x00e6, 0x00c6, 0x00f4,
488     0x00f6, 0x00f2, 0x00fb, 0x00f9,
489     0x00ff, 0x00d6, 0x00dc, 0x00f8,
490     0x00a3, 0x00d8, 0x00d7, 0x0192,
491     /* 0xa0*/
492     0x00e1, 0x00ed, 0x00f3, 0x00fa,
493     0x00f1, 0x00d1, 0x00aa, 0x00ba,
494     0x00bf, 0x00ae, 0x00ac, 0x00bd,
495     0x00bc, 0x00a1, 0x00ab, 0x00bb,
496     /* 0xb0*/
497     0x2591, 0x2592, 0x2593, 0x2502,
498     0x2524, 0x00c1, 0x00c2, 0x00c0,
499     0x00a9, 0x2563, 0x2551, 0x2557,
500     0x255d, 0x00a2, 0x00a5, 0x2510,
501     /* 0xc0*/
502     0x2514, 0x2534, 0x252c, 0x251c,
503     0x2500, 0x253c, 0x00e3, 0x00c3,
504     0x255a, 0x2554, 0x2569, 0x2566,
505     0x2560, 0x2550, 0x256c, 0x00a4,
506     /* 0xd0*/
507     0x00f0, 0x00d0, 0x00ca, 0x00cb,
508     0x00c8, 0x0131, 0x00cd, 0x00ce,
509     0x00cf, 0x2518, 0x250c, 0x2588,
510     0x2584, 0x00a6, 0x00cc, 0x2580,
511     /* 0xe0*/
512     0x00d3, 0x00df, 0x00d4, 0x00d2,
513     0x00f5, 0x00d5, 0x00b5, 0x00fe,
514     0x00de, 0x00da, 0x00db, 0x00d9,
515     0x00fd, 0x00dd, 0x00af, 0x00b4,
516     /* 0xf0*/
517     0x00ad, 0x00b1, 0x2017, 0x00be,
518     0x00b6, 0x00a7, 0x00f7, 0x00b8,
519     0x00b0, 0x00a8, 0x00b7, 0x00b9,
520     0x00b3, 0x00b2, 0x25a0, 0x00a0,
521 };
522 
// Identifiers of the 8-bit encodings known to this file. Each one is a fixed
// offset from CRENC_ID_8BIT_START (defined outside this file); the offsets
// are used as the `id` field of the _enc_table entries.
#define CRENC_ID_CP1250     (CRENC_ID_8BIT_START+1)
#define CRENC_ID_CP1251     (CRENC_ID_8BIT_START+2)
#define CRENC_ID_CP1252     (CRENC_ID_8BIT_START+3)
#define CRENC_ID_CP1253     (CRENC_ID_8BIT_START+4)
#define CRENC_ID_CP1257     (CRENC_ID_8BIT_START+5)
#define CRENC_ID_CP775      (CRENC_ID_8BIT_START+6)
#define CRENC_ID_CP737      (CRENC_ID_8BIT_START+7)
#define CRENC_ID_CP866      (CRENC_ID_8BIT_START+8)
#define CRENC_ID_CP850      (CRENC_ID_8BIT_START+9)
#define CRENC_ID_KOI8R      (CRENC_ID_8BIT_START+10)
#define CRENC_ID_ISO8859_2  (CRENC_ID_8BIT_START+11)
#define CRENC_ID_CP1254     (CRENC_ID_8BIT_START+12)
#define CRENC_ID_CP852      (CRENC_ID_8BIT_START+13)
#define CRENC_ID_ISO8859_16 (CRENC_ID_8BIT_START+14)
537 
538 
539 /// add other encodings here
540 static struct {
541     const char * name;
542     const lChar32 * table;
543     int id;
544 } _enc_table[] = {
545     {"windows-1250", __cp1250, CRENC_ID_CP1250},
546     {"windows-1251", __cp1251, CRENC_ID_CP1251},
547     {"windows-1252", __cp1252, CRENC_ID_CP1252},
548     {"windows-1253", __cp1253, CRENC_ID_CP1253},
549     {"windows-1254", __cp1254, CRENC_ID_CP1254},
550     {"windows-1257", __cp1257, CRENC_ID_CP1257},
551     {"cp775", __cp775, CRENC_ID_CP775},
552     {"cp737", __cp737, CRENC_ID_CP737},
553     {"cp1250", __cp1250, CRENC_ID_CP1250},
554     {"cp1251", __cp1251, CRENC_ID_CP1251},
555     {"cp1254", __cp1254, CRENC_ID_CP1254},
556     {"iso-8859-5", __cp1251, CRENC_ID_CP1251},
557     {"iso_8859-5", __cp1251, CRENC_ID_CP1251},
558     {"iso8859-5", __cp1251, CRENC_ID_CP1251},
559     {"cp1252", __cp1252, CRENC_ID_CP1252},
560     {"iso-8859-1", __cp1252, CRENC_ID_CP1252},
561     {"iso_8859-1", __cp1252, CRENC_ID_CP1252},
562     {"iso8859-1", __cp1252, CRENC_ID_CP1252},
563     {"latin-1", __cp1252, CRENC_ID_CP1252},
564     {"cp1253", __cp1253, CRENC_ID_CP1253},
565     {"cp1257", __cp1257, CRENC_ID_CP1257},
566     {"cp866", __cp866, CRENC_ID_CP866},
567     {"cp850", __cp850, CRENC_ID_CP850},
568     {"cp852", __cp852, CRENC_ID_CP852},
569     {"windows-866", __cp866, CRENC_ID_CP866},
570     {"windows-850", __cp850, CRENC_ID_CP850},
571     {"windows-852", __cp852, CRENC_ID_CP852},
572     {"koi-8r", __koi8r, CRENC_ID_KOI8R},
573     {"koi8r", __koi8r, CRENC_ID_KOI8R},
574     {"koi8-r", __koi8r, CRENC_ID_KOI8R},
575     {"iso8859-2", __iso8859_2, CRENC_ID_ISO8859_2},
576     {"iso-8859-2", __iso8859_2, CRENC_ID_ISO8859_2},
577     {"iso8859_2", __iso8859_2, CRENC_ID_ISO8859_2},
578     {"latin-2", __iso8859_2, CRENC_ID_ISO8859_2},
579     {"latin-5", __iso8859_2, CRENC_ID_ISO8859_2},
580     {"iso8859-16", __iso8859_16, CRENC_ID_ISO8859_16},
581     {"iso-8859-16", __iso8859_16, CRENC_ID_ISO8859_16},
582     {"iso8859_16", __iso8859_16, CRENC_ID_ISO8859_16},
583     {NULL, NULL, 0}
584 };
585 
CREncodingNameToId(const lChar32 * enc_name)586 int CREncodingNameToId( const lChar32 * enc_name )
587 {
588     lString32 s( enc_name );
589     s.lowercase();
590     const lChar32 * encoding_name = s.c_str();
591     if ( !lStr_cmp(encoding_name, "utf-8") )
592         return CRENC_ID_UTF8;
593     else if ( !lStr_cmp(encoding_name, "utf-16") )
594         return CRENC_ID_UTF16_LE;
595     else if ( !lStr_cmp(encoding_name, "utf-16le") )
596         return CRENC_ID_UTF16_LE;
597     else if ( !lStr_cmp(encoding_name, "utf-16be") )
598         return CRENC_ID_UTF16_BE;
599     else if ( !lStr_cmp(encoding_name, "utf-32") )
600         return CRENC_ID_UTF16_LE;
601     else if ( !lStr_cmp(encoding_name, "utf-32le") )
602         return CRENC_ID_UTF16_LE;
603     else if ( !lStr_cmp(encoding_name, "utf-32be") )
604         return CRENC_ID_UTF16_BE;
605     for (int i=0; _enc_table[i].name!=NULL; i++)
606     {
607         if ( !lStr_cmp(encoding_name, _enc_table[i].name) )
608         {
609             return _enc_table[i].id;
610         }
611     }
612     return CRENC_ID_UNKNOWN; // not found
613 }
614 
CREncodingIdToName(int id)615 const char * CREncodingIdToName( int id )
616 {
617     switch ( id ) {
618         case CRENC_ID_UTF8:
619             return "utf-8";
620         case CRENC_ID_UTF16_LE:
621             return "utf-16le";
622         case CRENC_ID_UTF16_BE:
623             return "utf-16be";
624         case CRENC_ID_UTF32_LE:
625             return "utf-32be";
626         case CRENC_ID_UTF32_BE:
627             return "utf-32be";
628     }
629     for (int i=0; _enc_table[i].name!=NULL; i++)
630     {
631         if ( id == _enc_table[i].id )
632         {
633             return _enc_table[i].name;
634         }
635     }
636     return NULL; // not found
637 }
638 
/// Find the byte-to-Unicode table for an 8-bit encoding given by name.
///
/// The name is lowercased and compared with each _enc_table entry in order.
/// Only names present in _enc_table can match; the table holds no "utf-*"
/// entries, so those names yield NULL.
///
/// \param enc_name encoding name
/// \return the table of the first matching entry, or NULL if none matches
const lChar32 * GetCharsetByte2UnicodeTable( const lChar32 * enc_name )
{
    lString32 lowered( enc_name );
    lowered.lowercase();
    const lChar32 * wanted = lowered.c_str();

    int idx = 0;
    while ( _enc_table[idx].name != NULL ) {
        if ( lStr_cmp(wanted, _enc_table[idx].name) == 0 )
            return _enc_table[idx].table;
        ++idx;
    }
    return NULL; // no entry with this name
}
653 
GetCharsetByte2UnicodeTableById(int id)654 const lChar32 * GetCharsetByte2UnicodeTableById( int id )
655 {
656     for (int i=0; _enc_table[i].name!=NULL; i++)
657     {
658         if ( id==_enc_table[i].id )
659         {
660             return _enc_table[i].table;
661         }
662     }
663     return NULL; // not found
664 }
665 
langToCodepage(int lang)666 int langToCodepage( int lang )
667 {
668     switch ( lang )
669     {
670     case	0x0436	: //	Afrikaans
671         return 1252;
672     case	0x041c	: //	Albanian
673         return 1252;
674     case	0x0401	: //	Arabic
675     case	0x1401	: //	Arabic Algeria
676     case	0x3c01	: //	Arabic Bahrain
677     case	0x0c01	: //	Arabic Egypt
678     case	0x0001	: //	Arabic General
679     case	0x0801	: //	Arabic Iraq
680     case	0x2c01	: //	Arabic Jordan
681     case	0x3401	: //	Arabic Kuwait
682     case	0x3001	: //	Arabic Lebanon
683     case	0x1001	: //	Arabic Libya
684     case	0x1801	: //	Arabic Morocco
685     case	0x2001	: //	Arabic Oman
686     case	0x4001	: //	Arabic Qatar
687     case	0x2801	: //	Arabic Syria
688     case	0x1c01	: //	Arabic Tunisia
689     case	0x3801	: //	Arabic U.A.E.
690     case	0x2401	: //	Arabic Yemen
691         return 1256;
692     case	0x042b	: //	Armenian
693         return 1252;
694     case	0x044d	: //	Assamese
695         return 1252;
696     case	0x082c	: //	Azeri Cyrillic
697         return 1251;
698     case	0x042c	: //	Azeri Latin
699         return 1252;
700     case	0x042d	: //	Basque
701         return 1252;
702     case	0x0445	: //	Bengali
703     case	0x101a	: //	Bosnia Herzegovina
704         return 1252;
705     case	0x0402	: //	Bulgarian
706         return 1251;
707     case	0x0455	: //	Burmese
708         return 1252;
709     case	0x0423	: //	Byelorussian
710         return 1251;
711     case	0x0403	: //	Catalan
712         return 1252;
713     case	0x0804	: //	Chinese China
714     case	0x0004	: //	Chinese General
715     case	0x0c04	: //	Chinese Hong Kong
716     //case	0x0c04	: //	Chinese Macao
717     case	0x1004	: //	Chinese Singapore
718     case	0x0404	: //	Chinese Taiwan
719         return 950;
720     case	0x041a	: //	Croatian
721         return 1250;
722     case	0x0405	: //	Czech
723         return 1250;
724     case	0x0406	: //	Danish
725         return 1252;
726     case	0x0813	: //	Dutch Belgium
727     case	0x0413	: //	Dutch Standard
728         return 1252;
729     case	0x0c09	: //	English Australia
730     case	0x2809	: //	English Belize
731     case	0x0809	: //	English British
732     case	0x1009	: //	English Canada
733     case	0x2409	: //	English Caribbean
734     case	0x0009	: //	English General
735     case	0x1809	: //	English Ireland
736     case	0x2009	: //	English Jamaica
737     case	0x1409	: //	English New Zealand
738     case	0x3409	: //	English Philippines
739     case	0x1c09	: //	English South Africa
740     case	0x2c09	: //	English Trinidad
741     case	0x0409	: //	English United States
742     //case	0x0409	: //	English Zimbabwe
743         return 1252;
744     case	0x0425	: //	Estonian
745         return 1257;
746     case	0x0438	: //	Faeroese
747     case	0x0429	: //	Farsi
748         return 1252;
749     case	0x040b	: //	Finnish
750         return 1252;
751     case	0x040c	: //	French
752     case	0x080c	: //	French Belgium
753     case	0x2c0c	: //	French Cameroon
754     case	0x0c0c	: //	French Canada
755     case	0x300c	: //	French Cote d'Ivoire
756     case	0x140c	: //	French Luxemburg
757     case	0x340c	: //	French Mali
758     case	0x180c	: //	French Monaco
759     case	0x200c	: //	French Reunion
760     case	0x280c	: //	French Senegal
761     case	0x100c	: //	French Swiss
762     case	0x1c0c	: //	French West Indies
763     case	0x240c	: //	French Zaire
764         return 1252;
765     case	0x0462	: //	Frisian
766     case	0x043c	: //	Gaelic
767     case	0x083c	: //	Gaelic Ireland
768     case	0x0456	: //	Galician
769     case	0x0437	: //	Georgian
770         return 1252;
771     case	0x0407	: //	German
772     case	0x0c07	: //	German Austrian
773     case	0x1407	: //	German Liechtenstein
774     case	0x1007	: //	German Luxemburg
775     case	0x0807	: //	German Switzerland
776         return 1252;
777     case	0x0408	: //	Greek
778         return 1253;
779     case	0x0447	: //	Gujarati
780         return 1252;
781     case	0x040d	: //	Hebrew
782         return 1255;
783     case	0x0439	: //	Hindi
784         return 1252;
785     case	0x040e	: //	Hungarian
786         return 1252;
787     case	0x040f	: //	Icelandic
788         return 1252;
789     case	0x0421	: //	Indonesian
790         return 1252;
791     case	0x0410	: //	Italian
792     case	0x0810	: //	Italian Switzerland
793         return 1252;
794     case	0x0411	: //	Japanese
795         return 932;
796     case	0x044b	: //	Kannada
797         return 1252;
798     case	0x0460	: //	Kashmiri
799     case	0x0860	: //	Kashmiri India
800         return 1252;
801     case	0x043f	: //	Kazakh
802         return 1251;
803     case	0x0453	: //	Khmer
804         return 1252;
805     case	0x0440	: //	Kirghiz
806         return 1252;
807     case	0x0457	: //	Konkani
808         return 1252;
809     case	0x0412	: //	Korean
810     case	0x0812	: //	Korean Johab
811         return 1252;
812     case	0x0454	: //	Lao
813         return 1252;
814     case	0x0426	: //	Latvian
815         return 1257;
816     case	0x0427	: //	Lithuanian
817     case	0x0827	: //	Lithuanian Classic
818         return 1257;
819     case	0x043e	: //	Macedonian
820         return 1252;
821     //case	0x043e	: //	Malay
822     case	0x083e	: //	Malay Brunei Darussalam
823     case	0x044c	: //	Malayalam
824         return 1252;
825     case	0x043a	: //	Maltese
826         return 1252;
827     case	0x0458  : //	Manipuri
828         return 1252;
829     case	0x044e	: //	Marathi
830         return 1252;
831     case	0x0450	: //	Mongolian
832         return 1252;
833     case	0x0461	: //	Nepali
834     case	0x0861	: //	Nepali India
835         return 1252;
836     case	0x0414	: //	Norwegian Bokmal
837     case	0x0814	: //	Norwegian Nynorsk
838         return 1252;
839     case	0x0448	: //	Oriya
840         return 1252;
841     case	0x0415	: //	Polish
842         return 1250;
843     case	0x0416	: //	Portuguese Brazil
844     case	0x0816	: //	Portuguese Iberian
845         return 1252;
846     case	0x0446	: //	Punjabi
847     case	0x0417	: //	Rhaeto-Romanic
848         return 1252;
849     case	0x0418	: //	Romanian
850     case	0x0818	: //	Romanian Moldova
851         return 1252;
852     case	0x0419	: //	Russian
853     case	0x0819	: //	Russian Moldova
854         return 1251;
855     case	0x043b	: //	Sami Lappish
856         return 1252;
857     case	0x044f	: //	Sanskrit
858         return 1252;
859     case	0x0c1a	: //	Serbian Cyrillic
860         return 1251;
861     case	0x081a	: //	Serbian Latin
862         return 1252;
863     case	0x0459	: //	Sindhi
864         return 1252;
865     case	0x041b	: //	Slovak
866         return 1252;
867     case	0x0424	: //	Slovenian
868         return 1252;
869     case	0x042e	: //	Sorbian
870         return 1252;
871     case	0x2c0a	: //	Spanish Argentina
872     case	0x400a	: //	Spanish Bolivia
873     case	0x340a	: //	Spanish Chile
874     case	0x240a	: //	Spanish Colombia
875     case	0x140a	: //	Spanish Costa Rica
876     case	0x1c0a	: //	Spanish Dominican Republic
877     case	0x300a	: //	Spanish Ecuador
878     case	0x440a	: //	Spanish El Salvador
879     case	0x100a	: //	Spanish Guatemala
880     case	0x480a	: //	Spanish Honduras
881     case	0x080a	: //	Spanish Mexico
882     case	0x0c0a	: //	Spanish Modern
883     case	0x4c0a	: //	Spanish Nicaragua
884     case	0x180a	: //	Spanish Panama
885     case	0x3c0a	: //	Spanish Paraguay
886     case	0x280a	: //	Spanish Peru
887     case	0x500a	: //	Spanish Puerto Rico
888     case	0x040a	: //	Spanish Traditional
889     case	0x380a	: //	Spanish Uruguay
890     case	0x200a	: //	Spanish Venezuela
891         return 1252;
892     case	0x0430	: //	Sutu
893         return 1252;
894     case	0x0441	: //	Swahili
895         return 1252;
896     case	0x041d	: //	Swedish
897     case	0x081d	: //	Swedish Finland
898         return 1252;
899     case	0x0428	: //	Tajik
900         return 1252;
901     case	0x0449	: //	Tamil
902         return 1252;
903     case	0x0444	: //	Tatar
904         return 1251;
905     case	0x044a	: //	Telugu
906         return 1252;
907     case	0x041e	: //	Thai
908         return 1252;
909     case	0x0451	: //	Tibetan
910         return 1252;
911     case	0x0431	: //	Tsonga
912         return 1252;
913     case	0x0432	: //	Tswana
914         return 1252;
915     case	0x041f	: //	Turkish
916         return 1254;
917     case	0x0442	: //	Turkmen
918         return 1251;
919     case	0x0422	: //	Ukrainian
920         return 1251;
921     case	0x0420	: //	Urdu
922         return 1252;
923     case	0x0820	: //	Urdu India
924         return 1252;
925     case	0x0843	: //	Uzbek Cyrillic
926         return 1251;
927     case	0x0443	: //	Uzbek Latin
928         return 1252;
929     case	0x0433	: //	Venda
930         return 1252;
931     case	0x042a	: //	Vietnamese
932         return 1252;
933     case	0x0452	: //	Welsh
934         return 1252;
935     case	0x0434	: //	Xhosa
936         return 1252;
937     case	0x043d	: //	Yiddish
938         return 1252;
939     case	0x0435	: //	Zulu
940         return 1252;
941     default:
942         return 1251;
943     }
944 }
945 
/// Maps a numeric language identifier to a short language code.
/// The identifiers handled here have the values of Windows LCIDs.
/// \param lang numeric language identifier
/// \return pointer to a static string with the language code ("en", "ru", "kok", ...),
///         or NULL when the identifier is not in the list
const char* langToLanguage( int lang )
{
    switch ( lang )
    {
    case 0x0436: // Afrikaans
        return "af";
    case 0x041c: // Albanian
        return "sq";
    case 0x0401: // Arabic
    case 0x1401: // Arabic Algeria
    case 0x3c01: // Arabic Bahrain
    case 0x0c01: // Arabic Egypt
    case 0x0001: // Arabic General
    case 0x0801: // Arabic Iraq
    case 0x2c01: // Arabic Jordan
    case 0x3401: // Arabic Kuwait
    case 0x3001: // Arabic Lebanon
    case 0x1001: // Arabic Libya
    case 0x1801: // Arabic Morocco
    case 0x2001: // Arabic Oman
    case 0x4001: // Arabic Qatar
    case 0x2801: // Arabic Syria
    case 0x1c01: // Arabic Tunisia
    case 0x3801: // Arabic U.A.E.
    case 0x2401: // Arabic Yemen
        return "ar";
    case 0x042b: // Armenian
        return "hy";
    case 0x044d: // Assamese
        return "as";
    case 0x082c: // Azeri Cyrillic
    case 0x042c: // Azeri Latin
        return "az";
    case 0x042d: // Basque
        return "eu";
    case 0x0445: // Bengali
        return "bn";
    case 0x101a: // Bosnia Herzegovina
        return "hr";
    case 0x0402: // Bulgarian
        return "bg";
    case 0x0455: // Burmese
        return "my";
    case 0x0423: // Byelorussian
        return "be";
    case 0x0403: // Catalan
        return "ca";
    case 0x0804: // Chinese China
    case 0x0004: // Chinese General
    case 0x0c04: // Chinese Hong Kong
    case 0x1004: // Chinese Singapore
    case 0x0404: // Chinese Taiwan
        return "zh";
    case 0x041a: // Croatian
        return "hr";
    case 0x0405: // Czech
        return "cs";
    case 0x0406: // Danish
        return "da";
    case 0x0813: // Dutch Belgium
    case 0x0413: // Dutch Standard
        return "nl";
    case 0x0c09: // English Australia
    case 0x2809: // English Belize
    case 0x0809: // English British
    case 0x1009: // English Canada
    case 0x2409: // English Caribbean
    case 0x0009: // English General
    case 0x1809: // English Ireland
    case 0x2009: // English Jamaica
    case 0x1409: // English New Zealand
    case 0x3409: // English Philippines
    case 0x1c09: // English South Africa
    case 0x2c09: // English Trinidad
    case 0x0409: // English United States
        return "en";
    case 0x0425: // Estonian
        return "et";
    case 0x0438: // Faeroese
        return "fo";
    case 0x0429: // Farsi
        return "fa";
    case 0x040b: // Finnish
        return "fi";
    case 0x040c: // French
    case 0x080c: // French Belgium
    case 0x2c0c: // French Cameroon
    case 0x0c0c: // French Canada
    case 0x300c: // French Cote d'Ivoire
    case 0x140c: // French Luxemburg
    case 0x340c: // French Mali
    case 0x180c: // French Monaco
    case 0x200c: // French Reunion
    case 0x280c: // French Senegal
    case 0x100c: // French Swiss
    case 0x1c0c: // French West Indies
    case 0x240c: // French Zaire
        return "fr";
    case 0x0462: // Frisian
        return "fy";
    case 0x043c: // Gaelic
    case 0x083c: // Gaelic Ireland
        return "ga";
    case 0x0456: // Galician
        return "gl";
    case 0x0437: // Georgian
        return "ka";
    case 0x0407: // German
    case 0x0c07: // German Austrian
    case 0x1407: // German Liechtenstein
    case 0x1007: // German Luxemburg
    case 0x0807: // German Switzerland
        return "de";
    case 0x0408: // Greek
        return "el";
    case 0x0447: // Gujarati
        return "gu";
    case 0x040d: // Hebrew
        return "he";
    case 0x0439: // Hindi
        return "hi";
    case 0x040e: // Hungarian
        return "hu";
    case 0x040f: // Icelandic
        return "is";
    case 0x0421: // Indonesian
        return "id";
    case 0x0410: // Italian
    case 0x0810: // Italian Switzerland
        return "it";
    case 0x0411: // Japanese
        return "ja";
    case 0x044b: // Kannada
        return "kn";
    case 0x0460: // Kashmiri
    case 0x0860: // Kashmiri India
        return "ks";
    case 0x043f: // Kazakh
        return "kk";
    case 0x0453: // Khmer
        return "km";
    case 0x0440: // Kirghiz
        return "ky";
    case 0x0457: // Konkani
        return "kok";
    case 0x0412: // Korean
    case 0x0812: // Korean Johab
        return "ko";
    case 0x0454: // Lao
        return "lo";
    case 0x0426: // Latvian
        return "lv";
    case 0x0427: // Lithuanian
    case 0x0827: // Lithuanian Classic
        return "lt";
    case 0x042f: // Macedonian
        return "mk";
    case 0x043e: // Malay (this ID used to be mislabelled as Macedonian here)
    case 0x083e: // Malay Brunei Darussalam
        return "ms";
    case 0x044c: // Malayalam
        return "ml";
    case 0x043a: // Maltese
        return "mt";
    case 0x0458: // Manipuri
        return "mni";
    case 0x044e: // Marathi
        return "mr";
    case 0x0450: // Mongolian
        return "mn";
    case 0x0461: // Nepali
    case 0x0861: // Nepali India
        return "ne";
    case 0x0414: // Norwegian Bokmal
        return "nb";
    case 0x0814: // Norwegian Nynorsk
        return "nn";
    case 0x0448: // Oriya
        return "or";
    case 0x0415: // Polish
        return "pl";
    case 0x0416: // Portuguese Brazil
    case 0x0816: // Portuguese Iberian
        return "pt";
    case 0x0446: // Punjabi
        return "pa";
    case 0x0417: // Rhaeto-Romanic
        return "rm";
    case 0x0418: // Romanian
    case 0x0818: // Romanian Moldova
        return "ro";
    case 0x0419: // Russian
    case 0x0819: // Russian Moldova
        return "ru";
    case 0x043b: // Sami Lappish
        return "se";
    case 0x044f: // Sanskrit
        return "sa";
    case 0x0c1a: // Serbian Cyrillic
    case 0x081a: // Serbian Latin
        return "sr";
    case 0x0459: // Sindhi
        return "sd";
    case 0x041b: // Slovak
        return "sk";
    case 0x0424: // Slovenian
        return "sl";
    case 0x042e: // Sorbian
        return "hsb";
    case 0x2c0a: // Spanish Argentina
    case 0x400a: // Spanish Bolivia
    case 0x340a: // Spanish Chile
    case 0x240a: // Spanish Colombia
    case 0x140a: // Spanish Costa Rica
    case 0x1c0a: // Spanish Dominican Republic
    case 0x300a: // Spanish Ecuador
    case 0x440a: // Spanish El Salvador
    case 0x100a: // Spanish Guatemala
    case 0x480a: // Spanish Honduras
    case 0x080a: // Spanish Mexico
    case 0x0c0a: // Spanish Modern
    case 0x4c0a: // Spanish Nicaragua
    case 0x180a: // Spanish Panama
    case 0x3c0a: // Spanish Paraguay
    case 0x280a: // Spanish Peru
    case 0x500a: // Spanish Puerto Rico
    case 0x040a: // Spanish Traditional
    case 0x380a: // Spanish Uruguay
    case 0x200a: // Spanish Venezuela
        return "es";
    case 0x0430: // Sutu
        return "st";
    case 0x0441: // Swahili
        return "sw";
    case 0x041d: // Swedish
    case 0x081d: // Swedish Finland
        return "sv";
    case 0x0428: // Tajik
        return "tg";
    case 0x0449: // Tamil
        return "ta";
    case 0x0444: // Tatar
        return "tt";
    case 0x044a: // Telugu
        return "te";
    case 0x041e: // Thai
        return "th";
    case 0x0451: // Tibetan
        return "bo";
    case 0x0431: // Tsonga
        return "ts";
    case 0x0432: // Tswana
        return "tn";
    case 0x041f: // Turkish
        return "tr";
    case 0x0442: // Turkmen
        return "tk";
    case 0x0422: // Ukrainian
        return "uk";
    case 0x0420: // Urdu
    case 0x0820: // Urdu India
        return "ur";
    case 0x0843: // Uzbek Cyrillic
    case 0x0443: // Uzbek Latin
        return "uz";
    case 0x0433: // Venda
        return "ve";
    case 0x042a: // Vietnamese
        return "vi";
    case 0x0452: // Welsh
        return "cy";
    case 0x0434: // Xhosa
        return "xh";
    case 0x043d: // Yiddish
        return "yi";
    case 0x0435: // Zulu
        return "zu";
    default:
        return NULL;
    }
}
1227 
GetCharsetByte2UnicodeTable(int codepage)1228 const lChar32 * GetCharsetByte2UnicodeTable( int codepage )
1229 {
1230     switch ( codepage )
1231     {
1232     case 1251:
1233         return __cp1251;
1234     case 1257:
1235         return __cp1257;
1236     case 204:
1237         return __cp1251;
1238     case 1252:
1239         return __cp1252;
1240     case 1253:
1241         return __cp1253;
1242     case 1254:
1243         return __cp1254;
1244     case 737:
1245         return __cp737;
1246     case 1250: return __cp1250;
1247     case 866:  return __cp866;
1248     case 850:  return __cp850;
1249     default:   return __cp1252;
1250     }
1251 }
1252 
GetCharsetName(int codepage)1253 const lChar32 * GetCharsetName( int codepage )
1254 {
1255     switch ( codepage )
1256     {
1257     case 1251:
1258         return U"cp1251";
1259     case 1257:
1260         return U"cp1257";
1261     case 204:
1262         return U"cp1251";
1263     case 1252:
1264         return U"cp1252";
1265     case 1253:
1266         return U"cp1253";
1267     case 737:
1268         return U"cp737";
1269     case 1250: return U"cp1250";
1270     case 866:  return U"cp866";
1271     case 850:  return U"cp850";
1272     default:   return U"cp1252";
1273     }
1274 }
1275 
// Unicode page 0x00 (U+0000..U+00FF) -> cp1252 byte, indexed by the low byte
// of the code point. Every entry equals its own index except 0x80..0x9f,
// which hold 0x00 (presumably meaning "no cp1252 equivalent").
static unsigned char cp1252_page00[256] = {
	/* 0x00 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	/* 0x10 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
	/* 0x20 */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
	/* 0x30 */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
	/* 0x40 */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
	/* 0x50 */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
	/* 0x60 */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
	/* 0x70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,

	/* 0x80 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x90 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0xa0 */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
	/* 0xb0 */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
	/* 0xc0 */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
	/* 0xd0 */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
	/* 0xe0 */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
	/* 0xf0 */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
};
1311 
1312 static unsigned char *cp1252_page_uni2charset[256] = {
1313 	cp1252_page00, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
1314 };
1315 
// Unicode page 0x00 (U+0000..U+00FF) -> cp1251 byte, indexed by the low byte
// of the code point. ASCII maps to itself; above 0x7f only the Latin-1
// symbols that exist in cp1251 have a non-zero entry (0x00 presumably means
// "no cp1251 equivalent"). Rows 0xc0..0xff are Latin-1 letters, none of which
// exist in cp1251, so they are all zero.
static unsigned char cp1251_page00[256] = {
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
	0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */
	0x00, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */
	0xb0, 0xb1, 0x00, 0x00, 0x00, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
	0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
};
1351 
// Unicode page 0x04 (Cyrillic, U+0400..U+04FF) -> cp1251 byte, indexed by the
// low byte of the code point. Entries past the last initializer are
// zero-initialized (0x00 presumably means "no cp1251 equivalent").
static unsigned char cp1251_page04[256] = {
	0x00, 0xa8, 0x80, 0x81, 0xaa, 0xbd, 0xb2, 0xaf, /* 0x00-0x07 */
	0xa3, 0x8a, 0x8c, 0x8e, 0x8d, 0x00, 0xa1, 0x8f, /* 0x08-0x0f */
	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x10-0x17 */
	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x18-0x1f */
	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x20-0x27 */
	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0x28-0x2f */
	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x30-0x37 */
	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x38-0x3f */
	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x40-0x47 */
	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0x48-0x4f */
	0x00, 0xb8, 0x90, 0x83, 0xba, 0xbe, 0xb3, 0xbf, /* 0x50-0x57 */
	0xbc, 0x9a, 0x9c, 0x9e, 0x9d, 0x00, 0xa2, 0x9f, /* 0x58-0x5f */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */

	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
	// U+0490 (GHE WITH UPTURN, capital) is cp1251 0xa5 and U+0491 (small) is 0xb4
	0xa5, 0xb4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
};
1374 
// Unicode page 0x20 (U+2000..U+20FF: dashes, quotes, daggers, bullet,
// ellipsis, per mille, angle quotes, Euro sign) -> cp1251 byte, indexed by the
// low byte of the code point. Entries past the last initializer are
// zero-initialized (0x00 presumably means "no cp1251 equivalent").
static unsigned char cp1251_page20[256] = {
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
	0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */
	0x91, 0x92, 0x82, 0x00, 0x93, 0x94, 0x84, 0x00, /* 0x18-0x1f */
	0x86, 0x87, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
	0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
	0x00, 0x8b, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */

	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
	// U+20AC (Euro sign) is cp1251 byte 0x88
	0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
};
1385 
// Unicode page 0x21 (U+2100..U+21FF) -> cp1251 byte, indexed by the low byte
// of the code point. Only two entries are non-zero: 0x16 -> 0xb9 and
// 0x22 -> 0x99. Entries past the last initializer are zero-initialized.
static unsigned char cp1251_page21[256] = {
	/* 0x00 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x10 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x20 */ 0x00, 0x00, 0x99,
};
1393 
1394 static unsigned char *cp1251_page_uni2charset[256] = {
1395 	cp1251_page00, NULL,   NULL,   NULL,   cp1251_page04, NULL,   NULL,   NULL,
1396 	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
1397 	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
1398 	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,
1399 	cp1251_page20, cp1251_page21, NULL,	NULL,   NULL,   NULL,   NULL,   NULL,
1400 };
1401 
1402 /// add other encodings here
1403 static struct {
1404     const char * name;
1405     unsigned char ** table;
1406 } _uni2byte_enc_table[] = {
1407     {"windows-1251", cp1251_page_uni2charset},
1408     {"cp1251", cp1251_page_uni2charset},
1409     {"windows-1252", cp1252_page_uni2charset},
1410     {"cp1252", cp1252_page_uni2charset},
1411     {NULL, NULL}
1412 };
1413 
GetCharsetUnicode2ByteTable(const lChar32 * enc_name)1414 const lChar8 ** GetCharsetUnicode2ByteTable( const lChar32 * enc_name )
1415 {
1416     lString32 s( enc_name );
1417     s.lowercase();
1418     const lChar32 * encoding_name = s.c_str();
1419     for (int i=0; _uni2byte_enc_table[i].name!=NULL; i++)
1420     {
1421         if ( !lStr_cmp(encoding_name, _uni2byte_enc_table[i].name) )
1422         {
1423             return (const lChar8 **)_uni2byte_enc_table[i].table;
1424         }
1425     }
1426     return NULL; // not found
1427 }
1428 
1429 
1430 
1431 // AUTODETECT ENCODINGS feature
1432 #define DBL_CHAR_STAT_SIZE 256
1433 
1434 class CDoubleCharStat
1435 {
1436 
1437    struct CDblCharNode
1438    {
1439       unsigned char ch1;
1440       unsigned char ch2;
1441       unsigned int  count;
1442       unsigned int  index;
1443       CDblCharNode * left;
1444       CDblCharNode * right;
1445       CDblCharNode * sleft;
1446       CDblCharNode * sright;
CDblCharNodeCDoubleCharStat::CDblCharNode1447       CDblCharNode( unsigned char c1, unsigned char c2 ) :
1448          ch1(c1), ch2(c2), count(1), index(0), left(NULL), right(NULL),
1449          sleft(NULL), sright(NULL)
1450       {
1451       }
~CDblCharNodeCDoubleCharStat::CDblCharNode1452       ~CDblCharNode()
1453       {
1454          if (left)
1455             delete left;
1456          if (right)
1457             delete right;
1458       }
operator <CDoubleCharStat::CDblCharNode1459       bool operator < (const CDblCharNode & node )
1460       {
1461          return (ch1<node.ch2) || (ch1==node.ch1 && ch2<node.ch2);
1462       }
operator ==CDoubleCharStat::CDblCharNode1463       bool operator == (const CDblCharNode & node )
1464       {
1465          return (ch1==node.ch1) && (ch2=node.ch2);
1466       }
AddCDoubleCharStat::CDblCharNode1467       static inline void Add( CDblCharNode * & pnode, unsigned char c1, unsigned char c2 )
1468       {
1469          if (pnode)
1470             pnode->Add( c1, c2 );
1471          else
1472             pnode = new CDblCharNode( c1, c2 );
1473       }
AddCDoubleCharStat::CDblCharNode1474       void Add( unsigned char c1, unsigned char c2 )
1475       {
1476          if (c1==ch1 && c2==ch2) {
1477             count++; // found
1478          } else if (c1<ch1 || (c1==ch1 && c2<ch2) ) {
1479             Add(left, c1, c2 );
1480          } else {
1481             Add(right, c1, c2 );
1482          }
1483       }
AddSortedCDoubleCharStat::CDblCharNode1484       void AddSorted( CDblCharNode * & sroot )
1485       {
1486          if (!sroot)
1487             sroot = this;
1488          else if (count>sroot->count)
1489             AddSorted( sroot->sleft );
1490          else
1491             AddSorted( sroot->sright );
1492       }
SortCDoubleCharStat::CDblCharNode1493       void Sort( CDblCharNode * & sroot )
1494       {
1495          if (left)
1496             left->Sort( sroot );
1497          AddSorted( sroot );
1498          if (right)
1499             right->Sort( sroot );
1500       }
RenumberCDoubleCharStat::CDblCharNode1501       void Renumber( int & curr_index )
1502       {
1503          if (sleft)
1504             sleft->Renumber( curr_index );
1505          index = curr_index++;
1506          if (sright)
1507             sright->Renumber( curr_index );
1508       }
Renumber1CDoubleCharStat::CDblCharNode1509       void Renumber1( int & curr_index )
1510       {
1511          if (left)
1512             left->Renumber1( curr_index );
1513          index = curr_index++;
1514          if (right)
1515             right->Renumber1( curr_index );
1516       }
GetDataCDoubleCharStat::CDblCharNode1517       void GetData( dbl_char_stat_long_t * & pData, int & len, unsigned int maxindex )
1518       {
1519          if (len<=0)
1520             return;
1521          if (left)
1522             left->GetData( pData, len, maxindex );
1523          if (len<=0)
1524             return;
1525          if (index<maxindex)
1526          {
1527             pData->ch1 = ch1;
1528             pData->ch2 = ch2;
1529             pData->count = count;
1530             pData++;
1531             len--;
1532          }
1533          if (len<=0)
1534             return;
1535          if (right)
1536             right->GetData( pData, len, maxindex );
1537       }
1538    };
1539 
1540    CDblCharNode * nodes;
1541    int total;
1542 public:
CDoubleCharStat()1543    CDoubleCharStat() : nodes(NULL), total(0)
1544    {
1545    }
Add(unsigned char c1,unsigned char c2)1546    void Add( unsigned char c1, unsigned char c2 )
1547    {
1548 /*   	if ( !(c1>127 || c1>='a' && c1<='z' || c1>='A' && c1<='Z' || c1=='\'')
1549            && !(c2>127 || c2>='a' && c2<='z' || c2>='A' && c2<='Z' || c2=='\'') )
1550       {
1551          return;
1552       }
1553       */
1554       if (c1==' ' && c2==' ')
1555          return;
1556       total++;
1557       CDblCharNode::Add( nodes, c1, c2 );
1558    }
GetData(dbl_char_stat_t * pData,int len)1559    void GetData( dbl_char_stat_t * pData, int len )
1560    {
1561        dbl_char_stat_long_t data[DBL_CHAR_STAT_SIZE];
1562       dbl_char_stat_long_t * pData2 = data;
1563       int len2 = len;
1564       int idx = 0;
1565       if (nodes && total)
1566       {
1567          nodes->Renumber1( idx );
1568          idx = 0;
1569          if (nodes->left)
1570             nodes->left->Sort(nodes);
1571          if (nodes->right)
1572             nodes->right->Sort(nodes);
1573          //nodes->Sort( nodes );
1574          nodes->Renumber( idx );
1575          nodes->GetData( pData2, len2, len2 );
1576       }
1577       // fill rest of array
1578       for ( ; len2>0; len2--, pData2++ ) {
1579          pData2->ch1 = 0;
1580          pData2->ch2 = 0;
1581          pData2->count = 0;
1582       }
1583       // scale by total
1584       if (total) {
1585           for (int i=0; i<len; i++) {
1586               if ( data[i].count<0 ) {
1587                     data[i].count = -data[i].count;
1588               }
1589               data[i].count = (int)(data[i].count * (lInt64)0x7000 / total);
1590           }
1591       }
1592       for ( int i=0; i<len; i++ ) {
1593            pData[i].ch1 = data[i].ch1;
1594            pData[i].ch2 = data[i].ch2;
1595            pData[i].count = data[i].count;
1596       }
1597       Close();
1598    }
Close()1599    void Close()
1600    {
1601       if (nodes)
1602          delete nodes;
1603       nodes = NULL;
1604       total = 0;
1605    }
~CDoubleCharStat()1606    virtual ~CDoubleCharStat()
1607    {
1608       Close();
1609    }
1610 };
1611 
sort_dblstats_by_count(const void * p1,const void * p2)1612 int sort_dblstats_by_count( const void * p1, const void * p2 )
1613 {
1614     int n1 = static_cast<const dbl_char_stat_long_t*>(p1)->count;
1615     int n2 = static_cast<const dbl_char_stat_long_t*>(p2)->count;
1616     if ( n1>n2 )
1617         return -1;
1618     else if ( n2>n1 )
1619         return 1;
1620     else
1621         return 0;
1622 }
1623 
sort_dblstats_by_ch(const void * p1,const void * p2)1624 int sort_dblstats_by_ch( const void * p1, const void * p2 )
1625 {
1626     const dbl_char_stat_long_t* n1 = static_cast<const dbl_char_stat_long_t*>(p1);
1627     const dbl_char_stat_long_t* n2 = static_cast<const dbl_char_stat_long_t*>(p2);
1628     if ( n1->ch1>n2->ch1 )
1629         return 1;
1630     else if ( n1->ch1<n2->ch1 )
1631         return -1;
1632     if ( n1->ch2>n2->ch2 )
1633         return 1;
1634     else if ( n1->ch2<n2->ch2 )
1635         return -1;
1636     else
1637         return 0;
1638 }
1639 
1640 class CDoubleCharStat2
1641 {
1642 private:
1643     lUInt16 * * stats;
1644     int total;
1645     int items;
1646 public:
CDoubleCharStat2()1647     CDoubleCharStat2() : stats(NULL), total(0), items(0)
1648     {
1649     }
Add(unsigned char c1,unsigned char c2)1650     void Add( unsigned char c1, unsigned char c2 )
1651     {
1652         if ( !stats ) {
1653             stats = new lUInt16* [256]();
1654         }
1655         if (c1==' ' && c2==' ')
1656             return;
1657         total++;
1658         if ( stats[c1]==NULL ) {
1659             stats[c1] = new lUInt16[256]();
1660         }
1661         if ( stats[c1][c2]++ == 0)
1662             items++;
1663     }
GetData(dbl_char_stat_t * pData,int len)1664     void GetData( dbl_char_stat_t * pData, int len )
1665     {
1666         int count = 0;
1667         dbl_char_stat_long_t * pdata = new dbl_char_stat_long_t[items];
1668         if ( total ) {
1669             for ( int i=0; i<256; i++ ) {
1670                 if ( stats[i] ) {
1671                     for ( int j=0; j<256; j++ ) {
1672                         if ( stats[i][j]> 0 ) {
1673                             pdata[count].ch1 = i;
1674                             pdata[count].ch2 = j;
1675                             int n = stats[i][j];
1676                             n = (int)(n * (lInt64)0x7000 / total);
1677                             pdata[count].count = n;
1678                             count++;
1679                         }
1680                     }
1681                 }
1682             }
1683             qsort(pdata, count, sizeof(dbl_char_stat_long_t), sort_dblstats_by_count);
1684             int nsort = count;
1685             if ( nsort>len )
1686                 nsort = len;
1687             qsort(pdata, nsort, sizeof(dbl_char_stat_long_t), sort_dblstats_by_ch);
1688         }
1689         // copy data to destination
1690         for ( int k=0; k<len; k++ ) {
1691             if ( k<count ) {
1692                 pData[k].ch1 = pdata[k].ch1;
1693                 pData[k].ch2 = pdata[k].ch2;
1694                 pData[k].count = pdata[k].count;
1695             } else {
1696                 pData[k].ch1 = 0;
1697                 pData[k].ch2 = 0;
1698                 pData[k].count = 0;
1699             }
1700         }
1701         delete[] pdata;
1702         Close();
1703    }
1704 
Close()1705    void Close()
1706    {
1707        if ( stats ) {
1708            for ( int i=0; i<256; i++ )
1709                if ( stats[i] )
1710                    delete[] stats[i];
1711            delete[] stats;
1712            stats = NULL;
1713        }
1714        total = 0;
1715    }
1716 
~CDoubleCharStat2()1717    virtual ~CDoubleCharStat2()
1718    {
1719        Close();
1720    }
1721 };
1722 
isValidUtf8Data(const unsigned char * buf,int buf_size)1723 bool isValidUtf8Data( const unsigned char * buf, int buf_size )
1724 {
1725     const unsigned char * start = buf;
1726     const unsigned char * end_buf = buf + buf_size - 5;
1727     while ( buf < end_buf ) {
1728         lUInt8 ch = *buf++;
1729         if ( (ch & 0x80) == 0 ) {
1730         } else if ( (ch & 0xC0) == 0x80 ) {
1731             CRLog::trace("unexpected char %02x at position %x, str=%s", ch, (buf-1-start), lString8((const char *)(buf-1), 32).c_str());
1732             return false;
1733         } else if ( (ch & 0xE0) == 0xC0 ) {
1734             ch = *buf++;
1735             if ( (ch & 0xC0) != 0x80 ) {
1736                 CRLog::trace("unexpected char %02x at position %x, str=%s", ch, (buf-1-start), lString8((const char *)(buf-1), 32).c_str());
1737                 return false;
1738             }
1739         } else if ( (ch & 0xF0) == 0xE0 ) {
1740             ch = *buf++;
1741             if ( (ch & 0xC0) != 0x80 )
1742                 return false;
1743             ch = *buf++;
1744             if ( (ch & 0xC0) != 0x80 )
1745                 return false;
1746         } else if ( (ch & 0xF8) == 0xF0 ) {
1747             ch = *buf++;
1748             if ( (ch & 0xC0) != 0x80 )
1749                 return false;
1750             ch = *buf++;
1751             if ( (ch & 0xC0) != 0x80 )
1752                 return false;
1753             ch = *buf++;
1754             if ( (ch & 0xC0) != 0x80 )
1755                 return false;
1756         } else {
1757             return false;
1758         }
1759     }
1760     return true;
1761 }
1762 
MakeDblCharStat(const unsigned char * buf,int buf_size,dbl_char_stat_t * stat,int stat_len,bool skipHtml)1763 void MakeDblCharStat(const unsigned char * buf, int buf_size, dbl_char_stat_t * stat, int stat_len, bool skipHtml)
1764 {
1765    CDoubleCharStat2 maker;
1766    unsigned char ch1=' ';
1767    unsigned char ch2=' ';
1768    bool insideTag = false;
1769    for ( int i=1; i<buf_size; i++) {
1770       lChar8 ch = buf[i];
1771       if (skipHtml) {
1772           if (ch == '<') {
1773               insideTag = true;
1774               continue;
1775           } else if (ch == '>') {
1776               insideTag = false;
1777               ch = ' ';
1778           }
1779       }
1780       if (insideTag)
1781           continue;
1782       ch1 = ch2;
1783       ch2 = ch;
1784       if ( ch2<128 && ch2!='\'' && !( (ch2>='a' && ch2<='z') || (ch2>='A' && ch2<='Z')) )
1785          ch2 = ' ';
1786       //if (i>0)
1787       maker.Add( ch1, ch2 );
1788    }
1789    maker.GetData( stat, stat_len );
1790 }
1791 
MakeCharStat(const unsigned char * buf,int buf_size,short stat_table[256],bool skipHtml)1792 void MakeCharStat(const unsigned char * buf, int buf_size, short stat_table[256], bool skipHtml)
1793 {
1794    int stat[256] = { 0 };
1795    int total=0;
1796    unsigned char ch;
1797    bool insideTag = false;
1798    for (int i=0; i<buf_size; i++) {
1799       ch = buf[i];
1800       if (skipHtml) {
1801           if (ch == '<') {
1802               insideTag = true;
1803               continue;
1804           }
1805           if (ch == '>') {
1806               insideTag = false;
1807               continue;
1808           }
1809           if (insideTag)
1810               continue;
1811       }
1812       if ( ch>127 || (ch>='a' && ch<='z') || (ch>='A' && ch<='Z') || ch=='\'') {
1813          stat[ch]++;
1814          total++;
1815       }
1816    }
1817    if (total) {
1818       for (int i=0; i<256; i++) {
1819          stat_table[i] = (short)(stat[i] * (lInt64)0x7000 / total);
1820       }
1821    }
1822 }
1823 
/// Compares two 256-entry single-byte frequency tables.
///
/// @param stat1  first table (256 entries)
/// @param stat2  second table (256 entries)
/// @param k1     receives the sum over all bytes of
///               stat1[i] * stat2[i] / 0x7000 / 0x7000
/// @param k2     receives the same sum restricted to bytes 128..255
/// @return sum of |stat1[i] - stat2[i]|, divided by 0x7000 and then by 256
double CompareCharStats( const short * stat1, const short * stat2, double &k1, double &k2 )
{
   double absDiffTotal = 0;
   double productAll = 0;
   double productHigh = 0;
   int idx = 0;
   while ( idx < 256 ) {
      const double weighted = (double)stat1[idx] * stat2[idx] / 0x7000 / 0x7000;
      productAll += weighted;
      if ( idx >= 128 )
         productHigh += weighted;
      const int diff = stat1[idx] - stat2[idx];
      absDiffTotal += ( diff < 0 ) ? -diff : diff;
      idx++;
   }
   k1 = productAll;
   k2 = productHigh;
   return absDiffTotal / 0x7000 / 256;
}
1843 
CompareDblCharStats(const dbl_char_stat_t * stat1,const dbl_char_stat_t * stat2,int stat_len,double & k1,double & k2)1844 double CompareDblCharStats( const dbl_char_stat_t * stat1, const dbl_char_stat_t * stat2, int stat_len, double &k1, double &k2 )
1845 {
1846    double sum = 0;
1847    int len1 = stat_len;
1848    int len2 = stat_len;
1849    double psum = 0;
1850    double psum2 = 0;
1851    while (len1 && len2) {
1852       //
1853       if (stat1->ch1==stat2->ch1 && stat1->ch2==stat2->ch2) {
1854           if (stat1->ch1 != ' ' || stat1->ch2 != ' ') {
1855              // add stat
1856              int delta = (stat1->count - stat2->count);
1857              if (delta<0)
1858                 delta = -delta;
1859              sum += delta;
1860              psum += ( (double)stat1->count * stat2->count / 0x7000 / 0x7000);
1861              if (stat1->ch1>=128 || stat1->ch2>=128)
1862                 psum2 += ( (double)stat1->count * stat2->count / 0x7000 / 0x7000);
1863           }
1864           // move both
1865           stat1++;
1866           len1--;
1867           stat2++;
1868           len2--;
1869       } else if ( stat1->ch1<stat2->ch1 || (stat1->ch1==stat2->ch1 && stat1->ch2<stat2->ch2) ) {
1870          // add stat
1871          //int delta = (stat1->count);
1872          sum += stat1->count;
1873          // move 1st
1874          stat1++;
1875          len1--;
1876       } else {
1877          // add stat
1878          //int delta = (stat2->count);
1879          sum += stat2->count;
1880          stat2++;
1881          len2--;
1882       }
1883    }
1884    sum /= 0x7000;
1885    k1 = psum;
1886    k2 = psum2;
1887    return sum / stat_len;
1888 }
1889 
1890 
1891 //==========================================
1892 // Stats
// One entry of the codepage statistics table: reference statistics for a
// single codepage / language combination.
typedef struct {
	const short * ch_stat;       // single-byte frequency table; CompareCharStats reads 256 entries
    const dbl_char_stat_t * dbl_ch_stat; // byte-pair statistics; compared over DBL_CHAR_STAT_SIZE entries
	char * cp_name;   // codepage name
	char * lang_name; // language name
} cp_stat_t;
// Table of reference statistics, defined in another translation unit.
// AutodetectCodePage iterates it until an entry with a NULL ch_stat.
extern cp_stat_t cp_stat_table[];
1901 
AutodetectCodePageUtf(const unsigned char * buf,int buf_size,char * cp_name,char * lang_name)1902 int AutodetectCodePageUtf( const unsigned char * buf, int buf_size, char * cp_name, char * lang_name )
1903 {
1904     // checking byte order signatures
1905     if ( buf[0]==0xEF && buf[1]==0xBB && buf[2]==0xBF ) {
1906         strcpy( cp_name, "utf-8" );     // NOLINT: strcpy is fine with hardcoded string with len < 32
1907         strcpy( lang_name, "en" );      // NOLINT
1908         return 1;
1909     } else if ( buf[0]==0 && buf[1]==0 && buf[2]==0xFE && buf[3]==0xFF ) {
1910         strcpy( cp_name, "utf-32be" ); // NOLINT
1911         strcpy( lang_name, "en" );     // NOLINT
1912         return 1;
1913     } else if ( buf[0]==0xFE && buf[1]==0xFF ) {
1914         strcpy( cp_name, "utf-16be" ); // NOLINT
1915         strcpy( lang_name, "en" );     // NOLINT
1916         return 1;
1917     } else if ( buf[0]==0xFF && buf[1]==0xFE && buf[2]==0 && buf[3]==0 ) {
1918         strcpy( cp_name, "utf-32le" ); // NOLINT
1919         strcpy( lang_name, "en" );     // NOLINT
1920         return 1;
1921     } else if ( buf[0]==0xFF && buf[1]==0xFE ) {
1922         strcpy( cp_name, "utf-16le" ); // NOLINT
1923         strcpy( lang_name, "en" );     // NOLINT
1924         return 1;
1925     }
1926     if ( isValidUtf8Data( buf, buf_size ) ) {
1927         strcpy( cp_name, "utf-8" );    // NOLINT
1928         strcpy( lang_name, "en" );     // NOLINT
1929         return 1;
1930     }
1931    return 0;
1932 }
1933 
/// Case-insensitive (ASCII only) comparison of the start of buf with pattern.
///
/// At most len bytes are compared, and comparison stops at the end of
/// pattern, so a pattern that is a prefix of buf compares equal. A NUL byte
/// in buf before the pattern is exhausted is an ordinary mismatch; it does
/// not end the comparison as "equal".
///
/// @param buf      bytes to test; must be readable up to the first mismatch,
///                 or for min(len, strlen(pattern)) bytes
/// @param pattern  NUL-terminated pattern
/// @param len      maximum number of bytes to compare
/// @return 0 if equal over the compared range, -1 if buf sorts before
///         pattern, 1 if after
int strincmp(const unsigned char * buf, const char * pattern, int len)
{
    for (int i = 0; i < len && pattern[i]; i++) {
        int ch = buf[i];
        if (ch >= 'A' && ch <= 'Z')
            ch += 'a' - 'A';
        // compare as unsigned so bytes >= 0x80 order the same way as in buf
        int ch2 = (unsigned char)pattern[i];
        if (ch2 >= 'A' && ch2 <= 'Z')
            ch2 += 'a' - 'A';
        if (ch != ch2)
            return ch < ch2 ? -1 : 1;
    }
    return 0;
}
1950 
strnstr(const unsigned char * buf,int buf_len,const char * pattern)1951 int strnstr(const unsigned char * buf, int buf_len, const char * pattern)
1952 {
1953     int plen = (int)strlen(pattern);
1954     for (int i=0; i<=buf_len - plen; i++) {
1955         if (!strincmp(buf + i, pattern, plen)) {
1956             return i;
1957         }
1958     }
1959     return -1;
1960 }
1961 
rstrnstr(const unsigned char * buf,int buf_len,const char * pattern)1962 int rstrnstr(const unsigned char * buf, int buf_len, const char * pattern)
1963 {
1964     int plen = (int)strlen(pattern);
1965     for (int i=buf_len - plen; i>=0; i--) {
1966         if (!strincmp(buf + i, pattern, plen)) {
1967             return i;
1968         }
1969     }
1970     return -1;
1971 }
1972 
detectXmlHtmlEncoding(const unsigned char * buf,int buf_len,char * html_enc_name)1973 bool detectXmlHtmlEncoding(const unsigned char * buf, int buf_len, char * html_enc_name)
1974 {
1975     int xml_p = strnstr(buf, buf_len, "<?xml");
1976     int xml_end_p = strnstr(buf, buf_len, "?>");
1977     if (xml_p >= 0 && xml_end_p > xml_p) {
1978         // XML
1979         int enc_p = strnstr(buf, buf_len, "encoding=\"");
1980         if (enc_p < xml_p || enc_p > xml_end_p)
1981             return false;
1982         enc_p += 10;
1983         int enc_end_p = strnstr(buf + enc_p, xml_end_p - enc_p, "\"");
1984         if (enc_end_p < 0 || enc_end_p > 20)
1985             return false;
1986         strncpy(html_enc_name, (char *)(buf + enc_p), enc_end_p);
1987         html_enc_name[enc_end_p] = 0;
1988         CRLog::debug("XML header encoding detected: %s", html_enc_name);
1989         return true;
1990     }
1991     int content_type_p = strnstr(buf, buf_len, "http-equiv=\"Content-Type\"");
1992     if (content_type_p >= 0) {
1993         int meta_p = rstrnstr(buf, content_type_p, "<meta");
1994         if (meta_p < 0)
1995             return false;
1996         int meta_end_p = strnstr(buf + meta_p, buf_len - meta_p, ">");
1997         if (meta_end_p < 0)
1998             return false;
1999         int charset_p = strnstr(buf + meta_p, meta_end_p, "charset=");
2000         if (charset_p < 0)
2001             return false;
2002         charset_p += 8;
2003         int charset_end_p = strnstr(buf + meta_p + charset_p, meta_end_p - charset_p, "\"");
2004         if (charset_end_p < 0)
2005             return false;
2006         strncpy(html_enc_name, (char *)(buf + meta_p + charset_p), charset_end_p);
2007         html_enc_name[charset_end_p] = 0;
2008         CRLog::debug("HTML header meta encoding detected: %s", html_enc_name);
2009         return true;
2010     }
2011     return false;
2012 }
2013 
AutodetectCodePage(const unsigned char * buf,int buf_size,char * cp_name,char * lang_name,bool skipHtml)2014 int AutodetectCodePage(const unsigned char * buf, int buf_size, char * cp_name, char * lang_name, bool skipHtml)
2015 {
2016     int res = AutodetectCodePageUtf( buf, buf_size, cp_name, lang_name );
2017     if ( res )
2018         return res;
2019     // use character statistics
2020    short char_stat[256];
2021    dbl_char_stat_t dbl_char_stat[DBL_CHAR_STAT_SIZE];
2022    MakeCharStat(buf, buf_size, char_stat, skipHtml);
2023    MakeDblCharStat(buf, buf_size, dbl_char_stat, DBL_CHAR_STAT_SIZE, skipHtml);
2024    int bestn = 0;
2025    double bestq = 0; //1000000;
2026    for (int i=0; cp_stat_table[i].ch_stat; i++) {
2027 	   double q12, q11;
2028 	   double q22, q21;
2029 	   double q1 = CompareCharStats( cp_stat_table[i].ch_stat, char_stat, q11, q12 );
2030 	   double q2 = CompareDblCharStats( cp_stat_table[i].dbl_ch_stat, dbl_char_stat, DBL_CHAR_STAT_SIZE, q21, q22 );
2031 //       double q_1 = q11 + 3*q12;
2032 //	   double q_2 = q21 + 5*q22;
2033 //	   double q_ = q_1 * q_2;
2034        if (q1 < 0.00001)
2035            q1 = 0.00001;
2036        if (q2 < 0.00001)
2037            q2 = 0.00001;
2038        double q = q11 * 0 + q12 * 2 + q21 * 0 + q22 * 6; //(q_>0) ? (q1*2+q2*7) / (q_) : 1000000;
2039        q = q / (q1 + q2);
2040        //CRLog::debug("%d %10s %4s : %lf %lf %lf - %lf %lf %lf  :  %lf", i, cp_stat_table[i].cp_name, cp_stat_table[i].lang_name, q1, q11, q12, q2, q21, q22, q);
2041        if (q > bestq) {
2042 		   bestn = i;
2043 		   bestq = q;
2044 	   }
2045    }
2046    strcpy(cp_name, cp_stat_table[bestn].cp_name);     // NOLINT: strcpy is fine, all strings are len < 32
2047    strcpy(lang_name, cp_stat_table[bestn].lang_name); // NOLINT
2048    CRLog::debug("Detected codepage:%s lang:%s index:%d %s", cp_name, lang_name, bestn, skipHtml ? "(skipHtml)" : "");
2049    if (skipHtml) {
2050        if (detectXmlHtmlEncoding(buf, buf_size, cp_name)) {
2051            CRLog::debug("Encoding parsed from XML/HTML: %s", cp_name);
2052        }
2053    }
2054    return 1;
2055 }
2056 
/// Guesses whether a buffer contains XML/HTML markup by counting angle
/// brackets: true when there are more than two '<' and more than two '>'
/// and the two counts differ by less than 2.
///
/// @param buf   bytes to inspect
/// @param size  number of bytes in buf
bool hasXmlTags(const lUInt8 * buf, int size) {
    int opened = 0;
    int closed = 0;
    for (int pos = 0; pos < size; pos++) {
        switch (buf[pos]) {
        case '<':
            opened++;
            break;
        case '>':
            closed++;
            break;
        default:
            break;
        }
    }
    if (opened <= 2 || closed <= 2)
        return false;
    int imbalance = opened - closed;
    if (imbalance < 0)
        imbalance = -imbalance;
    return imbalance < 2;
}
2075 
MakeStatsForFile(const char * fname,const char * cp_name,const char * lang_name,int index,FILE * f,lString8 & list)2076 void MakeStatsForFile( const char * fname, const char * cp_name, const char * lang_name, int index, FILE * f, lString8 & list )
2077 {
2078    FILE * in = fopen( fname, "rb" );
2079    if (!in)
2080       return;
2081    fseek( in, 0, SEEK_END );
2082    int buf_size = ftell(in);
2083    fseek( in, 0, SEEK_SET );
2084    unsigned char * buf = new unsigned char[buf_size];
2085    fread(buf, 1, buf_size, in);
2086    short char_stat[256] = { 0 };
2087    dbl_char_stat_t dbl_char_stat[DBL_CHAR_STAT_SIZE];
2088    bool skipHtml = hasXmlTags(buf, buf_size);
2089    MakeCharStat(buf, buf_size, char_stat, skipHtml);
2090    MakeDblCharStat(buf, buf_size, dbl_char_stat, DBL_CHAR_STAT_SIZE, skipHtml);
2091    fprintf(f, "\n\nstatic const short ch_stat_%s_%s%d[256]={\n", cp_name, lang_name, index);
2092    int i;
2093    for (i=0; i<16; i++)
2094    {
2095       for (int j=0; j<16; j++)
2096       {
2097          fprintf(f, "0x%04x,", (unsigned int)char_stat[i*16+j] );
2098       }
2099       fprintf(f, "// %d..%d\n", i*16, i*16+15 );
2100    }
2101    fprintf(f, "};\n\n" );
2102    fprintf(f, "static const dbl_char_stat_t dbl_ch_stat_%s_%s%d[%d] = {\n", cp_name, lang_name, index, DBL_CHAR_STAT_SIZE  );
2103    for (i=0; i<DBL_CHAR_STAT_SIZE/16; i++)
2104    {
2105       for (int j=0; j<16; j++)
2106       {
2107          fprintf(f, "{0x%02x,0x%02x,0x%04x}, ", (unsigned int)dbl_char_stat[i*16+j].ch1, (unsigned int)dbl_char_stat[i*16+j].ch2, (unsigned int)((lUInt16)dbl_char_stat[i*16+j].count) );
2108       }
2109       fprintf(f, "// %d..%d\n", i*16, i*16+15 );
2110    }
2111    char str[100];
2112    sprintf(str, "{ch_stat_%s_%s%d,dbl_ch_stat_%s_%s%d,\"%s\",\"%s\"}, \n", cp_name, lang_name, index, cp_name, lang_name, index, cp_name, lang_name );
2113    list += str;
2114    fprintf(f, "};\n\n" );
2115    delete [] buf;
2116    fclose(in);
2117 }
2118