1 #include "links.h"
2 
/*
 * One row of a codepage mapping table: byte `c` of the codepage
 * corresponds to code point `u`.  Every scan over such a table in this
 * file stops at the first row whose `c` is 0, so that row acts as the
 * terminator.
 */
struct table_entry {
	unsigned char c;	/* byte value inside the codepage */
	int u;			/* code point for that byte (fed to encode_utf_8()/u2cp()) */
};
7 
/*
 * Description of one codepage.  The codepages[] array is walked until an
 * element with a NULL `name` is found (see get_cp_index()).
 */
struct codepage_desc {
	unsigned char *name;		/* name returned by get_cp_name() */
	unsigned char **aliases;	/* NULL-terminated alias list; aliases[0] is returned by get_cp_mime_name() */
	struct table_entry *table;	/* byte <-> code point rows, terminated by a row with c == 0 */
};
13 
14 #include "codepage.inc"
15 #include "uni_7b.inc"
16 #include "entity.inc"
17 #include "upcase.inc"
18 
/*
 * strings[b] is byte b as a one-character, NUL-terminated string (each
 * row is two bytes wide), so strings[0] is the empty string.  The
 * conversion tables in this file point into this array instead of
 * allocating one-byte strings.
 *
 * NOTE(review): rows 0x17 and 0x1f hold "\033" rather than "\027" and
 * "\037".  Whether that is deliberate cannot be determined from this
 * file -- confirm before "fixing" it.
 */
unsigned char strings[256][2] = {
	"\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
	"\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
	"\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
	"\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
	"\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
	"\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
	"\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
	"\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
	"\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
	"\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
	"\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
	"\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
	"\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
	"\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
	"\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
	"\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
	"\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
	"\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
	"\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
	"\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
	"\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
	"\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
	"\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
	"\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
	"\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
	"\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
	"\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
	"\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
	"\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
	"\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
	"\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
	"\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
53 
free_translation_table(struct conv_table * p)54 void free_translation_table(struct conv_table *p)
55 {
56 	int i;
57 	for (i = 0; i < 256; i++) if (p[i].t) free_translation_table(p[i].u.tbl);
58 	mem_free(p);
59 }
60 
61 unsigned char no_str[] = "*";
62 
new_translation_table(struct conv_table * p)63 void new_translation_table(struct conv_table *p)
64 {
65 	int i;
66 	for (i = 0; i < 256; i++) if (p[i].t) free_translation_table(p[i].u.tbl);
67 	for (i = 0; i < 128; i++) p[i].t = 0, p[i].u.str = strings[i];
68 	for (; i < 256; i++) p[i].t = 0, p[i].u.str = no_str;
69 }
70 
/*
 * Binary search over an array of structs sorted ascending by member
 * `entry`.
 *
 *   table    array (or pointer) of structs
 *   entry    name of the member to compare
 *   entries  number of elements in `table`
 *   key      value to look for (evaluated several times -- no side effects)
 *   result   lvalue receiving the index of the match, or -1 if none
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement,
 * e.g. as the body of an if that is followed by an else.
 */
#define BIN_SEARCH(table, entry, entries, key, result)			\
do {									\
	int lo_ = 0, hi_ = (int)(entries) - 1;				\
	int hit_ = -1;							\
	while (lo_ <= hi_) {						\
		/* lo_ + (hi_ - lo_) / 2 cannot overflow */		\
		int mid_ = lo_ + (hi_ - lo_) / 2;			\
		if ((table)[mid_].entry == (key)) {			\
			hit_ = mid_;					\
			break;						\
		}							\
		if ((table)[mid_].entry > (key)) hi_ = mid_ - 1;	\
		else lo_ = mid_ + 1;					\
	}								\
	(result) = hit_;						\
} while (0)

/*
 * Replacement code points for values 0x80..0x9f, indexed by (u - 0x80);
 * used by u2cp().  An entry of 0 means "no replacement" and makes u2cp()
 * return NULL.
 * NOTE(review): the values look like the Windows-1252 meanings of these
 * bytes, with ASCII stand-ins for a few of them -- confirm against the
 * intended source before editing.
 */
int strange_chars[32] = {
0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
};
91 
u2cp(int u,int to,int fallback)92 static inline unsigned char *u2cp(int u, int to, int fallback)
93 {
94 	int j, s;
95 	again:
96 	if (u < 128) return strings[u];
97 	if (u == 0xa0) return strings[1];
98 	if (u == 0xad) return strings[0];
99 	if (u < 0xa0) {
100 		u = strange_chars[u - 0x80];
101 		if (!u) return NULL;
102 		goto again;
103 	}
104 	for (j = 0; codepages[to].table[j].c; j++)
105 		if (codepages[to].table[j].u == u)
106 			return strings[codepages[to].table[j].c];
107 	if (!fallback) return NULL;
108 	BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
109 	if (s != -1) return unicode_7b[s].s;
110 	return NULL;
111 }
112 
cp2u(unsigned char ch,int from)113 int cp2u(unsigned char ch, int from)
114 {
115 	struct table_entry *e;
116 	if (from < 0 || ch < 0x80) return ch;
117 	for (e = codepages[from].table; e->c; e++) if (e->c == ch) return e->u;
118 	return -1;
119 }
120 
/* Shared result buffer of encode_utf_8(): up to 6 bytes plus NUL. */
unsigned char utf_buffer[7];

/*
 * Encode code point `u` as a NUL-terminated UTF-8 byte sequence.
 *
 * Values up to 0xffff use 1-3 bytes and larger values use the 4-, 5- and
 * 6-byte forms.  u == 0 and negative values (which are not code points)
 * yield the empty string, so malformed bytes are never produced.
 *
 * Returns a pointer to the static utf_buffer; the contents are
 * overwritten by the next call, so copy the result if it must persist.
 */
unsigned char *encode_utf_8(int u)
{
	int tail;		/* number of continuation bytes */
	int lead, mask;		/* marker and payload mask of the first byte */
	int i;

	memset(utf_buffer, 0, sizeof(utf_buffer));
	if (u < 0)
		return utf_buffer;
	if (u < 0x80) {
		utf_buffer[0] = (unsigned char)u;
		return utf_buffer;
	}

	if (u < 0x800) {
		tail = 1; lead = 0xc0; mask = 0x1f;
	} else if (u < 0x10000) {
		tail = 2; lead = 0xe0; mask = 0x0f;
	} else if (u < 0x200000) {
		tail = 3; lead = 0xf0; mask = 0x07;
	} else if (u < 0x4000000) {
		tail = 4; lead = 0xf8; mask = 0x03;
	} else {
		tail = 5; lead = 0xfc; mask = 0x01;
	}

	utf_buffer[0] = (unsigned char)(lead | ((u >> (6 * tail)) & mask));
	/* Each continuation byte carries the next 6 bits, most significant first. */
	for (i = 1; i <= tail; i++)
		utf_buffer[i] = (unsigned char)(0x80 | ((u >> (6 * (tail - i))) & 0x3f));
	return utf_buffer;
}
153 
add_utf_8(struct conv_table * ct,int u,unsigned char * str)154 void add_utf_8(struct conv_table *ct, int u, unsigned char *str)
155 {
156 	unsigned char *p = encode_utf_8(u);
157 	while (p[1]) {
158 		if (ct[*p].t) ct = ct[*p].u.tbl;
159 		else {
160 			struct conv_table *nct;
161 			if (ct[*p].u.str != no_str) {
162 				internal("bad utf encoding #1");
163 				return;
164 			}
165 			nct = mem_alloc(sizeof(struct conv_table) * 256);
166 			memset(nct, 0, sizeof(struct conv_table) * 256);
167 			new_translation_table(nct);
168 			ct[*p].t = 1;
169 			ct[*p].u.tbl = nct;
170 			ct = nct;
171 		}
172 		p++;
173 	}
174 	if (ct[*p].t) {
175 		internal("bad utf encoding #2");
176 		return;
177 	}
178 	if (ct[*p].u.str == no_str) ct[*p].u.str = str;
179 }
180 
181 struct conv_table utf_table[256];
182 int utf_table_init = 1;
183 
free_utf_table()184 void free_utf_table()
185 {
186 	int i;
187 	for (i = 128; i < 256; i++) mem_free(utf_table[i].u.str);
188 }
189 
get_translation_table_to_utf_8(int from)190 struct conv_table *get_translation_table_to_utf_8(int from)
191 {
192 	int i;
193 	static int lfr = -1;
194 	if (from == -1) return NULL;
195 	if (from == lfr) return utf_table;
196 	lfr = from;
197 	if (utf_table_init) memset(utf_table, 0, sizeof(struct conv_table) * 256), utf_table_init = 0;
198 	else free_utf_table();
199 	for (i = 0; i < 128; i++) utf_table[i].u.str = strings[i];
200 	if (codepages[from].table == table_utf_8) {
201 		for (i = 128; i < 256; i++) utf_table[i].u.str = stracpy(strings[i]);
202 		return utf_table;
203 	}
204 	for (i = 128; i < 256; i++) utf_table[i].u.str = NULL;
205 	for (i = 0; codepages[from].table[i].c; i++) {
206 		int u = codepages[from].table[i].u;
207 		if (!utf_table[codepages[from].table[i].c].u.str)
208 			utf_table[codepages[from].table[i].c].u.str = stracpy(encode_utf_8(u));
209 	}
210 	for (i = 128; i < 256; i++)
211 		if (!utf_table[i].u.str) utf_table[i].u.str = stracpy(no_str);
212 	return utf_table;
213 }
214 
215 struct conv_table table[256];
216 static int first = 1;
217 
free_conv_table()218 void free_conv_table()
219 {
220 	if (!utf_table_init) free_utf_table();
221 	if (first) memset(table, 0, sizeof(struct conv_table) * 256), first = 0;
222 	new_translation_table(table);
223 }
224 
get_translation_table(int from,int to)225 struct conv_table *get_translation_table(int from, int to)
226 {
227 	int i;
228 	static int lfr = -1;
229 	static int lto = -1;
230 	if (first) memset(table, 0, sizeof(struct conv_table) * 256), first = 0;
231 	if (/*from == to ||*/ from == -1 || to == -1) return NULL;
232 	if (codepages[to].table == table_utf_8) return get_translation_table_to_utf_8(from);
233 	if (from == lfr && to == lto) return table;
234 	lfr = from; lto = to;
235 	new_translation_table(table);
236 	if (codepages[from].table == table_utf_8) {
237 		int j;
238 		for (j = 0; codepages[to].table[j].c; j++) add_utf_8(table, codepages[to].table[j].u, codepages[to].table[j].u == 0xa0 ? strings[1] : codepages[to].table[j].u == 0xad ? strings[0] : strings[codepages[to].table[j].c]);
239 		for (i = 0; unicode_7b[i].x != -1; i++) if (unicode_7b[i].x >= 0x80) add_utf_8(table, unicode_7b[i].x, unicode_7b[i].s);
240 	} else for (i = 128; i < 256; i++) {
241 		int j;
242 		char *u;
243 		for (j = 0; codepages[from].table[j].c; j++) {
244 			if (codepages[from].table[j].c == i) goto f;
245 		}
246 		continue;
247 		f:
248 		u = u2cp(codepages[from].table[j].u, to, 1);
249 		if (u) table[i].u.str = u;
250 	}
251 	return table;
252 }
253 
/*
 * Compare the NUL-terminated string `s1` with the first `l2` bytes of
 * `s2`.
 *
 * Returns 1 if s1 sorts after s2, -1 if s1 sorts before s2 or ends
 * early, and 0 only if s1 is exactly l2 bytes long and all of them
 * match.
 */
static inline int xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
{
	for (; l2; s1++, s2++, l2--) {
		if (*s1 > *s2)
			return 1;
		if (*s1 < *s2 || !*s1)
			return -1;
	}
	/* All l2 bytes matched: equal only if s1 ends here as well. */
	return *s1 ? 1 : 0;
}
263 
/*
 * Parse the numeric part of a character reference: `l` bytes at `st`
 * (not NUL-terminated) holding either decimal digits or 'x'/'X' followed
 * by hexadecimal digits.
 *
 * Returns the value, or -1 if the input is empty, contains any other
 * character, or the value reaches 0x10000.
 */
int get_entity_number(unsigned char *st, int l)
{
	int base = 10;
	int n = 0;

	/* Check the length before looking at st[0]. */
	if (l <= 0) return -1;

	if (upcase(st[0]) == 'X') {
		base = 16;
		st++, l--;
		if (!l) return -1;
	}

	for (; l; l--) {
		unsigned char c = *(st++);
		int digit;

		if (base == 16) c = upcase(c);
		if (c >= '0' && c <= '9') digit = c - '0';
		else if (base == 16 && c >= 'A' && c <= 'F') digit = c - 'A' + 10;
		else return -1;

		n = n * base + digit;
		/* Checked every step, so n stays far below the overflow range. */
		if (n >= 0x10000) return -1;
	}
	return n;
}
288 
get_entity_string(unsigned char * st,int l,int encoding)289 unsigned char *get_entity_string(unsigned char *st, int l, int encoding)
290 {
291 	int n;
292 	if (l <= 0) return NULL;
293 	if (st[0] == '#') {
294 		if (l == 1) return NULL;
295 		if ((n = get_entity_number(st + 1, l - 1)) == -1) return NULL;
296 		if (n < 32 && get_attr_val_nl != 2) n = 32;
297 	} else {
298 		int s = 0, e = N_ENTITIES - 1;
299 		while (s <= e) {
300 			int c;
301 			int m = (s + e) / 2;
302 			c = xxstrcmp(entities[m].s, st, l);
303 			if (!c) {
304 				n = entities[m].c;
305 				goto f;
306 			}
307 			if (c > 0) e = m - 1;
308 			else s = m + 1;
309 		}
310 		return NULL;
311 		f:;
312 	}
313 
314 	return u2cp(n, encoding, 1);
315 }
316 
convert_string(struct conv_table * ct,unsigned char * c,int l)317 unsigned char *convert_string(struct conv_table *ct, unsigned char *c, int l)
318 {
319 	unsigned char *buffer;
320 	int bp = 0;
321 	int pp = 0;
322 	if (!ct) {
323 		int i;
324 		for (i = 0; i < l; i++) if (c[i] == '&') goto xx;
325 		return memacpy(c, l);
326 		xx:;
327 	}
328 	buffer = mem_alloc(ALLOC_GR);
329 	while (pp < l) {
330 		unsigned char *e;
331 		if (c[pp] < 128 && c[pp] != '&') {
332 			put_c:
333 			buffer[bp++] = c[pp++];
334 			if (!(bp & (ALLOC_GR - 1))) {
335 				if ((unsigned)bp > MAXINT - ALLOC_GR) overalloc();
336 				buffer = mem_realloc(buffer, bp + ALLOC_GR);
337 			}
338 			continue;
339 		}
340 		if (c[pp] != '&') {
341 			struct conv_table *t;
342 			int i;
343 			if (!ct) goto put_c;
344 			t = ct;
345 			i = pp;
346 			decode:
347 			if (!t[c[i]].t) {
348 				e = t[c[i]].u.str;
349 			} else {
350 				t = t[c[i++]].u.tbl;
351 				if (i >= l) goto put_c;
352 				goto decode;
353 			}
354 			pp = i + 1;
355 		} else {
356 			int i = pp + 1;
357 			if (d_opt->plain) goto put_c;
358 			while (i < l && c[i] != ';' && c[i] != '&' && c[i] > ' ') i++;
359 			if (!(e = get_entity_string(&c[pp + 1], i - pp - 1, d_opt->cp))) goto put_c;
360 			pp = i + (i < l && c[i] == ';');
361 		}
362 		if (!e[0]) continue;
363 		if (!e[1]) {
364 			buffer[bp++] = e[0];
365 			if (!(bp & (ALLOC_GR - 1))) {
366 				if ((unsigned)bp > MAXINT - ALLOC_GR) overalloc();
367 				buffer = mem_realloc(buffer, bp + ALLOC_GR);
368 			}
369 			continue;
370 		}
371 		while (*e) {
372 			buffer[bp++] = *(e++);
373 			if (!(bp & (ALLOC_GR - 1))) {
374 				if ((unsigned)bp > MAXINT - ALLOC_GR) overalloc();
375 				buffer = mem_realloc(buffer, bp + ALLOC_GR);
376 			}
377 		}
378 	}
379 	buffer[bp] = 0;
380 	return buffer;
381 }
382 
get_cp_index(unsigned char * n)383 int get_cp_index(unsigned char *n)
384 {
385 	int i, a, p, q;
386 	int ii = -1, ll = 0;
387 	for (i = 0; codepages[i].name; i++) {
388 		for (a = 0; codepages[i].aliases[a]; a++) {
389 			for (p = 0; n[p]; p++) {
390 				if (upcase(n[p]) == upcase(codepages[i].aliases[a][0])) {
391 					for (q = 1; codepages[i].aliases[a][q]; q++) {
392 						if (upcase(n[p+q]) != upcase(codepages[i].aliases[a][q])) goto fail;
393 					}
394 					if (strlen(codepages[i].aliases[a]) > (size_t)ll) {
395 						ll = strlen(codepages[i].aliases[a]);
396 						ii = i;
397 					}
398 				}
399 				fail:;
400 			}
401 		}
402 	}
403 	return ii;
404 }
405 
get_cp_name(int index)406 unsigned char *get_cp_name(int index)
407 {
408 	if (index < 0) return "none";
409 	return codepages[index].name;
410 }
411 
get_cp_mime_name(int index)412 unsigned char *get_cp_mime_name(int index)
413 {
414 	if (index < 0) return "none";
415 	if (!codepages[index].aliases) return NULL;
416 	return codepages[index].aliases[0];
417 }
418 
is_cp_special(int index)419 int is_cp_special(int index)
420 {
421 	return codepages[index].table == table_utf_8;
422 }
423 
charset_upcase(unsigned char ch,int cp)424 unsigned char charset_upcase(unsigned char ch, int cp)
425 {
426 	int u, res;
427 	unsigned char *str;
428 	if (ch < 0x80) return upcase(ch);
429 	u = cp2u(ch, cp);
430 	BIN_SEARCH(unicode_upcase, lo, sizeof(unicode_upcase) / sizeof(*unicode_upcase), u, res);
431 	if (res == -1) return ch;
432 	str = u2cp(unicode_upcase[res].up, cp, 0);
433 	if (!str || !str[0] || str[1]) return ch;
434 	return str[0];
435 }
436 
/*
 * Upper-case the NUL-terminated string *chp in place, byte by byte, with
 * charset_upcase() for codepage `cp`.  The pointer stored in *chp is not
 * changed.
 */
void charset_upcase_string(unsigned char **chp, int cp)
{
	unsigned char *s;

	for (s = *chp; *s; s++)
		*s = charset_upcase(*s, cp);
}
443 
444