1 /* charsets.c
2  * (c) 2002 Mikulas Patocka, Karel 'Clock' Kulhavy
3  * This file is a part of the Links program, released under GPL.
4  */
5 
6 #include "links.h"
7 
/* Index of the "UTF-8" entry in codepages[]; assigned by init_charset(). */
int utf8_table;
9 
/*
 * One mapping between a code-page byte and a Unicode code point.
 * The code in this file walks arrays of these entries until it reaches
 * an entry whose c field is 0.
 */
struct table_entry {
	unsigned char c;	/* byte value in the code page */
	int u;			/* Unicode code point that byte stands for */
};
14 
/*
 * Description of one code page.  The codepages[] array is walked until
 * an entry whose name is NULL.
 */
struct codepage_desc {
	const char *name;			/* name returned by get_cp_name() */
	const char * const *aliases;		/* NULL-terminated alias list; aliases[0] is returned by get_cp_mime_name() */
	const struct table_entry *table;	/* byte <-> Unicode mapping, ends at an entry with c == 0 */
};
20 
21 #include "codepage.inc"
22 #include "uni_7b.inc"
23 #include "entity.inc"
24 #include "upcase.inc"
25 #include "locase.inc"
26 
/*
 * Table of one-byte, NUL-terminated strings indexed by byte value; it is
 * the source of every single-byte result string handed out by this file.
 * NOTE(review): entries 0x17 and 0x1f contain "\033" rather than their own
 * byte value - presumably intentional; confirm before "fixing".
 */
static_const unsigned char strings[256][2] = {
	"\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
	"\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
	"\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
	"\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
	"\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
	"\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
	"\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
	"\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
	"\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
	"\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
	"\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
	"\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
	"\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
	"\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
	"\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
	"\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
	"\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
	"\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
	"\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
	"\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
	"\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
	"\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
	"\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
	"\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
	"\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
	"\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
	"\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
	"\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
	"\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
	"\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
	"\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
	"\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
61 
free_translation_table(struct conv_table * p)62 static void free_translation_table(struct conv_table *p)
63 {
64 	int i;
65 	for (i = 0; i < 256; i++) if (p[i].t) free_translation_table(p[i].u.tbl);
66 	mem_free(p);
67 }
68 
69 static_const unsigned char no_str[] = "*";
70 
new_translation_table(struct conv_table * p)71 static void new_translation_table(struct conv_table *p)
72 {
73 	int i;
74 	for (i = 0; i < 256; i++) if (p[i].t) free_translation_table(p[i].u.tbl);
75 	for (i = 0; i < 128; i++) p[i].t = 0, p[i].u.str = cast_uchar strings[i];
76 	for (; i < 256; i++) p[i].t = 0, p[i].u.str = cast_uchar no_str;
77 }
78 
/*
 * Replacement code points for U+0080..U+009F, indexed by (u - 0x80) in
 * u2cp().  A zero entry means there is no replacement and u2cp() returns
 * NULL for that code point.
 * NOTE(review): the non-zero values look like the Windows-1252 reading of
 * these bytes, partly replaced by ASCII look-alikes - confirm.
 */
static_const unsigned short strange_chars[32] = {
	0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
	0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
	0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
};
85 
/*
 * Comparison predicates handed to BIN_SEARCH() when unicode_7b[] is
 * searched by code point: a is an index into the table, b the key.
 */
#define U_EQUAL(a, b) unicode_7b[a].x == (b)
#define U_ABOVE(a, b) unicode_7b[a].x > (b)
88 
/* Return 1 when u is one of the code points U+00A0 or U+202F, else 0. */
static int is_nbsp(int u)
{
	switch (u) {
	case 0xa0:
	case 0x202f:
		return 1;
	default:
		return 0;
	}
}
93 
u2cp(int u,int to,int fallback)94 unsigned char *u2cp(int u, int to, int fallback)
95 {
96 	int j, s;
97 	again:
98 	if (u < 0) return cast_uchar "";
99 	if (u < 128) return cast_uchar strings[u];
100 	if (is_nbsp(u)) return cast_uchar strings[1];
101 	if (u == 0xad) return cast_uchar strings[0];
102 	if (to == utf8_table) return encode_utf_8(u);
103 	if (u < 0xa0) {
104 		u = strange_chars[u - 0x80];
105 		if (!u) return NULL;
106 		goto again;
107 	}
108 	for (j = 0; codepages[to].table[j].c; j++)
109 		if (codepages[to].table[j].u == u)
110 			return cast_uchar strings[codepages[to].table[j].c];
111 	if (!fallback) return NULL;
112 	BIN_SEARCH(N_UNICODE_7B, U_EQUAL, U_ABOVE, u, s);
113 	if (s != -1) return cast_uchar unicode_7b[s].s;
114 	return NULL;
115 }
116 
cp2u(unsigned ch,int from)117 int cp2u(unsigned ch, int from)
118 {
119 	const struct table_entry *e;
120 	if (from == utf8_table) return ch;
121 	if (from < 0 || ch < 0x80) return ch;
122 	for (e = codepages[from].table; e->c; e++) if (e->c == ch) return e->u;
123 	return -1;
124 }
125 
/* Result buffer of encode_utf_8(): up to six bytes plus the terminator. */
static unsigned char utf_buffer[7];

/*
 * Encode code point u as a NUL-terminated UTF-8 byte sequence.
 *
 * Values up to 0x7fffffff are accepted, so sequences of up to six bytes
 * can be produced; a negative u (and u == 0) gives an empty string.
 * The returned pointer refers to a static buffer that is overwritten by
 * the next call.
 */
unsigned char *encode_utf_8(int u)
{
	int tail;	/* number of continuation bytes */
	int lead_mark;	/* fixed high bits of the first byte */
	int lead_mask;	/* payload bits the first byte carries */
	int pos;

	memset(utf_buffer, 0, sizeof utf_buffer);

	if (u < 0) {
		return utf_buffer;
	}
	if (u < 0x80) {
		utf_buffer[0] = (unsigned char)u;
		return utf_buffer;
	}

	if (u < 0x800) {
		tail = 1;
		lead_mark = 0xc0;
		lead_mask = 0x1f;
	} else if (u < 0x10000) {
		tail = 2;
		lead_mark = 0xe0;
		lead_mask = 0x0f;
	} else if (u < 0x200000) {
		tail = 3;
		lead_mark = 0xf0;
		lead_mask = 0x07;
	} else if (u < 0x4000000) {
		tail = 4;
		lead_mark = 0xf8;
		lead_mask = 0x03;
	} else {
		tail = 5;
		lead_mark = 0xfc;
		lead_mask = 0x01;
	}

	/* first byte: length marker plus the most significant payload bits */
	utf_buffer[0] = (unsigned char)(lead_mark | ((u >> (6 * tail)) & lead_mask));
	/* continuation bytes: six payload bits each, most significant first */
	for (pos = 1; pos <= tail; pos++) {
		utf_buffer[pos] = (unsigned char)(0x80 | ((u >> (6 * (tail - pos))) & 0x3f));
	}
	return utf_buffer;
}
159 
add_utf_8(struct conv_table * ct,int u,unsigned char * str)160 static void add_utf_8(struct conv_table *ct, int u, unsigned char *str)
161 {
162 	unsigned char *p = encode_utf_8(u);
163 	while (p[1]) {
164 		if (ct[*p].t) ct = ct[*p].u.tbl;
165 		else {
166 			struct conv_table *nct;
167 			if (ct[*p].u.str != no_str) {
168 				internal_error("bad utf encoding #1");
169 				return;
170 			}
171 			nct = mem_alloc(sizeof(struct conv_table) * 256);
172 			memset(nct, 0, sizeof(struct conv_table) * 256);
173 			new_translation_table(nct);
174 			ct[*p].t = 1;
175 			ct[*p].u.tbl = nct;
176 			ct = nct;
177 		}
178 		p++;
179 	}
180 	if (ct[*p].t) {
181 		internal_error("bad utf encoding #2");
182 		return;
183 	}
184 	if (ct[*p].u.str == no_str) ct[*p].u.str = str;
185 }
186 
187 static struct conv_table utf_table[256];
188 static int utf_table_init = 1;
189 
free_utf_table(void)190 static void free_utf_table(void)
191 {
192 	int i;
193 	for (i = 128; i < 256; i++) mem_free(utf_table[i].u.str);
194 }
195 
get_translation_table_to_utf_8(int from)196 static struct conv_table *get_translation_table_to_utf_8(int from)
197 {
198 	int i;
199 	static int lfr = -1;
200 	if (from == -1) return NULL;
201 	if (from == lfr) return utf_table;
202 	lfr = from;
203 	if (utf_table_init) {
204 		memset(utf_table, 0, sizeof(struct conv_table) * 256);
205 		for (i = 0; i < 128; i++) utf_table[i].u.str = cast_uchar strings[i];
206 		utf_table_init = 0;
207 	} else {
208 		free_utf_table();
209 	}
210 	if (from == utf8_table) {
211 		for (i = 128; i < 256; i++) utf_table[i].u.str = stracpy(strings[i]);
212 		return utf_table;
213 	}
214 	for (i = 128; i < 256; i++) utf_table[i].u.str = NULL;
215 	for (i = 0; codepages[from].table[i].c; i++) {
216 		int u = codepages[from].table[i].u;
217 		if (!utf_table[codepages[from].table[i].c].u.str)
218 			utf_table[codepages[from].table[i].c].u.str = stracpy(encode_utf_8(u));
219 	}
220 	for (i = 128; i < 256; i++)
221 		if (!utf_table[i].u.str) utf_table[i].u.str = stracpy(no_str);
222 	return utf_table;
223 }
224 
/*
 * Classification of the first byte of a UTF-8 sequence, used by
 * get_utf_8().  For a class value l, get_utf_8() keeps the low l bits of
 * the first byte and then reads 6 - l continuation bytes (none for the
 * classes 6 and 7).  Class 7 is used for 0x01..0x7f; class 6 for 0x00,
 * 0x80..0xc1, 0xfe and 0xff.
 */
unsigned char utf_8_1[256] = {
	6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
	6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
	5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 6, 6,
};
243 
244 static_const unsigned min_utf_8[8] = {
245 	0, 0x4000000, 0x200000, 0x10000, 0x800, 0x80, 0x100, 0x1,
246 };
247 
get_utf_8(unsigned char ** s)248 unsigned get_utf_8(unsigned char **s)
249 {
250 	unsigned v, min;
251 	int l;
252 	unsigned char *p = *s;
253 	l = utf_8_1[p[0]];
254 	min = min_utf_8[l];
255 	v = p[0] & ((1 << l) - 1);
256 	(*s)++;
257 	while (l++ <= 5) {
258 		unsigned c = **s - 0x80;
259 		if (c >= 0x40) {
260 			return 0;
261 		}
262 		(*s)++;
263 		v = (v << 6) + c;
264 	}
265 	if (v < min)
266 		return 0;
267 	if (v > 0x10FFFF)
268 		return 0;
269 	return v;
270 }
271 
272 static struct conv_table table[256];
273 static int table_init = 1;
274 
free_conv_table(void)275 void free_conv_table(void)
276 {
277 	if (!utf_table_init) free_utf_table();
278 	if (!table_init) new_translation_table(table);
279 }
280 
get_translation_table(int from,int to)281 struct conv_table *get_translation_table(int from, int to)
282 {
283 	int i;
284 	static int lfr = -1;
285 	static int lto = -1;
286 	if (/*from == to ||*/ from == -1 || to == -1) return NULL;
287 	if (to == utf8_table) return get_translation_table_to_utf_8(from);
288 	if (table_init) memset(table, 0, sizeof(struct conv_table) * 256), table_init = 0;
289 	if (from == lfr && to == lto) return table;
290 	lfr = from; lto = to;
291 	new_translation_table(table);
292 	if (from == utf8_table) {
293 		int j;
294 		for (j = 0; codepages[to].table[j].c; j++) add_utf_8(table, codepages[to].table[j].u, is_nbsp(codepages[to].table[j].u) ? cast_uchar strings[1] : codepages[to].table[j].u == 0xad ? cast_uchar strings[0] : cast_uchar strings[codepages[to].table[j].c]);
295 		for (i = 0; unicode_7b[i].x != -1; i++) if (unicode_7b[i].x >= 0x80) add_utf_8(table, unicode_7b[i].x, cast_uchar unicode_7b[i].s);
296 	} else for (i = 128; i < 256; i++) {
297 		int j;
298 		unsigned char *u;
299 		for (j = 0; codepages[from].table[j].c; j++) {
300 			if (codepages[from].table[j].c == i) goto f;
301 		}
302 		continue;
303 		f:
304 		u = u2cp(codepages[from].table[j].u, to, 1);
305 		if (u) table[i].u.str = u;
306 	}
307 	return table;
308 }
309 
/*
 * Compare the NUL-terminated string s1 with the first l2 bytes of s2.
 * Returns 0 when s1 is exactly those l2 bytes, 1 when s1 sorts after
 * them (including the case that s1 is longer) and -1 when it sorts
 * before them (including the case that s1 is shorter).
 */
static inline int xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
{
	for (; l2; s1++, s2++, l2--) {
		unsigned char a = *s1;
		unsigned char b = *s2;

		if (a > b) {
			return 1;
		}
		if (a == 0 || a < b) {
			return -1;
		}
	}
	return *s1 ? 1 : 0;
}
319 
/*
 * Parse the numeric part of a character reference (the text after "&#").
 *
 * st - first character of the number; a leading 'x' or 'X' selects
 *      hexadecimal, otherwise the number is decimal
 * l  - number of characters to parse
 *
 * Returns the value, or -1 when there are no digits, when a character is
 * not a valid digit, or when the value exceeds 0x10FFFF.
 */
int get_entity_number(unsigned char *st, int l)
{
	int value = 0;
	int hex = upcase(st[0]) == 'X';

	if (hex) {
		st++;
		l--;
	}
	if (!l) {
		return -1;
	}
	do {
		unsigned char c = *st++;
		int digit;

		if (hex) {
			c = upcase(c);
		}
		if (c >= '0' && c <= '9') {
			digit = c - '0';
		} else if (hex && c >= 'A' && c <= 'F') {
			digit = c - 'A' + 10;
		} else {
			return -1;
		}
		value = value * (hex ? 16 : 10) + digit;
		/* checked on every digit, so the accumulator cannot overflow */
		if (value > 0x10FFFF) {
			return -1;
		}
	} while (--l);
	return value;
}
344 
get_entity_string(unsigned char * st,int l,int encoding)345 unsigned char *get_entity_string(unsigned char *st, int l, int encoding)
346 {
347 	int n;
348 	if (l <= 0) return NULL;
349 	if (st[0] == '#') {
350 		if (l == 1) return NULL;
351 		if ((n = get_entity_number(st + 1, l - 1)) == -1) return NULL;
352 		if (n < 32 && get_attr_val_nl != 2) n = 32;
353 	} else {
354 		int s = 0, e = N_ENTITIES - 1;
355 		while (s <= e) {
356 			int c;
357 			int m = (s + e) / 2;
358 			c = xxstrcmp(cast_uchar entities[m].s, st, l);
359 			if (!c) {
360 				n = entities[m].c;
361 				goto f;
362 			}
363 			if (c > 0) e = m - 1;
364 			else s = m + 1;
365 		}
366 		return NULL;
367 		f:;
368 	}
369 
370 	return u2cp(n, encoding, 1);
371 }
372 
convert_string(struct conv_table * ct,unsigned char * c,int l,struct document_options * dopt)373 unsigned char *convert_string(struct conv_table *ct, unsigned char *c, int l, struct document_options *dopt)
374 {
375 	unsigned char *buffer;
376 	int bp = 0;
377 	int pp = 0;
378 	if (!ct) {
379 		int i;
380 		for (i = 0; i < l; i++) if (c[i] == '&') goto xx;
381 		return memacpy(c, l);
382 		xx:;
383 	}
384 	buffer = mem_alloc(ALLOC_GR);
385 	while (pp < l) {
386 		unsigned char *e = NULL;	/* against warning */
387 		if (c[pp] < 128 && c[pp] != '&') {
388 			put_c:
389 			buffer[bp++] = c[pp++];
390 			if (!(bp & (ALLOC_GR - 1))) {
391 				if ((unsigned)bp > MAXINT - ALLOC_GR) overalloc();
392 				buffer = mem_realloc(buffer, bp + ALLOC_GR);
393 			}
394 			continue;
395 		}
396 		if (c[pp] != '&') {
397 			struct conv_table *t;
398 			int i;
399 			if (!ct) goto put_c;
400 			t = ct;
401 			i = pp;
402 			decode:
403 			if (!t[c[i]].t) {
404 				e = t[c[i]].u.str;
405 			} else {
406 				t = t[c[i++]].u.tbl;
407 				if (i >= l) goto put_c;
408 				goto decode;
409 			}
410 			pp = i + 1;
411 		} else {
412 			int i = pp + 1;
413 			if (!dopt || dopt->plain) goto put_c;
414 			while (i < l && !is_entity_terminator(c[i])) i++;
415 			if (!(e = get_entity_string(&c[pp + 1], i - pp - 1, dopt->cp))) goto put_c;
416 			pp = i + (i < l && c[i] == ';');
417 		}
418 		if (!e[0]) continue;
419 		if (!e[1]) {
420 			buffer[bp++] = e[0];
421 			if (!(bp & (ALLOC_GR - 1))) {
422 				if ((unsigned)bp > MAXINT - ALLOC_GR) overalloc();
423 				buffer = mem_realloc(buffer, bp + ALLOC_GR);
424 			}
425 			continue;
426 		}
427 		while (*e) {
428 			buffer[bp++] = *(e++);
429 			if (!(bp & (ALLOC_GR - 1))) {
430 				if ((unsigned)bp > MAXINT - ALLOC_GR) overalloc();
431 				buffer = mem_realloc(buffer, bp + ALLOC_GR);
432 			}
433 		}
434 	}
435 	buffer[bp] = 0;
436 	return buffer;
437 }
438 
convert(int from,int to,unsigned char * c,struct document_options * dopt)439 unsigned char *convert(int from, int to, unsigned char *c, struct document_options *dopt)
440 {
441 	unsigned char *cc;
442 	struct conv_table *ct;
443 
444 	for (cc = c; *cc; cc++) {
445 		if (*cc >= 128 && from != to)
446 			goto need_table;
447 		if (*cc == '&' && dopt && !dopt->plain)
448 			goto need_table;
449 	}
450 	return stracpy(c);
451 
452 need_table:
453 	ct = get_translation_table(from, to);
454 	return convert_string(ct, c, (int)strlen(cast_const_char c), dopt);
455 }
456 
get_cp_index(unsigned char * n)457 int get_cp_index(unsigned char *n)
458 {
459 	decc_volatile int i, a, p, q, sl, ii = -1, ll = 0;
460 	for (i = 0; codepages[i].name; i++) {
461 		for (a = 0; codepages[i].aliases[a]; a++) {
462 			for (p = 0; n[p]; p++) {
463 				if (upcase(n[p]) == upcase(codepages[i].aliases[a][0])) {
464 					for (q = 1; codepages[i].aliases[a][q]; q++) {
465 						if (upcase(n[p+q]) != upcase(codepages[i].aliases[a][q])) goto fail;
466 					}
467 					sl = (int)strlen(cast_const_char codepages[i].aliases[a]);
468 					if (sl > ll) {
469 						ll = sl;
470 						ii = i;
471 					}
472 				}
473 				fail:;
474 			}
475 		}
476 	}
477 	return ii;
478 }
479 
get_cp_name(int index)480 unsigned char *get_cp_name(int index)
481 {
482 	if (index < 0) return cast_uchar "none";
483 	return cast_uchar codepages[index].name;
484 }
485 
get_cp_mime_name(int index)486 unsigned char *get_cp_mime_name(int index)
487 {
488 	if (index < 0) return cast_uchar "none";
489 	if (!codepages[index].aliases) return NULL;
490 	return cast_uchar codepages[index].aliases[0];
491 }
492 
493 #define LO_EQUAL(a, b) unicode_locase[a].o == (b)
494 #define LO_ABOVE(a, b) unicode_locase[a].o > (b)
495 
uni_locase(unsigned ch)496 unsigned uni_locase(unsigned ch)
497 {
498 	int res;
499 	BIN_SEARCH(array_elements(unicode_locase), LO_EQUAL, LO_ABOVE, ch, res);
500 	if (res == -1) return ch;
501 	return unicode_locase[res].n;
502 }
503 
504 #define UP_EQUAL(a, b) unicode_upcase[a].o == (b)
505 #define UP_ABOVE(a, b) unicode_upcase[a].o > (b)
506 
charset_upcase(unsigned ch,int cp)507 unsigned charset_upcase(unsigned ch, int cp)
508 {
509 	unsigned u;
510 	int res;
511 	unsigned char *str;
512 	if (ch < 0x80) return upcase(ch);
513 	u = cp2u(ch, cp);
514 	BIN_SEARCH(array_elements(unicode_upcase), UP_EQUAL, UP_ABOVE, u, res);
515 	if (res == -1) return ch;
516 	if (cp == utf8_table) return unicode_upcase[res].n;
517 	str = u2cp(unicode_upcase[res].n, cp, 0);
518 	if (!str || !str[0] || str[1]) return ch;
519 	return str[0];
520 }
521 
uni_upcase(unsigned ch)522 unsigned uni_upcase(unsigned ch)
523 {
524 	return charset_upcase(ch, utf8_table);
525 }
526 
charset_upcase_string(unsigned char ** chp,int cp)527 void charset_upcase_string(unsigned char **chp, int cp)
528 {
529 	unsigned char *ch = *chp;
530 	int i;
531 	if (cp == utf8_table) {
532 		ch = unicode_upcase_string(ch);
533 		mem_free(*chp);
534 		*chp = ch;
535 	} else {
536 		for (i = 0; ch[i]; i++) ch[i] = charset_upcase(ch[i], cp);
537 	}
538 }
539 
unicode_upcase_string(unsigned char * ch)540 unsigned char *unicode_upcase_string(unsigned char *ch)
541 {
542 	unsigned char *r = init_str();
543 	int rl = 0;
544 	while (1) {
545 		unsigned c;
546 		int res;
547 		GET_UTF_8(ch, c);
548 		if (!c) break;
549 		BIN_SEARCH(array_elements(unicode_upcase), UP_EQUAL, UP_ABOVE, c, res);
550 		if (res != -1) c = unicode_upcase[res].n;
551 		add_to_str(&r, &rl, encode_utf_8(c));
552 	}
553 	return r;
554 }
555 
to_utf8_upcase(unsigned char * str,int cp)556 unsigned char *to_utf8_upcase(unsigned char *str, int cp)
557 {
558 	unsigned char *str1, *str2;
559 	str1 = convert(cp, utf8_table, str, NULL);
560 	str2 = unicode_upcase_string(str1);
561 	mem_free(str1);
562 	return str2;
563 }
564 
compare_case_utf8(unsigned char * u1,unsigned char * u2)565 int compare_case_utf8(unsigned char *u1, unsigned char *u2)
566 {
567 	unsigned char *uu1 = u1;
568 	unsigned c1, c2;
569 	int cc1;
570 	while (1) {
571 		GET_UTF_8(u2, c2);
572 		if (!c2) return (int)(u1 - uu1);
573 		skip_discr:
574 		GET_UTF_8(u1, c1);
575 		BIN_SEARCH(array_elements(unicode_upcase), UP_EQUAL, UP_ABOVE, c1, cc1);
576 		if (cc1 != -1) c1 = unicode_upcase[cc1].n;
577 		if (c1 == 0xad) goto skip_discr;
578 		if (c1 != c2) return 0;
579 		if (c1 == ' ') {
580 			unsigned char *x1;
581 			do {
582 				x1 = u1;
583 				GET_UTF_8(u1, c1);
584 				BIN_SEARCH(array_elements(unicode_upcase), UP_EQUAL, UP_ABOVE, c1, cc1);
585 				if (cc1 >= 0) c1 = unicode_upcase[cc1].n;
586 			} while (c1 == ' ');
587 			u1 = x1;
588 		}
589 	}
590 }
591 
/*
 * Count the code points of the UTF-8 string s.  Counting stops at the
 * first code point that decodes to 0.
 */
int strlen_utf8(unsigned char *s)
{
	int count;
	unsigned cpt;

	for (count = 0; ; count++) {
		GET_UTF_8(s, cpt);
		if (!cpt) {
			break;
		}
	}
	return count;
}
602 
cp_len(int cp,unsigned char * s)603 int cp_len(int cp, unsigned char *s)
604 {
605 	if (cp == utf8_table) return strlen_utf8(s);
606 	return (int)strlen(cast_const_char s);
607 }
608 
cp_strchr(int charset,unsigned char * str,unsigned chr)609 unsigned char *cp_strchr(int charset, unsigned char *str, unsigned chr)
610 {
611 	if (charset != utf8_table) {
612 		if (chr >= 0x100)
613 			return NULL;
614 		return cast_uchar strchr(cast_const_char str, chr);
615 	}
616 	while (1) {
617 		unsigned char *o_str = str;
618 		unsigned c;
619 		GET_UTF_8(str, c);
620 		if (!c) return NULL;
621 		if (c == chr) return o_str;
622 	}
623 }
624 
init_charset(void)625 void init_charset(void)
626 {
627 	utf8_table = get_cp_index(cast_uchar "UTF-8");
628 	if (utf8_table == -1) internal_error("no UTF-8 charset");
629 	bookmarks_codepage = utf8_table;
630 }
631