1 /* charsets.c
2 * (c) 2002 Mikulas Patocka, Karel 'Clock' Kulhavy
3 * This file is a part of the Links program, released under GPL.
4 */
5
6 #include "links.h"
7
/* Index of the UTF-8 entry in codepages[]; assigned by init_charset(). */
8 int utf8_table;
9
/*
 * One row of a codepage mapping table.  The tables are scanned linearly
 * by the code in this file until a row with c == 0 is reached.
 */
10 struct table_entry {
11 unsigned char c; /* byte value in the codepage; 0 terminates the table */
12 int u; /* Unicode value that byte corresponds to */
13 };
14
/* Description of one supported codepage (an element of codepages[]). */
15 struct codepage_desc {
16 const char *name; /* returned by get_cp_name(); a NULL name ends the codepages[] scan */
17 const char * const *aliases; /* NULL-terminated list; get_cp_mime_name() returns aliases[0] */
18 const struct table_entry *table; /* byte <-> Unicode rows, ended by a row with c == 0 */
19 };
20
21 #include "codepage.inc"
22 #include "uni_7b.inc"
23 #include "entity.inc"
24 #include "upcase.inc"
25 #include "locase.inc"
26
/*
 * Table of one-character, NUL-terminated strings indexed by byte value.
 * It lets the conversion code hand out a string pointer for a single byte
 * without allocating.
 *
 * NOTE(review): the entries at index 0x17 and 0x1f hold "\033" rather than
 * their own value ("\027" / "\037") - presumably deliberate; confirm before
 * changing.
 */
27 static_const unsigned char strings[256][2] = {
28 "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
29 "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
30 "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
31 "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
32 "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
33 "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
34 "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
35 "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
36 "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
37 "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
38 "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
39 "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
40 "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
41 "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
42 "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
43 "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
44 "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
45 "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
46 "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
47 "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
48 "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
49 "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
50 "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
51 "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
52 "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
53 "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
54 "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
55 "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
56 "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
57 "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
58 "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
59 "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
60 };
61
free_translation_table(struct conv_table * p)62 static void free_translation_table(struct conv_table *p)
63 {
64 int i;
65 for (i = 0; i < 256; i++) if (p[i].t) free_translation_table(p[i].u.tbl);
66 mem_free(p);
67 }
68
69 static_const unsigned char no_str[] = "*";
70
new_translation_table(struct conv_table * p)71 static void new_translation_table(struct conv_table *p)
72 {
73 int i;
74 for (i = 0; i < 256; i++) if (p[i].t) free_translation_table(p[i].u.tbl);
75 for (i = 0; i < 128; i++) p[i].t = 0, p[i].u.str = cast_uchar strings[i];
76 for (; i < 256; i++) p[i].t = 0, p[i].u.str = cast_uchar no_str;
77 }
78
/*
 * Substitutes for code points 0x80..0x9f, indexed by (u - 0x80) in u2cp().
 * A zero entry means there is no substitute and u2cp() returns NULL.
 * NOTE(review): the values look like the Windows-1252 assignments for this
 * range, with some entries replaced by ASCII look-alikes - confirm.
 */
79 static_const unsigned short strange_chars[32] = {
80 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
81 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
82 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
83 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
84 };
85
/*
 * Comparison predicates handed to BIN_SEARCH for the unicode_7b[] table:
 * a is an index into the table, b is the key being searched for.
 * The whole expansion is parenthesised so that the macros keep their
 * meaning when used inside a larger expression (e.g. after '!' or '&&').
 */
#define U_EQUAL(a, b) (unicode_7b[a].x == (b))
#define U_ABOVE(a, b) (unicode_7b[a].x > (b))
88
/*
 * Return 1 when u is one of the two code points treated as a
 * non-breaking space here (0xa0 and 0x202f), otherwise 0.
 */
static int is_nbsp(int u)
{
	switch (u) {
	case 0xa0:
	case 0x202f:
		return 1;
	default:
		return 0;
	}
}
93
u2cp(int u,int to,int fallback)94 unsigned char *u2cp(int u, int to, int fallback)
95 {
96 int j, s;
97 again:
98 if (u < 0) return cast_uchar "";
99 if (u < 128) return cast_uchar strings[u];
100 if (is_nbsp(u)) return cast_uchar strings[1];
101 if (u == 0xad) return cast_uchar strings[0];
102 if (to == utf8_table) return encode_utf_8(u);
103 if (u < 0xa0) {
104 u = strange_chars[u - 0x80];
105 if (!u) return NULL;
106 goto again;
107 }
108 for (j = 0; codepages[to].table[j].c; j++)
109 if (codepages[to].table[j].u == u)
110 return cast_uchar strings[codepages[to].table[j].c];
111 if (!fallback) return NULL;
112 BIN_SEARCH(N_UNICODE_7B, U_EQUAL, U_ABOVE, u, s);
113 if (s != -1) return cast_uchar unicode_7b[s].s;
114 return NULL;
115 }
116
cp2u(unsigned ch,int from)117 int cp2u(unsigned ch, int from)
118 {
119 const struct table_entry *e;
120 if (from == utf8_table) return ch;
121 if (from < 0 || ch < 0x80) return ch;
122 for (e = codepages[from].table; e->c; e++) if (e->c == ch) return e->u;
123 return -1;
124 }
125
/* Result storage for encode_utf_8(): up to six bytes plus the terminator. */
static unsigned char utf_buffer[7];

/*
 * Encode u as a NUL-terminated UTF-8 byte sequence.
 *
 * The result lives in a single static buffer that is overwritten by the
 * next call, so callers must copy it if they need to keep it.  A negative
 * u (and u == 0) yields the empty string.  Values of 0x200000 and above
 * are written using five- and six-byte sequences.
 */
unsigned char *encode_utf_8(int u)
{
	unsigned char *out = utf_buffer;
	int tail;	/* number of continuation bytes after the lead byte */
	int shift;

	memset(utf_buffer, 0, sizeof utf_buffer);
	if (u < 0) return utf_buffer;
	if (u < 0x80) {
		utf_buffer[0] = (unsigned char)u;
		return utf_buffer;
	}

	/* Pick the lead byte and remember how many 6-bit groups follow. */
	if (u < 0x800) {
		*out++ = 0xc0 | ((u >> 6) & 0x1f);
		tail = 1;
	} else if (u < 0x10000) {
		*out++ = 0xe0 | ((u >> 12) & 0x0f);
		tail = 2;
	} else if (u < 0x200000) {
		*out++ = 0xf0 | ((u >> 18) & 0x0f);
		tail = 3;
	} else if (u < 0x4000000) {
		*out++ = 0xf8 | ((u >> 24) & 0x0f);
		tail = 4;
	} else {
		*out++ = 0xfc | ((u >> 30) & 0x01);
		tail = 5;
	}

	/* Emit the continuation bytes, most significant group first. */
	for (shift = 6 * (tail - 1); shift >= 0; shift -= 6)
		*out++ = 0x80 | ((u >> shift) & 0x3f);

	return utf_buffer;
}
159
add_utf_8(struct conv_table * ct,int u,unsigned char * str)160 static void add_utf_8(struct conv_table *ct, int u, unsigned char *str)
161 {
162 unsigned char *p = encode_utf_8(u);
163 while (p[1]) {
164 if (ct[*p].t) ct = ct[*p].u.tbl;
165 else {
166 struct conv_table *nct;
167 if (ct[*p].u.str != no_str) {
168 internal_error("bad utf encoding #1");
169 return;
170 }
171 nct = mem_alloc(sizeof(struct conv_table) * 256);
172 memset(nct, 0, sizeof(struct conv_table) * 256);
173 new_translation_table(nct);
174 ct[*p].t = 1;
175 ct[*p].u.tbl = nct;
176 ct = nct;
177 }
178 p++;
179 }
180 if (ct[*p].t) {
181 internal_error("bad utf encoding #2");
182 return;
183 }
184 if (ct[*p].u.str == no_str) ct[*p].u.str = str;
185 }
186
187 static struct conv_table utf_table[256];
188 static int utf_table_init = 1;
189
free_utf_table(void)190 static void free_utf_table(void)
191 {
192 int i;
193 for (i = 128; i < 256; i++) mem_free(utf_table[i].u.str);
194 }
195
get_translation_table_to_utf_8(int from)196 static struct conv_table *get_translation_table_to_utf_8(int from)
197 {
198 int i;
199 static int lfr = -1;
200 if (from == -1) return NULL;
201 if (from == lfr) return utf_table;
202 lfr = from;
203 if (utf_table_init) {
204 memset(utf_table, 0, sizeof(struct conv_table) * 256);
205 for (i = 0; i < 128; i++) utf_table[i].u.str = cast_uchar strings[i];
206 utf_table_init = 0;
207 } else {
208 free_utf_table();
209 }
210 if (from == utf8_table) {
211 for (i = 128; i < 256; i++) utf_table[i].u.str = stracpy(strings[i]);
212 return utf_table;
213 }
214 for (i = 128; i < 256; i++) utf_table[i].u.str = NULL;
215 for (i = 0; codepages[from].table[i].c; i++) {
216 int u = codepages[from].table[i].u;
217 if (!utf_table[codepages[from].table[i].c].u.str)
218 utf_table[codepages[from].table[i].c].u.str = stracpy(encode_utf_8(u));
219 }
220 for (i = 128; i < 256; i++)
221 if (!utf_table[i].u.str) utf_table[i].u.str = stracpy(no_str);
222 return utf_table;
223 }
224
/*
 * Classification of a UTF-8 lead byte, indexed by the byte value and used
 * by get_utf_8().  For a value l taken from this table, get_utf_8() keeps
 * the low l bits of the lead byte and then reads (6 - l) continuation
 * bytes (none for l = 6 or 7).  The value 7 marks plain ASCII; the value 6
 * is used for bytes that cannot start a sequence (it selects a minimum in
 * min_utf_8[] that can never be reached, so get_utf_8() returns 0).
 */
225 unsigned char utf_8_1[256] = {
226 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
227 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
228 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
229 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
230 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
231 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
232 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
233 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
234 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
235 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
236 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
237 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
238 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
239 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
240 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
241 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 6, 6,
242 };
243
/*
 * Smallest value get_utf_8() accepts for each class taken from utf_8_1[];
 * a decoded value below the minimum makes get_utf_8() return 0.  Index 6
 * holds 0x100, which a 6-bit value can never reach, so that class is
 * always rejected.  Index 0 is not produced by utf_8_1[].
 */
244 static_const unsigned min_utf_8[8] = {
245 0, 0x4000000, 0x200000, 0x10000, 0x800, 0x80, 0x100, 0x1,
246 };
247
get_utf_8(unsigned char ** s)248 unsigned get_utf_8(unsigned char **s)
249 {
250 unsigned v, min;
251 int l;
252 unsigned char *p = *s;
253 l = utf_8_1[p[0]];
254 min = min_utf_8[l];
255 v = p[0] & ((1 << l) - 1);
256 (*s)++;
257 while (l++ <= 5) {
258 unsigned c = **s - 0x80;
259 if (c >= 0x40) {
260 return 0;
261 }
262 (*s)++;
263 v = (v << 6) + c;
264 }
265 if (v < min)
266 return 0;
267 if (v > 0x10FFFF)
268 return 0;
269 return v;
270 }
271
272 static struct conv_table table[256];
273 static int table_init = 1;
274
free_conv_table(void)275 void free_conv_table(void)
276 {
277 if (!utf_table_init) free_utf_table();
278 if (!table_init) new_translation_table(table);
279 }
280
get_translation_table(int from,int to)281 struct conv_table *get_translation_table(int from, int to)
282 {
283 int i;
284 static int lfr = -1;
285 static int lto = -1;
286 if (/*from == to ||*/ from == -1 || to == -1) return NULL;
287 if (to == utf8_table) return get_translation_table_to_utf_8(from);
288 if (table_init) memset(table, 0, sizeof(struct conv_table) * 256), table_init = 0;
289 if (from == lfr && to == lto) return table;
290 lfr = from; lto = to;
291 new_translation_table(table);
292 if (from == utf8_table) {
293 int j;
294 for (j = 0; codepages[to].table[j].c; j++) add_utf_8(table, codepages[to].table[j].u, is_nbsp(codepages[to].table[j].u) ? cast_uchar strings[1] : codepages[to].table[j].u == 0xad ? cast_uchar strings[0] : cast_uchar strings[codepages[to].table[j].c]);
295 for (i = 0; unicode_7b[i].x != -1; i++) if (unicode_7b[i].x >= 0x80) add_utf_8(table, unicode_7b[i].x, cast_uchar unicode_7b[i].s);
296 } else for (i = 128; i < 256; i++) {
297 int j;
298 unsigned char *u;
299 for (j = 0; codepages[from].table[j].c; j++) {
300 if (codepages[from].table[j].c == i) goto f;
301 }
302 continue;
303 f:
304 u = u2cp(codepages[from].table[j].u, to, 1);
305 if (u) table[i].u.str = u;
306 }
307 return table;
308 }
309
/*
 * Compare the NUL-terminated string s1 with the first l2 bytes of s2.
 * Returns 0 when s1 is exactly those l2 bytes, 1 when s1 sorts after them
 * (including the case where s1 is longer), and -1 when s1 sorts before
 * them or ends early.
 */
static inline int xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
{
	for (; l2; l2--, s1++, s2++) {
		if (*s1 > *s2) return 1;
		if (*s1 == 0 || *s1 < *s2) return -1;
	}
	/* All l2 bytes matched; s1 is greater if it still has characters left. */
	return *s1 ? 1 : 0;
}
319
/*
 * Parse the numeric part of a character reference.
 *
 * st points at the l characters that follow the '#'.  A leading 'x' or 'X'
 * selects hexadecimal (digits may be in either case); otherwise the number
 * is decimal.  All l characters must be digits of the selected base.
 *
 * Returns the parsed value, or -1 if l is not positive, no digits are
 * present, a non-digit is found, or the value exceeds 0x10FFFF.
 */
int get_entity_number(unsigned char *st, int l)
{
	int base = 10;
	int n = 0;

	/* Check the length before touching st[0]; a zero or negative length
	 * would otherwise make the digit loop run past the input. */
	if (l <= 0) return -1;

	if (st[0] == 'x' || st[0] == 'X') {
		base = 16;
		st++, l--;
		if (!l) return -1;
	}

	for (; l; l--, st++) {
		unsigned char c = *st;
		int digit;

		if (c >= '0' && c <= '9') digit = c - '0';
		else if (base == 16 && c >= 'a' && c <= 'f') digit = c - 'a' + 10;
		else if (base == 16 && c >= 'A' && c <= 'F') digit = c - 'A' + 10;
		else return -1;

		n = n * base + digit;
		/* Checked after every digit, so n can never overflow an int. */
		if (n > 0x10FFFF) return -1;
	}
	return n;
}
344
get_entity_string(unsigned char * st,int l,int encoding)345 unsigned char *get_entity_string(unsigned char *st, int l, int encoding)
346 {
347 int n;
348 if (l <= 0) return NULL;
349 if (st[0] == '#') {
350 if (l == 1) return NULL;
351 if ((n = get_entity_number(st + 1, l - 1)) == -1) return NULL;
352 if (n < 32 && get_attr_val_nl != 2) n = 32;
353 } else {
354 int s = 0, e = N_ENTITIES - 1;
355 while (s <= e) {
356 int c;
357 int m = (s + e) / 2;
358 c = xxstrcmp(cast_uchar entities[m].s, st, l);
359 if (!c) {
360 n = entities[m].c;
361 goto f;
362 }
363 if (c > 0) e = m - 1;
364 else s = m + 1;
365 }
366 return NULL;
367 f:;
368 }
369
370 return u2cp(n, encoding, 1);
371 }
372
convert_string(struct conv_table * ct,unsigned char * c,int l,struct document_options * dopt)373 unsigned char *convert_string(struct conv_table *ct, unsigned char *c, int l, struct document_options *dopt)
374 {
375 unsigned char *buffer;
376 int bp = 0;
377 int pp = 0;
378 if (!ct) {
379 int i;
380 for (i = 0; i < l; i++) if (c[i] == '&') goto xx;
381 return memacpy(c, l);
382 xx:;
383 }
384 buffer = mem_alloc(ALLOC_GR);
385 while (pp < l) {
386 unsigned char *e = NULL; /* against warning */
387 if (c[pp] < 128 && c[pp] != '&') {
388 put_c:
389 buffer[bp++] = c[pp++];
390 if (!(bp & (ALLOC_GR - 1))) {
391 if ((unsigned)bp > MAXINT - ALLOC_GR) overalloc();
392 buffer = mem_realloc(buffer, bp + ALLOC_GR);
393 }
394 continue;
395 }
396 if (c[pp] != '&') {
397 struct conv_table *t;
398 int i;
399 if (!ct) goto put_c;
400 t = ct;
401 i = pp;
402 decode:
403 if (!t[c[i]].t) {
404 e = t[c[i]].u.str;
405 } else {
406 t = t[c[i++]].u.tbl;
407 if (i >= l) goto put_c;
408 goto decode;
409 }
410 pp = i + 1;
411 } else {
412 int i = pp + 1;
413 if (!dopt || dopt->plain) goto put_c;
414 while (i < l && !is_entity_terminator(c[i])) i++;
415 if (!(e = get_entity_string(&c[pp + 1], i - pp - 1, dopt->cp))) goto put_c;
416 pp = i + (i < l && c[i] == ';');
417 }
418 if (!e[0]) continue;
419 if (!e[1]) {
420 buffer[bp++] = e[0];
421 if (!(bp & (ALLOC_GR - 1))) {
422 if ((unsigned)bp > MAXINT - ALLOC_GR) overalloc();
423 buffer = mem_realloc(buffer, bp + ALLOC_GR);
424 }
425 continue;
426 }
427 while (*e) {
428 buffer[bp++] = *(e++);
429 if (!(bp & (ALLOC_GR - 1))) {
430 if ((unsigned)bp > MAXINT - ALLOC_GR) overalloc();
431 buffer = mem_realloc(buffer, bp + ALLOC_GR);
432 }
433 }
434 }
435 buffer[bp] = 0;
436 return buffer;
437 }
438
convert(int from,int to,unsigned char * c,struct document_options * dopt)439 unsigned char *convert(int from, int to, unsigned char *c, struct document_options *dopt)
440 {
441 unsigned char *cc;
442 struct conv_table *ct;
443
444 for (cc = c; *cc; cc++) {
445 if (*cc >= 128 && from != to)
446 goto need_table;
447 if (*cc == '&' && dopt && !dopt->plain)
448 goto need_table;
449 }
450 return stracpy(c);
451
452 need_table:
453 ct = get_translation_table(from, to);
454 return convert_string(ct, c, (int)strlen(cast_const_char c), dopt);
455 }
456
get_cp_index(unsigned char * n)457 int get_cp_index(unsigned char *n)
458 {
459 decc_volatile int i, a, p, q, sl, ii = -1, ll = 0;
460 for (i = 0; codepages[i].name; i++) {
461 for (a = 0; codepages[i].aliases[a]; a++) {
462 for (p = 0; n[p]; p++) {
463 if (upcase(n[p]) == upcase(codepages[i].aliases[a][0])) {
464 for (q = 1; codepages[i].aliases[a][q]; q++) {
465 if (upcase(n[p+q]) != upcase(codepages[i].aliases[a][q])) goto fail;
466 }
467 sl = (int)strlen(cast_const_char codepages[i].aliases[a]);
468 if (sl > ll) {
469 ll = sl;
470 ii = i;
471 }
472 }
473 fail:;
474 }
475 }
476 }
477 return ii;
478 }
479
get_cp_name(int index)480 unsigned char *get_cp_name(int index)
481 {
482 if (index < 0) return cast_uchar "none";
483 return cast_uchar codepages[index].name;
484 }
485
get_cp_mime_name(int index)486 unsigned char *get_cp_mime_name(int index)
487 {
488 if (index < 0) return cast_uchar "none";
489 if (!codepages[index].aliases) return NULL;
490 return cast_uchar codepages[index].aliases[0];
491 }
492
/*
 * Comparison predicates handed to BIN_SEARCH for the unicode_locase[]
 * table: a is an index into the table, b is the key being searched for.
 * The whole expansion is parenthesised so that the macros keep their
 * meaning when used inside a larger expression (e.g. after '!' or '&&').
 */
#define LO_EQUAL(a, b) (unicode_locase[a].o == (b))
#define LO_ABOVE(a, b) (unicode_locase[a].o > (b))
495
uni_locase(unsigned ch)496 unsigned uni_locase(unsigned ch)
497 {
498 int res;
499 BIN_SEARCH(array_elements(unicode_locase), LO_EQUAL, LO_ABOVE, ch, res);
500 if (res == -1) return ch;
501 return unicode_locase[res].n;
502 }
503
/*
 * Comparison predicates handed to BIN_SEARCH for the unicode_upcase[]
 * table: a is an index into the table, b is the key being searched for.
 * The whole expansion is parenthesised so that the macros keep their
 * meaning when used inside a larger expression (e.g. after '!' or '&&').
 */
#define UP_EQUAL(a, b) (unicode_upcase[a].o == (b))
#define UP_ABOVE(a, b) (unicode_upcase[a].o > (b))
506
charset_upcase(unsigned ch,int cp)507 unsigned charset_upcase(unsigned ch, int cp)
508 {
509 unsigned u;
510 int res;
511 unsigned char *str;
512 if (ch < 0x80) return upcase(ch);
513 u = cp2u(ch, cp);
514 BIN_SEARCH(array_elements(unicode_upcase), UP_EQUAL, UP_ABOVE, u, res);
515 if (res == -1) return ch;
516 if (cp == utf8_table) return unicode_upcase[res].n;
517 str = u2cp(unicode_upcase[res].n, cp, 0);
518 if (!str || !str[0] || str[1]) return ch;
519 return str[0];
520 }
521
uni_upcase(unsigned ch)522 unsigned uni_upcase(unsigned ch)
523 {
524 return charset_upcase(ch, utf8_table);
525 }
526
charset_upcase_string(unsigned char ** chp,int cp)527 void charset_upcase_string(unsigned char **chp, int cp)
528 {
529 unsigned char *ch = *chp;
530 int i;
531 if (cp == utf8_table) {
532 ch = unicode_upcase_string(ch);
533 mem_free(*chp);
534 *chp = ch;
535 } else {
536 for (i = 0; ch[i]; i++) ch[i] = charset_upcase(ch[i], cp);
537 }
538 }
539
unicode_upcase_string(unsigned char * ch)540 unsigned char *unicode_upcase_string(unsigned char *ch)
541 {
542 unsigned char *r = init_str();
543 int rl = 0;
544 while (1) {
545 unsigned c;
546 int res;
547 GET_UTF_8(ch, c);
548 if (!c) break;
549 BIN_SEARCH(array_elements(unicode_upcase), UP_EQUAL, UP_ABOVE, c, res);
550 if (res != -1) c = unicode_upcase[res].n;
551 add_to_str(&r, &rl, encode_utf_8(c));
552 }
553 return r;
554 }
555
to_utf8_upcase(unsigned char * str,int cp)556 unsigned char *to_utf8_upcase(unsigned char *str, int cp)
557 {
558 unsigned char *str1, *str2;
559 str1 = convert(cp, utf8_table, str, NULL);
560 str2 = unicode_upcase_string(str1);
561 mem_free(str1);
562 return str2;
563 }
564
compare_case_utf8(unsigned char * u1,unsigned char * u2)565 int compare_case_utf8(unsigned char *u1, unsigned char *u2)
566 {
567 unsigned char *uu1 = u1;
568 unsigned c1, c2;
569 int cc1;
570 while (1) {
571 GET_UTF_8(u2, c2);
572 if (!c2) return (int)(u1 - uu1);
573 skip_discr:
574 GET_UTF_8(u1, c1);
575 BIN_SEARCH(array_elements(unicode_upcase), UP_EQUAL, UP_ABOVE, c1, cc1);
576 if (cc1 != -1) c1 = unicode_upcase[cc1].n;
577 if (c1 == 0xad) goto skip_discr;
578 if (c1 != c2) return 0;
579 if (c1 == ' ') {
580 unsigned char *x1;
581 do {
582 x1 = u1;
583 GET_UTF_8(u1, c1);
584 BIN_SEARCH(array_elements(unicode_upcase), UP_EQUAL, UP_ABOVE, c1, cc1);
585 if (cc1 >= 0) c1 = unicode_upcase[cc1].n;
586 } while (c1 == ' ');
587 u1 = x1;
588 }
589 }
590 }
591
/*
 * Count the characters of the UTF-8 string s.  Counting stops at the
 * first character that GET_UTF_8 decodes as 0.
 */
int strlen_utf8(unsigned char *s)
{
	int count = 0;
	unsigned c;

	for (;;) {
		GET_UTF_8(s, c);
		if (!c) break;
		count++;
	}
	return count;
}
602
cp_len(int cp,unsigned char * s)603 int cp_len(int cp, unsigned char *s)
604 {
605 if (cp == utf8_table) return strlen_utf8(s);
606 return (int)strlen(cast_const_char s);
607 }
608
cp_strchr(int charset,unsigned char * str,unsigned chr)609 unsigned char *cp_strchr(int charset, unsigned char *str, unsigned chr)
610 {
611 if (charset != utf8_table) {
612 if (chr >= 0x100)
613 return NULL;
614 return cast_uchar strchr(cast_const_char str, chr);
615 }
616 while (1) {
617 unsigned char *o_str = str;
618 unsigned c;
619 GET_UTF_8(str, c);
620 if (!c) return NULL;
621 if (c == chr) return o_str;
622 }
623 }
624
init_charset(void)625 void init_charset(void)
626 {
627 utf8_table = get_cp_index(cast_uchar "UTF-8");
628 if (utf8_table == -1) internal_error("no UTF-8 charset");
629 bookmarks_codepage = utf8_table;
630 }
631