1 /* $XTermId: charset.c,v 1.83 2021/02/18 20:00:36 tom Exp $ */
2
3 /*
4 Copyright 2010-2018,2021 by Thomas E. Dickey
5 Copyright (c) 2001 by Juliusz Chroboczek
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE.
24 */
25
26 #include <charset.h>
27
28 #include <sys.h>
29 #include <parser.h>
30
31 static unsigned int
IdentityRecode(unsigned int n,const CharsetRec * self GCC_UNUSED)32 IdentityRecode(unsigned int n, const CharsetRec * self GCC_UNUSED)
33 {
34 return n;
35 }
36
37 static int
NullReverse(unsigned int n GCC_UNUSED,const CharsetRec * self GCC_UNUSED)38 NullReverse(unsigned int n GCC_UNUSED, const CharsetRec * self GCC_UNUSED)
39 {
40 return -1;
41 }
42
43 static const CharsetRec Unknown94Charset =
44 {"Unknown (94)", T_94, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0};
45 static const CharsetRec Unknown96Charset =
46 {"Unknown (96)", T_96, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0};
47 static const CharsetRec Unknown9494Charset =
48 {"Unknown (94x94)", T_9494, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0};
49 static const CharsetRec Unknown9696Charset =
50 {"Unknown (96x96)", T_9696, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0};
51
/* An unused fontencCharsets[] slot: every field zero, so its name is null. */
#define EmptyFontenc { 0, 0, 0, 0, 0, 0, 0 }
53
/*
 * The "name" given is useful on the command-line.
 * The "xlfd" column is the name given in the X font-encoding package, where
 * that has a corresponding built-in table or ".enc" file.  Other names (such
 * as "dec-technical") are chosen for consistency with other entries in the
 * table.
 */
61 /* *INDENT-OFF* */
62 static FontencCharsetRec fontencCharsets[] =
63 {
64 {"ISO 646 (1973)", T_94, '@', "iso646.1973-0", 0x00, 0, 0},
65 {"ASCII", T_94, 'B', "iso8859-1", 0x00, 0, 0}, /* bug */
66 {"JIS X 0201:GL", T_94, 'J', "jisx0201.1976-0", 0x00, 0, 0},
67 {"JIS X 0201:GR", T_94, 'I', "jisx0201.1976-0", 0x80, 0, 0},
68 {"DEC Special", T_94, '0', "dec-special", 0x00, 0, 0},
69 {"DEC Technical", T_94, '>', "dec-technical", 0x00, 0, 0},
70
71 {"ISO 8859-1", T_96, 'A', "iso8859-1", 0x80, 0, 0},
72 {"ISO 8859-2", T_96, 'B', "iso8859-2", 0x80, 0, 0},
73 {"ISO 8859-3", T_96, 'C', "iso8859-3", 0x80, 0, 0},
74 {"ISO 8859-4", T_96, 'D', "iso8859-4", 0x80, 0, 0},
75 {"ISO 8859-5", T_96, 'L', "iso8859-5", 0x80, 0, 0},
76 {"ISO 8859-6", T_96, 'G', "iso8859-6", 0x80, 0, 0},
77 {"ISO 8859-7", T_96, 'F', "iso8859-7", 0x80, 0, 0},
78 {"ISO 8859-8", T_96, 'H', "iso8859-8", 0x80, 0, 0},
79 {"ISO 8859-9", T_96, 'M', "iso8859-9", 0x80, 0, 0},
80 {"ISO 8859-10", T_96, 'V', "iso8859-10", 0x80, 0, 0},
81 {"ISO 8859-11", T_96, 'T', "iso8859-11", 0x80, 0, 0},
82 {"TIS 620", T_96, 'T', "iso8859-11", 0x80, 0, 0},
83 {"ISO 8859-13", T_96, 'Y', "iso8859-13", 0x80, 0, 0},
84 {"ISO 8859-14", T_96, '_', "iso8859-14", 0x80, 0, 0},
85 {"ISO 8859-15", T_96, 'b', "iso8859-15", 0x80, 0, 0},
86 {"ISO 8859-16", T_96, 'f', "iso8859-16", 0x80, 0, 0},
87 {"KOI8-E", T_96, '@', "koi8-e", 0x80, 0, 0},
88 {"TCVN", T_96, 'Z', "tcvn-0", 0x80, 0, 0},
89
90 {"GB 2312", T_9494, 'A', "gb2312.1980-0", 0x0000, 0, 0},
91 {"JIS X 0208", T_9494, 'B', "jisx0208.1990-0", 0x0000, 0, 0},
92 {"KSC 5601", T_9494, 'C', "ksc5601.1987-0", 0x0000, 0, 0},
93 {"JIS X 0212", T_9494, 'D', "jisx0212.1990-0", 0x0000, 0, 0},
94
95 {"GB 2312", T_9696, 'A', "gb2312.1980-0", 0x0000, 0, 0},
96 {"JIS X 0208", T_9696, 'B', "jisx0208.1990-0", 0x0000, 0, 0},
97 {"KSC 5601", T_9696, 'C', "ksc5601.1987-0", 0x0000, 0, 0},
98 {"JIS X 0212", T_9696, 'D', "jisx0212.1990-0", 0x0000, 0, 0},
99
100 {"CNS11643-1", T_9494, 'G', "cns11643-1", 0x0000, 0, 0},
101 {"CNS11643-2", T_9494, 'H', "cns11643-2", 0x0000, 0, 0},
102 {"CNS11643-3", T_9494, 'I', "cns11643-3", 0x0000, 0, 0},
103
104 {"APL2", T_128, 0, "apl2", 0x80, 0, 0},
105 {"KOI8-R", T_128, 0, "koi8-r", 0x80, 0, 0},
106 {"KOI8-U", T_128, 0, "koi8-u", 0x80, 0, 0},
107 {"KOI8-RU", T_128, 0, "koi8-ru", 0x80, 0, 0},
108 {"CP 1250", T_128, 0, "microsoft-cp1250", 0x80, 0, 0},
109 {"CP 1251", T_128, 0, "microsoft-cp1251", 0x80, 0, 0},
110 {"CP 1252", T_128, 0, "microsoft-cp1252", 0x80, 0, 0},
111 {"CP 1255", T_128, 0, "microsoft-cp1255", 0x80, 0, 0},
112
113 {"CP 437", T_128, 0, "ibm-cp437", 0x80, 0, 0},
114 {"CP 850", T_128, 0, "ibm-cp850", 0x80, 0, 0},
115 {"CP 852", T_128, 0, "ibm-cp852", 0x80, 0, 0},
116 {"CP 865", T_128, 0, "ibm-cp865", 0x80, 0, 0},
117 {"CP 866", T_128, 0, "ibm-cp866", 0x80, 0, 0},
118
119 {"Big 5", T_94192, 0, "big5.eten-0", 0x8000, 0, 0},
120
121 /*
122 * Several empty slots are reserved, to allow for non-ISO-2022 character
123 * sets to be defined in ".enc" files.
124 */
125 EmptyFontenc, /* G0, from ".enc" file */
126 EmptyFontenc, /* G1, from ".enc" file */
127 EmptyFontenc, /* G2, from ".enc" file */
128 EmptyFontenc, /* G3, from ".enc" file */
129 EmptyFontenc
130 };
131 /* *INDENT-ON* */
132
133 typedef struct _OtherCharset {
134 const char *name;
135 int (*init) (OtherStatePtr);
136 unsigned int (*mapping) (unsigned int, OtherStatePtr);
137 unsigned int (*reverse) (unsigned int, OtherStatePtr);
138 int (*stack) (unsigned, OtherStatePtr);
139 } OtherCharsetRec, *OtherCharsetPtr;
140 /* *INDENT-OFF* */
141
142 static const OtherCharsetRec otherCharsets[] =
143 {
144 {"GBK", init_gbk, mapping_gbk, reverse_gbk, stack_gbk},
145 {"UTF-8", init_utf8, mapping_utf8, reverse_utf8, stack_utf8},
146 {"SJIS", init_sjis, mapping_sjis, reverse_sjis, stack_sjis},
147 {"BIG5-HKSCS", init_hkscs, mapping_hkscs, reverse_hkscs, stack_hkscs},
148 {"GB18030", init_gb18030, mapping_gb18030, reverse_gb18030, stack_gb18030},
149 {0, 0, 0, 0, 0}
150 };
151 /* *INDENT-ON* */
152
/*
 * True for characters which the name comparisons skip over: whitespace,
 * '-', '_' and '/'.  The terminating null is never ignorable.
 */
#define lcIgnore(c) \
    ((c) != '\0' && \
     ((c) == '-' || (c) == '_' || (c) == '/' || isspace(UChar(c))))
155
/*
 * Compare two names, ignoring case as well as any characters accepted by
 * lcIgnore() in either string.
 *
 * Returns 0 if the names are equivalent, 1 otherwise.
 */
int
lcStrCmp(const char *s, const char *t)
{
    const char *lhs = s;
    const char *rhs = t;

    while (*lhs != '\0' || *rhs != '\0') {
        if (lcIgnore(*lhs)) {
            ++lhs;
            continue;
        }
        if (lcIgnore(*rhs)) {
            ++rhs;
            continue;
        }
        /* one string ended early, or the significant characters differ */
        if (*lhs == '\0' || *rhs == '\0'
            || tolower(UChar(*lhs)) != tolower(UChar(*rhs))) {
            return 1;
        }
        ++lhs;
        ++rhs;
    }
    return 0;
}
176
/*
 * Like lcStrCmp(), but gives up after at most "n" steps.  Every loop
 * iteration counts as one step, including those which merely skip an
 * ignorable character.
 *
 * Returns 0 if no mismatch was seen within the limit, 1 otherwise.
 */
static int
compare1(const char *s, const char *t, size_t n)
{
    const char *lhs = s;
    const char *rhs = t;
    size_t steps;

    for (steps = n; steps != 0 && (*lhs != '\0' || *rhs != '\0'); --steps) {
        if (lcIgnore(*lhs)) {
            ++lhs;
            continue;
        }
        if (lcIgnore(*rhs)) {
            ++rhs;
            continue;
        }
        if (*lhs == '\0' || *rhs == '\0'
            || tolower(UChar(*lhs)) != tolower(UChar(*rhs))) {
            return 1;
        }
        ++lhs;
        ++rhs;
    }
    return 0;
}
198
199 static unsigned int
FontencCharsetRecode(unsigned int n,const CharsetRec * self)200 FontencCharsetRecode(unsigned int n, const CharsetRec * self)
201 {
202 const FontencCharsetRec *fc = (const FontencCharsetRec *) (self->data);
203 unsigned result;
204
205 result = MapCodeValue(n + fc->shift, fc->mapping);
206
207 TRACE(("FontencCharsetRecode %#x ->%#x%s\n",
208 n,
209 result,
210 (n != result) ? " map" : ""));
211
212 return result;
213 }
214
215 static int
FontencCharsetReverse(unsigned int i,const CharsetRec * self)216 FontencCharsetReverse(unsigned int i, const CharsetRec * self)
217 {
218 const FontencCharsetRec *fc = (const FontencCharsetRec *) (self->data);
219 unsigned n;
220 int result = -1;
221
222 n = fc->reverse->reverse(i, fc->reverse->data);
223 if (n != 0 && n >= fc->shift) {
224 n -= fc->shift;
225
226 #define IS_GL(n) ((n) >= 0x20 && (n) < 0x80)
227 switch (self->type) {
228 case T_94:
229 case T_96:
230 if (IS_GL(n))
231 result = (int) n;
232 break;
233 case T_128:
234 if (n < 0x80)
235 result = (int) n;
236 break;
237 case T_9494:
238 case T_9696:
239 if (IS_GL(n >> 8) && IS_GL(n & 0xFF))
240 result = (int) n;
241 break;
242 case T_94192:
243 if (IS_GL(n >> 8) && IS_GL(n & 0x7F))
244 result = (int) n;
245 break;
246 default:
247 abort();
248 /* NOTREACHED */
249 }
250 #undef IS_GL
251 }
252
253 TRACE(("FontencCharsetReverse %#x ->%#x%s\n",
254 i,
255 result,
256 ((int) i != result) ? " map" : ""));
257
258 return result;
259 }
260
261 static CharsetPtr cachedCharsets = NULL;
262
263 static CharsetPtr
getCachedCharset(unsigned final,int type,const char * name)264 getCachedCharset(unsigned final, int type, const char *name)
265 {
266 CharsetPtr c;
267 for (c = cachedCharsets; c; c = c->next) {
268 if (((c->type == type && c->final == final) ||
269 (name && !lcStrCmp(c->name, name))) &&
270 (c->type != T_FAILED))
271 return c;
272 }
273 return NULL;
274 }
275
276 static void
cacheCharset(CharsetPtr c)277 cacheCharset(CharsetPtr c)
278 {
279 c->next = cachedCharsets;
280 cachedCharsets = c;
281 VERBOSE(2, ("cachedCharset '%s'\n", c->name));
282 }
283
284 #ifdef USE_ICONV
285 static US_SIZE
cpSize(FontencCharsetPtr fc)286 cpSize(FontencCharsetPtr fc)
287 {
288 US_SIZE result = usANY;
289
290 switch (fc->type) {
291 case T_94:
292 case T_96:
293 case T_128:
294 result = us8BIT;
295 break;
296 case T_9494:
297 case T_9696:
298 case T_94192:
299 result = us16BIT;
300 break;
301 }
302 return result;
303 }
304 #endif
305
306 static int
addFontencCharset(const char * name,FontEncPtr f)307 addFontencCharset(const char *name, FontEncPtr f)
308 {
309 FontencCharsetPtr fc = fontencCharsets;
310 FontencCharsetPtr limit = fc + SizeOf(fontencCharsets);
311 int result = 0;
312 int c_size = typeOfFontenc(f);
313 int c_type;
314
315 if (c_size <= 94) {
316 c_type = T_94;
317 } else if (c_size <= 96) {
318 c_type = T_96;
319 } else if (c_size <= 128) {
320 c_type = T_128;
321 } else {
322 VERBOSE(1, ("unexpected character-set size: %d\n", c_size));
323 return 0;
324 }
325
326 while (fc->name) {
327 fc++;
328 }
329 if (fc < (limit - 1)) {
330 result = 1;
331 fc->name = strdup(name);
332 fc->xlfd = strdup(name);
333 fc->type = c_type;
334 fc->shift = shiftOfFontenc(f);
335 }
336 return result;
337 }
338
339 static CharsetPtr
getFontencCharset(unsigned final,int type,const char * name)340 getFontencCharset(unsigned final, int type, const char *name)
341 {
342 FontencCharsetPtr fc;
343 CharsetPtr c = NULL;
344 FontMapPtr mapping;
345 FontMapReversePtr reverse;
346 CharsetPtr result = NULL;
347
348 TRACE(("getFontencCharset(final %#x, type %d, name %s)\n",
349 final, type, NonNull(name)));
350
351 fc = fontencCharsets;
352 while (fc->name) {
353 if (((fc->type == type && fc->final == final) ||
354 (name && !lcStrCmp(fc->name, name))) &&
355 (fc->type != T_FAILED))
356 break;
357 fc++;
358 }
359
360 if (!fc->name) {
361 VERBOSE(2, ("...no match for '%s' in FontEnc charsets\n", NonNull(name)));
362 } else if ((c = TypeCalloc(CharsetRec)) == 0) {
363 VERBOSE(2, ("malloc failed\n"));
364 } else if ((mapping = LookupMapping(fc->xlfd, cpSize(fc))) == NULL) {
365 VERBOSE(2, ("...lookup mapping %s (%s) failed\n", NonNull(name), fc->xlfd));
366 fc->type = T_FAILED;
367 } else if ((reverse = LookupReverse(mapping)) == NULL) {
368 VERBOSE(2, ("...lookup reverse %s failed\n", NonNull(name)));
369 fc->type = T_FAILED;
370 } else {
371 fc->mapping = mapping;
372 fc->reverse = reverse;
373
374 c->name = fc->name;
375 c->type = fc->type;
376 c->final = fc->final;
377 c->recode = FontencCharsetRecode;
378 c->reverse = FontencCharsetReverse;
379 c->data = fc;
380
381 cacheCharset(c);
382 result = c;
383 }
384
385 if (result == NULL && c != NULL)
386 free(c);
387
388 return result;
389 }
390
391 static const OtherCharsetRec *
findOtherCharset(const char * name)392 findOtherCharset(const char *name)
393 {
394 const OtherCharsetRec *fc;
395 fc = otherCharsets;
396 while (fc->name) {
397 if (name && !lcStrCmp(fc->name, name))
398 break;
399 fc++;
400 }
401 return fc;
402 }
403
404 int
isOtherCharset(const char * name)405 isOtherCharset(const char *name)
406 {
407 const OtherCharsetRec *fc = findOtherCharset(name);
408 int result = (fc->name != 0);
409 if (!result) {
410 result = (!lcStrCmp(name, "Big5") ||
411 !lcStrCmp(name, "JOHAB"));
412 }
413 return result;
414 }
415
416 static CharsetPtr
getOtherCharset(const char * name)417 getOtherCharset(const char *name)
418 {
419 const OtherCharsetRec *fc;
420 CharsetPtr c = NULL;
421 OtherStatePtr s = NULL;
422 CharsetPtr result = NULL;
423
424 fc = findOtherCharset(name);
425 if (!fc->name) {
426 VERBOSE(2, ("...no match for '%s' in Other charsets\n", NonNull(name)));
427 } else if ((c = TypeCalloc(CharsetRec)) == NULL) {
428 VERBOSE(2, ("malloc failed\n"));
429 } else if ((s = TypeCalloc(OtherState)) == NULL) {
430 VERBOSE(2, ("malloc failed\n"));
431 } else {
432 c->name = fc->name;
433 c->type = T_OTHER;
434 c->final = 0;
435 c->data = fc;
436 c->other_recode = fc->mapping;
437 c->other_reverse = fc->reverse;
438 c->other_stack = fc->stack;
439 c->other_aux = s;
440
441 if (!fc->init(s)) {
442 VERBOSE(2, ("...initialization %s failed\n", NonNull(name)));
443 c->type = T_FAILED;
444 } else {
445 cacheCharset(c);
446 result = c;
447 }
448 }
449
450 if (result == NULL) {
451 if (c != NULL)
452 free(c);
453 if (s != NULL)
454 free(s);
455 }
456
457 return result;
458 }
459
460 const CharsetRec *
getUnknownCharset(int type)461 getUnknownCharset(int type)
462 {
463 TRACE(("getUnknownCharset(%d)\n", type));
464 switch (type) {
465 case T_94:
466 VERBOSE(2, ("using unknown 94-charset\n"));
467 return &Unknown94Charset;
468 case T_96:
469 VERBOSE(2, ("using unknown 96-charset\n"));
470 return &Unknown96Charset;
471 case T_9494:
472 VERBOSE(2, ("using unknown 9494-charset\n"));
473 return &Unknown9494Charset;
474 case T_9696:
475 VERBOSE(2, ("using unknown 9696-charset\n"));
476 return &Unknown9696Charset;
477 default:
478 VERBOSE(2, ("using unknown 94-charset\n"));
479 return &Unknown94Charset;
480 }
481 }
482
483 const CharsetRec *
getCharset(unsigned final,int type)484 getCharset(unsigned final, int type)
485 {
486 const CharsetRec *c;
487
488 TRACE(("getCharset(final=%c, type=%d)\n", final, type));
489 c = getCachedCharset(final, type, NULL);
490 if (c)
491 return c;
492
493 c = getFontencCharset(final, type, NULL);
494 if (c)
495 return c;
496
497 return getUnknownCharset(type);
498 }
499
500 const CharsetRec *
getCharsetByName(const char * name)501 getCharsetByName(const char *name)
502 {
503 const CharsetRec *c;
504 FontEncPtr f;
505 int type = T_94;
506
507 VERBOSE(2, ("getCharsetByName(%s)\n", NonNull(name)));
508 TRACE(("getCharsetByName(%s)\n", NonNull(name)));
509
510 if (name == NULL)
511 return getUnknownCharset(type);
512
513 c = getCachedCharset(0, 0, name);
514 if (c)
515 return c;
516
517 c = getFontencCharset(0, 0, name);
518 if (c)
519 return c;
520
521 c = getOtherCharset(name);
522 if (c)
523 return c;
524
525 /*
526 * If we did not find the name in a table, look for a ".enc" * file.
527 */
528 if ((f = lookupOneFontenc(name)) != 0) {
529 if (addFontencCharset(name, f)) {
530 c = getFontencCharset(0, 0, name);
531 if (c)
532 return c;
533 }
534 }
535 return getUnknownCharset(type);
536 }
537 /* *INDENT-OFF* */
538 static const LocaleCharsetRec localeCharsets[] =
539 {
540 {"C", 0, 2, "ASCII", NULL, "ISO 8859-1", NULL, NULL},
541 {"POSIX", 0, 2, "ASCII", NULL, "ISO 8859-1", NULL, NULL},
542 {"US-ASCII", 0, 2, "ASCII", NULL, "ISO 8859-1", NULL, NULL},
543
544 {"ISO8859-1", 0, 2, "ASCII", NULL, "ISO 8859-1", NULL, NULL},
545 {"ISO8859-2", 0, 2, "ASCII", NULL, "ISO 8859-2", NULL, NULL},
546 {"ISO8859-3", 0, 2, "ASCII", NULL, "ISO 8859-3", NULL, NULL},
547 {"ISO8859-4", 0, 2, "ASCII", NULL, "ISO 8859-4", NULL, NULL},
548 {"ISO8859-5", 0, 2, "ASCII", NULL, "ISO 8859-5", NULL, NULL},
549 {"ISO8859-6", 0, 2, "ASCII", NULL, "ISO 8859-6", NULL, NULL},
550 {"ISO8859-7", 0, 2, "ASCII", NULL, "ISO 8859-7", NULL, NULL},
551 {"ISO8859-8", 0, 2, "ASCII", NULL, "ISO 8859-8", NULL, NULL},
552 {"ISO8859-9", 0, 2, "ASCII", NULL, "ISO 8859-9", NULL, NULL},
553 {"ISO8859-10", 0, 2, "ASCII", NULL, "ISO 8859-10", NULL, NULL},
554 {"ISO8859-11", 0, 2, "ASCII", NULL, "ISO 8859-11", NULL, NULL},
555 {"TIS620", 0, 2, "ASCII", NULL, "ISO 8859-11", NULL, NULL},
556 {"ISO8859-13", 0, 2, "ASCII", NULL, "ISO 8859-13", NULL, NULL},
557 {"ISO8859-14", 0, 2, "ASCII", NULL, "ISO 8859-14", NULL, NULL},
558 {"ISO8859-15", 0, 2, "ASCII", NULL, "ISO 8859-15", NULL, NULL},
559 {"ISO8859-16", 0, 2, "ASCII", NULL, "ISO 8859-16", NULL, NULL},
560
561 {"KOI8-E", 0, 2, "ASCII", NULL, "KOI8-E", NULL, NULL},
562 {"KOI8-R", 0, 2, "ASCII", NULL, "KOI8-R", NULL, NULL},
563 {"KOI8-U", 0, 2, "ASCII", NULL, "KOI8-U", NULL, NULL},
564 {"KOI8-RU", 0, 2, "ASCII", NULL, "KOI8-RU", NULL, NULL},
565 {"CP1250", 0, 2, "ASCII", NULL, "CP 1250", NULL, NULL},
566 {"CP1251", 0, 2, "ASCII", NULL, "CP 1251", NULL, NULL},
567 {"CP1252", 0, 2, "ASCII", NULL, "CP 1252", NULL, NULL},
568 {"CP1255", 0, 2, "ASCII", NULL, "CP 1255", NULL, NULL},
569 {"CP437", 0, 2, "ASCII", NULL, "CP 437", NULL, NULL},
570 {"CP850", 0, 2, "ASCII", NULL, "CP 850", NULL, NULL},
571 {"CP852", 0, 2, "ASCII", NULL, "CP 852", NULL, NULL},
572 {"CP865", 0, 2, "ASCII", NULL, "CP 865", NULL, NULL},
573 {"CP866", 0, 2, "ASCII", NULL, "CP 866", NULL, NULL},
574 {"TCVN", 0, 2, "ASCII", NULL, "TCVN", NULL, NULL},
575
576 {"GB2312", 0, 1, "ASCII", "GB 2312", NULL, NULL, NULL},
577 {"eucJP", 0, 1, "ASCII", "JIS X 0208", "JIS X 0201:GR", "JIS X 0212", NULL},
578 {"eucKR", 0, 1, "ASCII", "KSC 5601", NULL, NULL, NULL},
579 {"eucCN", 0, 1, "ASCII", "GB 2312", NULL, NULL, NULL},
580 {"eucTW", 0, 1, "ASCII", "CNS11643-1", "CNS11643-2", "CNS11643-3", NULL},
581 {"Big5", 0, 1, "ASCII", "Big 5", NULL, NULL, NULL},
582
583 {"gbk", 0, 1, NULL, NULL, NULL, NULL, "GBK"},
584 {"UTF-8", 0, 1, NULL, NULL, NULL, NULL, "UTF-8"},
585 {"SJIS", 0, 1, NULL, NULL, NULL, NULL, "SJIS"},
586 {"Big5-HKSCS", 0, 1, NULL, NULL, NULL, NULL, "BIG5-HKSCS"},
587 {"gb18030", 0, 1, NULL, NULL, NULL, NULL, "GB18030"},
588
589 {0, 0, 0, 0, 0, 0, 0, 0}
590 };
591 /* *INDENT-ON* */
592
593 void
reportCharsets(void)594 reportCharsets(void)
595 {
596 const LocaleCharsetRec *p;
597 FontencCharsetPtr q;
598 printf("Known locale encodings:\n\n");
599 for (p = localeCharsets; p->name; p++) {
600 if (p->other) {
601 printf(" %s (non-ISO-2022 encoding)\n", p->other);
602 continue;
603 }
604 printf(" %s: GL -> G%d, GR -> G%d", p->name, p->gl, p->gr);
605 if (p->g0)
606 printf(", G0: %s", p->g0);
607 if (p->g1)
608 printf(", G1: %s", p->g1);
609 if (p->g2)
610 printf(", G2: %s", p->g2);
611 if (p->g3)
612 printf(", G3: %s", p->g3);
613 printf("\n");
614 }
615
616 printf("\n\nKnown charsets (not all may be available):\n\n");
617 for (q = fontencCharsets; q->name; q++) {
618 const char *csize = "";
619
620 printf(" %s", q->name);
621 switch (q->type) {
622 case T_94:
623 csize = "94 codes";
624 break;
625 case T_96:
626 csize = "96 codes";
627 break;
628 case T_128:
629 csize = "128 codes";
630 break;
631 case T_9494:
632 csize = "94x94 codes";
633 break;
634 case T_9696:
635 csize = "94x96 codes";
636 break;
637 case T_94192:
638 csize = "94x192 codes";
639 break;
640 }
641 if (q->final) {
642 printf(" (ISO 2022%s%s)", *csize ? ", " : "", csize);
643 } else if (*csize) {
644 printf(" (%s)", csize);
645 }
646 printf("\n");
647 }
648 }
649
650 #ifdef USE_ICONV
651 static LocaleCharsetRec fakeLocaleCharset;
652
653 static const LocaleCharsetRec *
findLocaleByCharset(const char * charset)654 findLocaleByCharset(const char *charset)
655 {
656 const LocaleCharsetRec *lc;
657 const LocaleCharsetRec *result = 0;
658
659 for (lc = localeCharsets; lc->name != 0; ++lc) {
660 if (lc->g1 == 0 && lc->g2 == 0)
661 continue;
662 if ((lc->g3 != 0 && !lcStrCmp(charset, lc->g3))
663 || (lc->g2 != 0 && !lcStrCmp(charset, lc->g2))
664 || (lc->g1 != 0 && !lcStrCmp(charset, lc->g1))) {
665 result = lc;
666 break;
667 }
668 }
669 TRACE(("findLocaleByCharset(%s) ->%s\n",
670 charset, result ? result->name : "?"));
671 return result;
672 }
673
674 static const LocaleCharsetRec *
closestLocaleCharset(FontEncPtr enc)675 closestLocaleCharset(FontEncPtr enc)
676 {
677 const LocaleCharsetRec *result = 0;
678
679 if (enc != 0) {
680 const FontencCharsetRec *fc = getFontencByName(enc->name);
681 if (fc != 0) {
682 result = findLocaleByCharset(fc->name);
683 } else {
684 result = findLocaleByCharset(enc->name);
685 }
686 }
687 TRACE(("closestLocaleCharset(%s) ->%s\n",
688 enc ? enc->name : "?",
689 result ? result->name : "?"));
690 return result;
691 }
692
693 static int
canFakeLocaleCharset(FontEncPtr enc)694 canFakeLocaleCharset(FontEncPtr enc)
695 {
696 int result = 0;
697 if (enc != 0
698 && enc->size <= 256
699 && enc->row_size == 0) {
700 result = 1;
701 }
702 return result;
703 }
704 #endif
705
706 static const LocaleCharsetRec *
findLocaleCharset(const char * charset)707 findLocaleCharset(const char *charset)
708 {
709 const LocaleCharsetRec *p;
710 const LocaleCharsetRec *result = 0;
711
712 for (p = localeCharsets; p->name; p++) {
713 if (lcStrCmp(p->name, charset) == 0) {
714 result = p;
715 break;
716 }
717 }
718 #ifdef USE_ICONV
719 /*
720 * The table is useful, but not complete.
721 * If we can find a mapping for an 8-bit encoding, fake a table entry.
722 */
723 if (result == 0) {
724 FontEncPtr enc = luitGetFontEnc(charset,
725 (UM_MODE) ((int) umICONV
726 | (int) umFONTENC
727 | (int) umBUILTIN));
728 if ((result = closestLocaleCharset(enc)) != 0) {
729 TRACE(("...matched a LocaleCharset record for %s\n", NonNull(charset)));
730 } else if (canFakeLocaleCharset(enc)) {
731 LocaleCharsetRec *temp = &fakeLocaleCharset;
732
733 TRACE(("...fake a LocaleCharset record for %s\n", NonNull(charset)));
734
735 memset(temp, 0, sizeof(*temp));
736 temp->name = strmalloc(charset);
737 temp->gr = 2;
738 temp->g0 = "ASCII";
739 temp->g2 = temp->name;
740 result = temp;
741
742 } else {
743 TRACE(("...do not know how to fake LocaleCharset for %s\n",
744 NonNull(charset)));
745 }
746 luitFreeFontEnc(enc);
747 }
748 #endif
749 return result;
750 }
751
752 static const LocaleCharsetRec *
matchLocaleCharset(const char * charset)753 matchLocaleCharset(const char *charset)
754 {
755 static const struct {
756 const char *source;
757 const char *target;
758 size_t source_len;
759 size_t target_len;
760 } prefixes[] = {
761 #define DATA(source, target) { source, target, sizeof(source)-1, sizeof(target)-1 }
762 DATA("ISO-", "ISO "),
763 DATA("DEC ", "DEC-"),
764 DATA("IBM-CP", "CP "), /* "ibm-cp866" -> "cp 866" (iconv) */
765 DATA("IBM", "CP "),
766 DATA("MICROSOFT-CP", "CP "),
767 DATA("MICROSOFT", "CP "),
768 DATA("CP-", "CP "),
769 DATA("ANSI", "CP "), /* e.g., Solaris ANSI1251 */
770 #undef DATA
771 };
772
773 const LocaleCharsetRec *p = 0;
774
775 TRACE(("matchLocaleCharset(%s)\n", NonNull(charset)));
776 if (!IsEmpty(charset)) {
777 char *euro;
778 char source[MAX_KEYWORD_LENGTH + 1];
779
780 sprintf(source, "%.*s", MAX_KEYWORD_LENGTH, charset);
781 if ((euro = strrchr(source, '@')) != 0 && !strcmp(euro, "@euro")) {
782 Warning("the euro character may not be supported\n");
783 *euro = 0;
784 }
785
786 p = findLocaleCharset(source);
787
788 if (p == 0) {
789 size_t have = strlen(source);
790 size_t n;
791 char target[MAX_KEYWORD_LENGTH + 80];
792
793 for (n = 0; n < SizeOf(prefixes); ++n) {
794 if (have > prefixes[n].source_len
795 && have < MAX_KEYWORD_LENGTH
796 && !compare1(source,
797 prefixes[n].source,
798 prefixes[n].source_len)) {
799 strcpy(target, prefixes[n].target);
800 strcpy(target + prefixes[n].target_len,
801 source + prefixes[n].source_len);
802 if ((p = findLocaleCharset(target)) != 0) {
803 break;
804 }
805 }
806 }
807 }
808 }
809 return p;
810 }
811
812 int
getLocaleState(const char * locale,const char * charset,int * gl_return,int * gr_return,const CharsetRec ** g0_return,const CharsetRec ** g1_return,const CharsetRec ** g2_return,const CharsetRec ** g3_return,const CharsetRec ** other_return)813 getLocaleState(const char *locale,
814 const char *charset,
815 int *gl_return, int *gr_return,
816 const CharsetRec * *g0_return,
817 const CharsetRec * *g1_return,
818 const CharsetRec * *g2_return,
819 const CharsetRec * *g3_return,
820 const CharsetRec * *other_return)
821 {
822 int result = 0;
823 char *resolved = 0;
824 const LocaleCharsetRec *p;
825
826 TRACE(("getLocaleState(locale=%s, charset=%s)\n", locale, NonNull(charset)));
827 if (IsEmpty(charset)) {
828 if (ignore_locale) {
829 charset = locale;
830 } else {
831 resolved = resolveLocale(locale);
832 if (!resolved)
833 return -1;
834 if ((charset = strrchr(resolved, '.')) != 0) {
835 charset++;
836 } else {
837 charset = resolved;
838 }
839 }
840 }
841
842 if ((p = matchLocaleCharset(charset)) != 0) {
843 *gl_return = p->gl;
844 *gr_return = p->gr;
845 *g0_return = getCharsetByName(p->g0);
846 *g1_return = getCharsetByName(p->g1);
847 *g2_return = getCharsetByName(p->g2);
848 *g3_return = getCharsetByName(p->g3);
849 if (p->other)
850 *other_return = getCharsetByName(p->other);
851 else
852 *other_return = NULL;
853 } else {
854 result = -1;
855 }
856
857 if (resolved != 0)
858 free(resolved);
859
860 TRACE(("...getLocaleState ->%d\n", result));
861 return result;
862 }
863
864 #ifdef USE_ICONV
865 /*
866 * Given either a charset name, or the xlfd field (which is more likely to
867 * match iconv), return a pointer to the entry in fontencCharsets which matches.
868 */
869 const FontencCharsetRec *
getFontencByName(const char * encoding_name)870 getFontencByName(const char *encoding_name)
871 {
872 const FontencCharsetRec *result = 0;
873 const FontencCharsetRec *fc;
874 char *gr_special;
875
876 for (fc = fontencCharsets; fc->name != 0; ++fc) {
877 if (!lcStrCmp(encoding_name, fc->name)
878 || (strstr(fc->name, ":GL") == 0
879 && !lcStrCmp(encoding_name, fc->xlfd))) {
880 result = fc;
881 break;
882 }
883 }
884
885 /*
886 * Luit treats ":GR" specially in its charset tables, essentially to
887 * distinguish the case it uses for JIS X 201 from other possibilities.
888 */
889 if (result == 0
890 && strchr(encoding_name, ':') == 0
891 && (gr_special = malloc(strlen(encoding_name) + 4)) != 0) {
892 sprintf(gr_special, "%s:GR", encoding_name);
893 result = getFontencByName(gr_special);
894 free(gr_special);
895 }
896 TRACE(("getFontencByName(%s) ->%s\n",
897 encoding_name,
898 result ? result->name : "?"));
899 return result;
900 }
901
902 /*
903 * Check (for EUC-JP specifically, but generally...) for a charset which
904 * is part of a composite charset using G2/G3 via single-shifts.
905 */
906 const char *
getCompositeCharset(const char * encoding_name)907 getCompositeCharset(const char *encoding_name)
908 {
909 const char *result = 0;
910 const FontencCharsetRec *fc;
911 const LocaleCharsetRec *lc;
912
913 if ((fc = getFontencByName(encoding_name)) != 0) {
914 if ((lc = findLocaleByCharset(fc->name)) != 0) {
915 result = lc->name;
916 TRACE(("getCompositeCharset(%s) ->%s\n",
917 NonNull(encoding_name), NonNull(result)));
918 }
919 }
920 return result;
921 }
922
923 static const char *
selectPart(const LocaleCharsetRec * data,unsigned g)924 selectPart(const LocaleCharsetRec * data, unsigned g)
925 {
926 const char *result = 0;
927 switch (g) {
928 case 0:
929 result = data->g0;
930 break;
931 case 1:
932 result = data->g1;
933 break;
934 case 2:
935 result = data->g2;
936 break;
937 case 3:
938 result = data->g3;
939 break;
940 }
941 return result;
942 }
943
944 /*
945 * Given a composite name returned by getCompositeCharset, return a pointer to
946 * the data which describes the encoding used for a given shift.
947 */
948 const FontencCharsetRec *
getCompositePart(const char * composite_name,unsigned g)949 getCompositePart(const char *composite_name, unsigned g)
950 {
951 const FontencCharsetRec *result = 0;
952 const LocaleCharsetRec *lc;
953 const char *part_name;
954
955 for (lc = localeCharsets; lc->name; ++lc) {
956 if (!lcStrCmp(composite_name, lc->name)) {
957 if ((part_name = selectPart(lc, g)) != 0) {
958 const FontencCharsetRec *fc;
959 for (fc = fontencCharsets; fc->name != 0; ++fc) {
960 if (!lcStrCmp(part_name, fc->name)) {
961 result = fc;
962 break;
963 }
964 }
965 }
966 break;
967 }
968 }
969 return result;
970 }
971 #endif
972
973 #ifdef NO_LEAKS
974 static int
isUnknownCharsetPtr(CharsetPtr p)975 isUnknownCharsetPtr(CharsetPtr p)
976 {
977 return (p == &Unknown94Charset
978 || p == &Unknown96Charset
979 || p == &Unknown9494Charset
980 || p == &Unknown9696Charset);
981 }
982
983 static void
destroyFontencCharsetPtr(FontencCharsetPtr p)984 destroyFontencCharsetPtr(FontencCharsetPtr p)
985 {
986 #ifdef USE_ICONV
987 if (p->reverse) {
988 luitDestroyReverse(p->reverse);
989 }
990 #else
991 p->mapping = 0;
992
993 /*
994 * This should, but does not work -
995 * FontMapReverseFree(p->reverse)
996 *
997 * The iteration for map[] is based on reading the source of
998 * FontMapReverse().
999 */
1000 if (p->reverse) {
1001 int n;
1002 unsigned **map = p->reverse->data;
1003 for (n = 0; n < 256; ++n) {
1004 if (map[n])
1005 free(map[n]);
1006 }
1007 free(p->reverse->data);
1008 free(p->reverse);
1009 p->reverse = 0;
1010 }
1011 #endif
1012 }
1013
1014 static void
destroyCharset(CharsetPtr p)1015 destroyCharset(CharsetPtr p)
1016 {
1017 if (!isUnknownCharsetPtr(p)) {
1018 if (p->type == T_OTHER) {
1019 free(p->other_aux);
1020 } else {
1021 destroyFontencCharsetPtr((FontencCharsetPtr) p->data);
1022 }
1023 free(p);
1024 }
1025 }
1026
1027 void
charset_leaks(void)1028 charset_leaks(void)
1029 {
1030 while (cachedCharsets != 0) {
1031 CharsetPtr next = cachedCharsets->next;
1032 destroyCharset(cachedCharsets);
1033 cachedCharsets = next;
1034 }
1035 #ifdef USE_ICONV
1036 if (fakeLocaleCharset.name != 0) {
1037 free((void *) fakeLocaleCharset.name);
1038 fakeLocaleCharset.name = 0;
1039 }
1040 #endif
1041 }
1042 #endif /* NO_LEAKS */
1043