1 /* $XTermId: charset.c,v 1.83 2021/02/18 20:00:36 tom Exp $ */
2 
3 /*
4 Copyright 2010-2018,2021 by Thomas E. Dickey
5 Copyright (c) 2001 by Juliusz Chroboczek
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE.
24 */
25 
26 #include <charset.h>
27 
28 #include <sys.h>
29 #include <parser.h>
30 
31 static unsigned int
IdentityRecode(unsigned int n,const CharsetRec * self GCC_UNUSED)32 IdentityRecode(unsigned int n, const CharsetRec * self GCC_UNUSED)
33 {
34     return n;
35 }
36 
37 static int
NullReverse(unsigned int n GCC_UNUSED,const CharsetRec * self GCC_UNUSED)38 NullReverse(unsigned int n GCC_UNUSED, const CharsetRec * self GCC_UNUSED)
39 {
40     return -1;
41 }
42 
/*
 * Stand-in charset records, one per charset type, handed out by
 * getUnknownCharset().  Each one recodes with IdentityRecode() and reverses
 * with NullReverse().
 *
 * NOTE(review): the initializers are positional; the leading members look
 * like name, type, final, recode, reverse, and the trailing zeros fill the
 * remaining CharsetRec members, whose declaration is not in this file.
 */
static const CharsetRec Unknown94Charset =
{"Unknown (94)", T_94, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0};
static const CharsetRec Unknown96Charset =
{"Unknown (96)", T_96, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0};
static const CharsetRec Unknown9494Charset =
{"Unknown (94x94)", T_9494, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0};
static const CharsetRec Unknown9696Charset =
{"Unknown (96x96)", T_9696, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0};
51 
/* An unused table slot: its null name also terminates scans of the table. */
#define EmptyFontenc {0, 0, 0, 0, 0, 0, 0}

/*
 * The "name" given is useful on the command-line.
 * The "xlfd" column is the name given in the X font-encoding package, where
 * that has a corresponding built-in table or ".enc" file.  Other names (such
 * as "dec-technical") are chosen for consistency with other entries in the
 * table.
 *
 * Columns, in order: name, type, final, xlfd, shift, and two slots that
 * getFontencCharset() fills in as "mapping" and "reverse" once the lookups
 * succeed.  The table is not const because getFontencCharset() updates
 * entries (including marking them T_FAILED) and addFontencCharset() fills
 * the reserved empty slots at the end.
 */
/* *INDENT-OFF* */
static FontencCharsetRec fontencCharsets[] =
{
    {"ISO 646 (1973)", T_94,    '@', "iso646.1973-0",    0x00,   0, 0},
    {"ASCII",          T_94,    'B', "iso8859-1",        0x00,   0, 0},	/* bug */
    {"JIS X 0201:GL",  T_94,    'J', "jisx0201.1976-0",  0x00,   0, 0},
    {"JIS X 0201:GR",  T_94,    'I', "jisx0201.1976-0",  0x80,   0, 0},
    {"DEC Special",    T_94,    '0', "dec-special",      0x00,   0, 0},
    {"DEC Technical",  T_94,    '>', "dec-technical",    0x00,   0, 0},

    {"ISO 8859-1",     T_96,    'A', "iso8859-1",        0x80,   0, 0},
    {"ISO 8859-2",     T_96,    'B', "iso8859-2",        0x80,   0, 0},
    {"ISO 8859-3",     T_96,    'C', "iso8859-3",        0x80,   0, 0},
    {"ISO 8859-4",     T_96,    'D', "iso8859-4",        0x80,   0, 0},
    {"ISO 8859-5",     T_96,    'L', "iso8859-5",        0x80,   0, 0},
    {"ISO 8859-6",     T_96,    'G', "iso8859-6",        0x80,   0, 0},
    {"ISO 8859-7",     T_96,    'F', "iso8859-7",        0x80,   0, 0},
    {"ISO 8859-8",     T_96,    'H', "iso8859-8",        0x80,   0, 0},
    {"ISO 8859-9",     T_96,    'M', "iso8859-9",        0x80,   0, 0},
    {"ISO 8859-10",    T_96,    'V', "iso8859-10",       0x80,   0, 0},
    {"ISO 8859-11",    T_96,    'T', "iso8859-11",       0x80,   0, 0},
    {"TIS 620",        T_96,    'T', "iso8859-11",       0x80,   0, 0},
    {"ISO 8859-13",    T_96,    'Y', "iso8859-13",       0x80,   0, 0},
    {"ISO 8859-14",    T_96,    '_', "iso8859-14",       0x80,   0, 0},
    {"ISO 8859-15",    T_96,    'b', "iso8859-15",       0x80,   0, 0},
    {"ISO 8859-16",    T_96,    'f', "iso8859-16",       0x80,   0, 0},
    {"KOI8-E",         T_96,    '@', "koi8-e",           0x80,   0, 0},
    {"TCVN",           T_96,    'Z', "tcvn-0",           0x80,   0, 0},

    {"GB 2312",        T_9494,  'A', "gb2312.1980-0",    0x0000, 0, 0},
    {"JIS X 0208",     T_9494,  'B', "jisx0208.1990-0",  0x0000, 0, 0},
    {"KSC 5601",       T_9494,  'C', "ksc5601.1987-0",   0x0000, 0, 0},
    {"JIS X 0212",     T_9494,  'D', "jisx0212.1990-0",  0x0000, 0, 0},

    {"GB 2312",        T_9696,  'A', "gb2312.1980-0",    0x0000, 0, 0},
    {"JIS X 0208",     T_9696,  'B', "jisx0208.1990-0",  0x0000, 0, 0},
    {"KSC 5601",       T_9696,  'C', "ksc5601.1987-0",   0x0000, 0, 0},
    {"JIS X 0212",     T_9696,  'D', "jisx0212.1990-0",  0x0000, 0, 0},

    {"CNS11643-1",     T_9494,  'G', "cns11643-1",       0x0000, 0, 0},
    {"CNS11643-2",     T_9494,  'H', "cns11643-2",       0x0000, 0, 0},
    {"CNS11643-3",     T_9494,  'I', "cns11643-3",       0x0000, 0, 0},

    {"APL2",           T_128,   0,   "apl2",             0x80,   0, 0},
    {"KOI8-R",         T_128,   0,   "koi8-r",           0x80,   0, 0},
    {"KOI8-U",         T_128,   0,   "koi8-u",           0x80,   0, 0},
    {"KOI8-RU",        T_128,   0,   "koi8-ru",          0x80,   0, 0},
    {"CP 1250",        T_128,   0,   "microsoft-cp1250", 0x80,   0, 0},
    {"CP 1251",        T_128,   0,   "microsoft-cp1251", 0x80,   0, 0},
    {"CP 1252",        T_128,   0,   "microsoft-cp1252", 0x80,   0, 0},
    {"CP 1255",        T_128,   0,   "microsoft-cp1255", 0x80,   0, 0},

    {"CP 437",         T_128,   0,   "ibm-cp437",        0x80,   0, 0},
    {"CP 850",         T_128,   0,   "ibm-cp850",        0x80,   0, 0},
    {"CP 852",         T_128,   0,   "ibm-cp852",        0x80,   0, 0},
    {"CP 865",         T_128,   0,   "ibm-cp865",        0x80,   0, 0},
    {"CP 866",         T_128,   0,   "ibm-cp866",        0x80,   0, 0},

    {"Big 5",          T_94192, 0,   "big5.eten-0",      0x8000, 0, 0},

    /*
     * Several empty slots are reserved, to allow for non-ISO-2022 character
     * sets to be defined in ".enc" files.
     */
    EmptyFontenc,	/* G0, from ".enc" file */
    EmptyFontenc,	/* G1, from ".enc" file */
    EmptyFontenc,	/* G2, from ".enc" file */
    EmptyFontenc,	/* G3, from ".enc" file */
    EmptyFontenc	/* end marker: addFontencCharset() never fills the last slot */
};
/* *INDENT-ON* */
132 
/*
 * Describes an encoding that is driven by its own set of callbacks rather
 * than by a fontenc mapping table.  getOtherCharset() copies the callbacks
 * into a CharsetRec of type T_OTHER.
 */
typedef struct _OtherCharset {
    /* name, matched with lcStrCmp() by findOtherCharset() */
    const char *name;
    /* called on a freshly zeroed OtherState; a zero return means failure */
    int (*init) (OtherStatePtr);
    /* stored as the charset's other_recode callback */
    unsigned int (*mapping) (unsigned int, OtherStatePtr);
    /* stored as the charset's other_reverse callback */
    unsigned int (*reverse) (unsigned int, OtherStatePtr);
    /* stored as the charset's other_stack callback */
    int (*stack) (unsigned, OtherStatePtr);
} OtherCharsetRec, *OtherCharsetPtr;
/* *INDENT-OFF* */

/*
 * Encodings handled by dedicated callbacks.  Columns are name, init,
 * mapping, reverse and stack; the all-zero entry ends the table, which is
 * what findOtherCharset() returns when nothing matches.
 */
static const OtherCharsetRec otherCharsets[] =
{
    {"GBK",        init_gbk,     mapping_gbk,     reverse_gbk,     stack_gbk},
    {"UTF-8",      init_utf8,    mapping_utf8,    reverse_utf8,    stack_utf8},
    {"SJIS",       init_sjis,    mapping_sjis,    reverse_sjis,    stack_sjis},
    {"BIG5-HKSCS", init_hkscs,   mapping_hkscs,   reverse_hkscs,   stack_hkscs},
    {"GB18030",    init_gb18030, mapping_gb18030, reverse_gb18030, stack_gb18030},
    {0, 0, 0, 0, 0}
};
/* *INDENT-ON* */
152 
/*
 * True for a non-NUL character which the name comparisons skip over:
 * whitespace, '-', '_' or '/'.
 *
 * The parameter is parenthesized so that any expression can be passed
 * safely.  It is still evaluated more than once, so the argument must not
 * have side effects.
 */
#define lcIgnore(c) \
	((c) && (isspace(UChar(c)) || (c) == '-' || (c) == '_' || (c) == '/'))
155 
/*
 * Compare two names, ignoring case as well as any characters accepted by
 * lcIgnore() (whitespace, '-', '_', '/') in either string.
 *
 * Returns 0 if the names are equivalent, 1 otherwise.  The result is only
 * an equal/different flag; it does not give an ordering.
 */
int
lcStrCmp(const char *s, const char *t)
{
    for (;;) {
	if (*s == '\0' && *t == '\0')
	    return 0;

	if (lcIgnore(*s)) {
	    s++;
	    continue;
	}
	if (lcIgnore(*t)) {
	    t++;
	    continue;
	}

	/* one string ended early, or the characters really differ */
	if (*s == '\0' || *t == '\0'
	    || tolower(UChar(*s)) != tolower(UChar(*t)))
	    return 1;

	s++;
	t++;
    }
}
176 
/*
 * Bounded variant of lcStrCmp(): the same comparison rules, but at most "n"
 * loop steps are taken.  A step which only skips an ignorable character in
 * one of the strings counts against "n" just like a step which compares a
 * pair of characters.
 *
 * Returns 0 if no difference was seen within the limit, 1 otherwise.
 */
static int
compare1(const char *s, const char *t, size_t n)
{
    size_t steps;

    for (steps = n; steps != 0 && (*s || *t); --steps) {
	if (lcIgnore(*s)) {
	    s++;
	} else if (lcIgnore(*t)) {
	    t++;
	} else if (*s && *t && tolower(UChar(*s)) == tolower(UChar(*t))) {
	    s++;
	    t++;
	} else {
	    return 1;
	}
    }
    return 0;
}
198 
199 static unsigned int
FontencCharsetRecode(unsigned int n,const CharsetRec * self)200 FontencCharsetRecode(unsigned int n, const CharsetRec * self)
201 {
202     const FontencCharsetRec *fc = (const FontencCharsetRec *) (self->data);
203     unsigned result;
204 
205     result = MapCodeValue(n + fc->shift, fc->mapping);
206 
207     TRACE(("FontencCharsetRecode %#x ->%#x%s\n",
208 	   n,
209 	   result,
210 	   (n != result) ? " map" : ""));
211 
212     return result;
213 }
214 
215 static int
FontencCharsetReverse(unsigned int i,const CharsetRec * self)216 FontencCharsetReverse(unsigned int i, const CharsetRec * self)
217 {
218     const FontencCharsetRec *fc = (const FontencCharsetRec *) (self->data);
219     unsigned n;
220     int result = -1;
221 
222     n = fc->reverse->reverse(i, fc->reverse->data);
223     if (n != 0 && n >= fc->shift) {
224 	n -= fc->shift;
225 
226 #define IS_GL(n) ((n) >= 0x20 && (n) < 0x80)
227 	switch (self->type) {
228 	case T_94:
229 	case T_96:
230 	    if (IS_GL(n))
231 		result = (int) n;
232 	    break;
233 	case T_128:
234 	    if (n < 0x80)
235 		result = (int) n;
236 	    break;
237 	case T_9494:
238 	case T_9696:
239 	    if (IS_GL(n >> 8) && IS_GL(n & 0xFF))
240 		result = (int) n;
241 	    break;
242 	case T_94192:
243 	    if (IS_GL(n >> 8) && IS_GL(n & 0x7F))
244 		result = (int) n;
245 	    break;
246 	default:
247 	    abort();
248 	    /* NOTREACHED */
249 	}
250 #undef IS_GL
251     }
252 
253     TRACE(("FontencCharsetReverse %#x ->%#x%s\n",
254 	   i,
255 	   result,
256 	   ((int) i != result) ? " map" : ""));
257 
258     return result;
259 }
260 
261 static CharsetPtr cachedCharsets = NULL;
262 
263 static CharsetPtr
getCachedCharset(unsigned final,int type,const char * name)264 getCachedCharset(unsigned final, int type, const char *name)
265 {
266     CharsetPtr c;
267     for (c = cachedCharsets; c; c = c->next) {
268 	if (((c->type == type && c->final == final) ||
269 	     (name && !lcStrCmp(c->name, name))) &&
270 	    (c->type != T_FAILED))
271 	    return c;
272     }
273     return NULL;
274 }
275 
276 static void
cacheCharset(CharsetPtr c)277 cacheCharset(CharsetPtr c)
278 {
279     c->next = cachedCharsets;
280     cachedCharsets = c;
281     VERBOSE(2, ("cachedCharset '%s'\n", c->name));
282 }
283 
284 #ifdef USE_ICONV
285 static US_SIZE
cpSize(FontencCharsetPtr fc)286 cpSize(FontencCharsetPtr fc)
287 {
288     US_SIZE result = usANY;
289 
290     switch (fc->type) {
291     case T_94:
292     case T_96:
293     case T_128:
294 	result = us8BIT;
295 	break;
296     case T_9494:
297     case T_9696:
298     case T_94192:
299 	result = us16BIT;
300 	break;
301     }
302     return result;
303 }
304 #endif
305 
306 static int
addFontencCharset(const char * name,FontEncPtr f)307 addFontencCharset(const char *name, FontEncPtr f)
308 {
309     FontencCharsetPtr fc = fontencCharsets;
310     FontencCharsetPtr limit = fc + SizeOf(fontencCharsets);
311     int result = 0;
312     int c_size = typeOfFontenc(f);
313     int c_type;
314 
315     if (c_size <= 94) {
316 	c_type = T_94;
317     } else if (c_size <= 96) {
318 	c_type = T_96;
319     } else if (c_size <= 128) {
320 	c_type = T_128;
321     } else {
322 	VERBOSE(1, ("unexpected character-set size: %d\n", c_size));
323 	return 0;
324     }
325 
326     while (fc->name) {
327 	fc++;
328     }
329     if (fc < (limit - 1)) {
330 	result = 1;
331 	fc->name = strdup(name);
332 	fc->xlfd = strdup(name);
333 	fc->type = c_type;
334 	fc->shift = shiftOfFontenc(f);
335     }
336     return result;
337 }
338 
339 static CharsetPtr
getFontencCharset(unsigned final,int type,const char * name)340 getFontencCharset(unsigned final, int type, const char *name)
341 {
342     FontencCharsetPtr fc;
343     CharsetPtr c = NULL;
344     FontMapPtr mapping;
345     FontMapReversePtr reverse;
346     CharsetPtr result = NULL;
347 
348     TRACE(("getFontencCharset(final %#x, type %d, name %s)\n",
349 	   final, type, NonNull(name)));
350 
351     fc = fontencCharsets;
352     while (fc->name) {
353 	if (((fc->type == type && fc->final == final) ||
354 	     (name && !lcStrCmp(fc->name, name))) &&
355 	    (fc->type != T_FAILED))
356 	    break;
357 	fc++;
358     }
359 
360     if (!fc->name) {
361 	VERBOSE(2, ("...no match for '%s' in FontEnc charsets\n", NonNull(name)));
362     } else if ((c = TypeCalloc(CharsetRec)) == 0) {
363 	VERBOSE(2, ("malloc failed\n"));
364     } else if ((mapping = LookupMapping(fc->xlfd, cpSize(fc))) == NULL) {
365 	VERBOSE(2, ("...lookup mapping %s (%s) failed\n", NonNull(name), fc->xlfd));
366 	fc->type = T_FAILED;
367     } else if ((reverse = LookupReverse(mapping)) == NULL) {
368 	VERBOSE(2, ("...lookup reverse %s failed\n", NonNull(name)));
369 	fc->type = T_FAILED;
370     } else {
371 	fc->mapping = mapping;
372 	fc->reverse = reverse;
373 
374 	c->name = fc->name;
375 	c->type = fc->type;
376 	c->final = fc->final;
377 	c->recode = FontencCharsetRecode;
378 	c->reverse = FontencCharsetReverse;
379 	c->data = fc;
380 
381 	cacheCharset(c);
382 	result = c;
383     }
384 
385     if (result == NULL && c != NULL)
386 	free(c);
387 
388     return result;
389 }
390 
391 static const OtherCharsetRec *
findOtherCharset(const char * name)392 findOtherCharset(const char *name)
393 {
394     const OtherCharsetRec *fc;
395     fc = otherCharsets;
396     while (fc->name) {
397 	if (name && !lcStrCmp(fc->name, name))
398 	    break;
399 	fc++;
400     }
401     return fc;
402 }
403 
404 int
isOtherCharset(const char * name)405 isOtherCharset(const char *name)
406 {
407     const OtherCharsetRec *fc = findOtherCharset(name);
408     int result = (fc->name != 0);
409     if (!result) {
410 	result = (!lcStrCmp(name, "Big5") ||
411 		  !lcStrCmp(name, "JOHAB"));
412     }
413     return result;
414 }
415 
416 static CharsetPtr
getOtherCharset(const char * name)417 getOtherCharset(const char *name)
418 {
419     const OtherCharsetRec *fc;
420     CharsetPtr c = NULL;
421     OtherStatePtr s = NULL;
422     CharsetPtr result = NULL;
423 
424     fc = findOtherCharset(name);
425     if (!fc->name) {
426 	VERBOSE(2, ("...no match for '%s' in Other charsets\n", NonNull(name)));
427     } else if ((c = TypeCalloc(CharsetRec)) == NULL) {
428 	VERBOSE(2, ("malloc failed\n"));
429     } else if ((s = TypeCalloc(OtherState)) == NULL) {
430 	VERBOSE(2, ("malloc failed\n"));
431     } else {
432 	c->name = fc->name;
433 	c->type = T_OTHER;
434 	c->final = 0;
435 	c->data = fc;
436 	c->other_recode = fc->mapping;
437 	c->other_reverse = fc->reverse;
438 	c->other_stack = fc->stack;
439 	c->other_aux = s;
440 
441 	if (!fc->init(s)) {
442 	    VERBOSE(2, ("...initialization %s failed\n", NonNull(name)));
443 	    c->type = T_FAILED;
444 	} else {
445 	    cacheCharset(c);
446 	    result = c;
447 	}
448     }
449 
450     if (result == NULL) {
451 	if (c != NULL)
452 	    free(c);
453 	if (s != NULL)
454 	    free(s);
455     }
456 
457     return result;
458 }
459 
460 const CharsetRec *
getUnknownCharset(int type)461 getUnknownCharset(int type)
462 {
463     TRACE(("getUnknownCharset(%d)\n", type));
464     switch (type) {
465     case T_94:
466 	VERBOSE(2, ("using unknown 94-charset\n"));
467 	return &Unknown94Charset;
468     case T_96:
469 	VERBOSE(2, ("using unknown 96-charset\n"));
470 	return &Unknown96Charset;
471     case T_9494:
472 	VERBOSE(2, ("using unknown 9494-charset\n"));
473 	return &Unknown9494Charset;
474     case T_9696:
475 	VERBOSE(2, ("using unknown 9696-charset\n"));
476 	return &Unknown9696Charset;
477     default:
478 	VERBOSE(2, ("using unknown 94-charset\n"));
479 	return &Unknown94Charset;
480     }
481 }
482 
483 const CharsetRec *
getCharset(unsigned final,int type)484 getCharset(unsigned final, int type)
485 {
486     const CharsetRec *c;
487 
488     TRACE(("getCharset(final=%c, type=%d)\n", final, type));
489     c = getCachedCharset(final, type, NULL);
490     if (c)
491 	return c;
492 
493     c = getFontencCharset(final, type, NULL);
494     if (c)
495 	return c;
496 
497     return getUnknownCharset(type);
498 }
499 
500 const CharsetRec *
getCharsetByName(const char * name)501 getCharsetByName(const char *name)
502 {
503     const CharsetRec *c;
504     FontEncPtr f;
505     int type = T_94;
506 
507     VERBOSE(2, ("getCharsetByName(%s)\n", NonNull(name)));
508     TRACE(("getCharsetByName(%s)\n", NonNull(name)));
509 
510     if (name == NULL)
511 	return getUnknownCharset(type);
512 
513     c = getCachedCharset(0, 0, name);
514     if (c)
515 	return c;
516 
517     c = getFontencCharset(0, 0, name);
518     if (c)
519 	return c;
520 
521     c = getOtherCharset(name);
522     if (c)
523 	return c;
524 
525     /*
526      * If we did not find the name in a table, look for a ".enc" * file.
527      */
528     if ((f = lookupOneFontenc(name)) != 0) {
529 	if (addFontencCharset(name, f)) {
530 	    c = getFontencCharset(0, 0, name);
531 	    if (c)
532 		return c;
533 	}
534     }
535     return getUnknownCharset(type);
536 }
/* *INDENT-OFF* */
/*
 * Locale encodings known by name.  Columns, in order: name, gl, gr, g0, g1,
 * g2, g3, other.  The gl/gr values are the G-set numbers reported as
 * "GL -> Gn, GR -> Gn" by reportCharsets(); g0..g3 and "other" are charset
 * names which getLocaleState() resolves with getCharsetByName().  Entries
 * with a non-null "other" are listed by reportCharsets() as non-ISO-2022
 * encodings.  The all-zero entry ends the table.
 */
static const LocaleCharsetRec localeCharsets[] =
{
    {"C",          0, 2, "ASCII", NULL,         "ISO 8859-1",    NULL,         NULL},
    {"POSIX",      0, 2, "ASCII", NULL,         "ISO 8859-1",    NULL,         NULL},
    {"US-ASCII",   0, 2, "ASCII", NULL,         "ISO 8859-1",    NULL,         NULL},

    {"ISO8859-1",  0, 2, "ASCII", NULL,         "ISO 8859-1",    NULL,         NULL},
    {"ISO8859-2",  0, 2, "ASCII", NULL,         "ISO 8859-2",    NULL,         NULL},
    {"ISO8859-3",  0, 2, "ASCII", NULL,         "ISO 8859-3",    NULL,         NULL},
    {"ISO8859-4",  0, 2, "ASCII", NULL,         "ISO 8859-4",    NULL,         NULL},
    {"ISO8859-5",  0, 2, "ASCII", NULL,         "ISO 8859-5",    NULL,         NULL},
    {"ISO8859-6",  0, 2, "ASCII", NULL,         "ISO 8859-6",    NULL,         NULL},
    {"ISO8859-7",  0, 2, "ASCII", NULL,         "ISO 8859-7",    NULL,         NULL},
    {"ISO8859-8",  0, 2, "ASCII", NULL,         "ISO 8859-8",    NULL,         NULL},
    {"ISO8859-9",  0, 2, "ASCII", NULL,         "ISO 8859-9",    NULL,         NULL},
    {"ISO8859-10", 0, 2, "ASCII", NULL,         "ISO 8859-10",   NULL,         NULL},
    {"ISO8859-11", 0, 2, "ASCII", NULL,         "ISO 8859-11",   NULL,         NULL},
    {"TIS620",     0, 2, "ASCII", NULL,         "ISO 8859-11",   NULL,         NULL},
    {"ISO8859-13", 0, 2, "ASCII", NULL,         "ISO 8859-13",   NULL,         NULL},
    {"ISO8859-14", 0, 2, "ASCII", NULL,         "ISO 8859-14",   NULL,         NULL},
    {"ISO8859-15", 0, 2, "ASCII", NULL,         "ISO 8859-15",   NULL,         NULL},
    {"ISO8859-16", 0, 2, "ASCII", NULL,         "ISO 8859-16",   NULL,         NULL},

    {"KOI8-E",     0, 2, "ASCII", NULL,         "KOI8-E",        NULL,         NULL},
    {"KOI8-R",     0, 2, "ASCII", NULL,         "KOI8-R",        NULL,         NULL},
    {"KOI8-U",     0, 2, "ASCII", NULL,         "KOI8-U",        NULL,         NULL},
    {"KOI8-RU",    0, 2, "ASCII", NULL,         "KOI8-RU",       NULL,         NULL},
    {"CP1250",     0, 2, "ASCII", NULL,         "CP 1250",       NULL,         NULL},
    {"CP1251",     0, 2, "ASCII", NULL,         "CP 1251",       NULL,         NULL},
    {"CP1252",     0, 2, "ASCII", NULL,         "CP 1252",       NULL,         NULL},
    {"CP1255",     0, 2, "ASCII", NULL,         "CP 1255",       NULL,         NULL},
    {"CP437",      0, 2, "ASCII", NULL,         "CP 437",        NULL,         NULL},
    {"CP850",      0, 2, "ASCII", NULL,         "CP 850",        NULL,         NULL},
    {"CP852",      0, 2, "ASCII", NULL,         "CP 852",        NULL,         NULL},
    {"CP865",      0, 2, "ASCII", NULL,         "CP 865",        NULL,         NULL},
    {"CP866",      0, 2, "ASCII", NULL,         "CP 866",        NULL,         NULL},
    {"TCVN",       0, 2, "ASCII", NULL,         "TCVN",          NULL,         NULL},

    {"GB2312",     0, 1, "ASCII", "GB 2312",    NULL,            NULL,         NULL},
    {"eucJP",      0, 1, "ASCII", "JIS X 0208", "JIS X 0201:GR", "JIS X 0212", NULL},
    {"eucKR",      0, 1, "ASCII", "KSC 5601",   NULL,            NULL,         NULL},
    {"eucCN",      0, 1, "ASCII", "GB 2312",    NULL,            NULL,         NULL},
    {"eucTW",      0, 1, "ASCII", "CNS11643-1", "CNS11643-2",    "CNS11643-3", NULL},
    {"Big5",       0, 1, "ASCII", "Big 5",      NULL,            NULL,         NULL},

    {"gbk",        0, 1, NULL,    NULL,         NULL,            NULL,         "GBK"},
    {"UTF-8",      0, 1, NULL,    NULL,         NULL,            NULL,         "UTF-8"},
    {"SJIS",       0, 1, NULL,    NULL,         NULL,            NULL,         "SJIS"},
    {"Big5-HKSCS", 0, 1, NULL,    NULL,         NULL,            NULL,         "BIG5-HKSCS"},
    {"gb18030",    0, 1, NULL,    NULL,         NULL,            NULL,         "GB18030"},

    {0,            0, 0, 0,       0,            0,               0,            0}
};
/* *INDENT-ON* */
592 
593 void
reportCharsets(void)594 reportCharsets(void)
595 {
596     const LocaleCharsetRec *p;
597     FontencCharsetPtr q;
598     printf("Known locale encodings:\n\n");
599     for (p = localeCharsets; p->name; p++) {
600 	if (p->other) {
601 	    printf("  %s (non-ISO-2022 encoding)\n", p->other);
602 	    continue;
603 	}
604 	printf("  %s: GL -> G%d, GR -> G%d", p->name, p->gl, p->gr);
605 	if (p->g0)
606 	    printf(", G0: %s", p->g0);
607 	if (p->g1)
608 	    printf(", G1: %s", p->g1);
609 	if (p->g2)
610 	    printf(", G2: %s", p->g2);
611 	if (p->g3)
612 	    printf(", G3: %s", p->g3);
613 	printf("\n");
614     }
615 
616     printf("\n\nKnown charsets (not all may be available):\n\n");
617     for (q = fontencCharsets; q->name; q++) {
618 	const char *csize = "";
619 
620 	printf("  %s", q->name);
621 	switch (q->type) {
622 	case T_94:
623 	    csize = "94 codes";
624 	    break;
625 	case T_96:
626 	    csize = "96 codes";
627 	    break;
628 	case T_128:
629 	    csize = "128 codes";
630 	    break;
631 	case T_9494:
632 	    csize = "94x94 codes";
633 	    break;
634 	case T_9696:
635 	    csize = "94x96 codes";
636 	    break;
637 	case T_94192:
638 	    csize = "94x192 codes";
639 	    break;
640 	}
641 	if (q->final) {
642 	    printf(" (ISO 2022%s%s)", *csize ? ", " : "", csize);
643 	} else if (*csize) {
644 	    printf(" (%s)", csize);
645 	}
646 	printf("\n");
647     }
648 }
649 
650 #ifdef USE_ICONV
651 static LocaleCharsetRec fakeLocaleCharset;
652 
653 static const LocaleCharsetRec *
findLocaleByCharset(const char * charset)654 findLocaleByCharset(const char *charset)
655 {
656     const LocaleCharsetRec *lc;
657     const LocaleCharsetRec *result = 0;
658 
659     for (lc = localeCharsets; lc->name != 0; ++lc) {
660 	if (lc->g1 == 0 && lc->g2 == 0)
661 	    continue;
662 	if ((lc->g3 != 0 && !lcStrCmp(charset, lc->g3))
663 	    || (lc->g2 != 0 && !lcStrCmp(charset, lc->g2))
664 	    || (lc->g1 != 0 && !lcStrCmp(charset, lc->g1))) {
665 	    result = lc;
666 	    break;
667 	}
668     }
669     TRACE(("findLocaleByCharset(%s) ->%s\n",
670 	   charset, result ? result->name : "?"));
671     return result;
672 }
673 
674 static const LocaleCharsetRec *
closestLocaleCharset(FontEncPtr enc)675 closestLocaleCharset(FontEncPtr enc)
676 {
677     const LocaleCharsetRec *result = 0;
678 
679     if (enc != 0) {
680 	const FontencCharsetRec *fc = getFontencByName(enc->name);
681 	if (fc != 0) {
682 	    result = findLocaleByCharset(fc->name);
683 	} else {
684 	    result = findLocaleByCharset(enc->name);
685 	}
686     }
687     TRACE(("closestLocaleCharset(%s) ->%s\n",
688 	   enc ? enc->name : "?",
689 	   result ? result->name : "?"));
690     return result;
691 }
692 
693 static int
canFakeLocaleCharset(FontEncPtr enc)694 canFakeLocaleCharset(FontEncPtr enc)
695 {
696     int result = 0;
697     if (enc != 0
698 	&& enc->size <= 256
699 	&& enc->row_size == 0) {
700 	result = 1;
701     }
702     return result;
703 }
704 #endif
705 
706 static const LocaleCharsetRec *
findLocaleCharset(const char * charset)707 findLocaleCharset(const char *charset)
708 {
709     const LocaleCharsetRec *p;
710     const LocaleCharsetRec *result = 0;
711 
712     for (p = localeCharsets; p->name; p++) {
713 	if (lcStrCmp(p->name, charset) == 0) {
714 	    result = p;
715 	    break;
716 	}
717     }
718 #ifdef USE_ICONV
719     /*
720      * The table is useful, but not complete.
721      * If we can find a mapping for an 8-bit encoding, fake a table entry.
722      */
723     if (result == 0) {
724 	FontEncPtr enc = luitGetFontEnc(charset,
725 					(UM_MODE) ((int) umICONV
726 						   | (int) umFONTENC
727 						   | (int) umBUILTIN));
728 	if ((result = closestLocaleCharset(enc)) != 0) {
729 	    TRACE(("...matched a LocaleCharset record for %s\n", NonNull(charset)));
730 	} else if (canFakeLocaleCharset(enc)) {
731 	    LocaleCharsetRec *temp = &fakeLocaleCharset;
732 
733 	    TRACE(("...fake a LocaleCharset record for %s\n", NonNull(charset)));
734 
735 	    memset(temp, 0, sizeof(*temp));
736 	    temp->name = strmalloc(charset);
737 	    temp->gr = 2;
738 	    temp->g0 = "ASCII";
739 	    temp->g2 = temp->name;
740 	    result = temp;
741 
742 	} else {
743 	    TRACE(("...do not know how to fake LocaleCharset for %s\n",
744 		   NonNull(charset)));
745 	}
746 	luitFreeFontEnc(enc);
747     }
748 #endif
749     return result;
750 }
751 
752 static const LocaleCharsetRec *
matchLocaleCharset(const char * charset)753 matchLocaleCharset(const char *charset)
754 {
755     static const struct {
756 	const char *source;
757 	const char *target;
758 	size_t source_len;
759 	size_t target_len;
760     } prefixes[] = {
761 #define DATA(source, target) { source, target, sizeof(source)-1, sizeof(target)-1 }
762 	DATA("ISO-", "ISO "),
763 	    DATA("DEC ", "DEC-"),
764 	    DATA("IBM-CP", "CP "),	/* "ibm-cp866" -> "cp 866" (iconv) */
765 	    DATA("IBM", "CP "),
766 	    DATA("MICROSOFT-CP", "CP "),
767 	    DATA("MICROSOFT", "CP "),
768 	    DATA("CP-", "CP "),
769 	    DATA("ANSI", "CP "),	/* e.g., Solaris ANSI1251 */
770 #undef DATA
771     };
772 
773     const LocaleCharsetRec *p = 0;
774 
775     TRACE(("matchLocaleCharset(%s)\n", NonNull(charset)));
776     if (!IsEmpty(charset)) {
777 	char *euro;
778 	char source[MAX_KEYWORD_LENGTH + 1];
779 
780 	sprintf(source, "%.*s", MAX_KEYWORD_LENGTH, charset);
781 	if ((euro = strrchr(source, '@')) != 0 && !strcmp(euro, "@euro")) {
782 	    Warning("the euro character may not be supported\n");
783 	    *euro = 0;
784 	}
785 
786 	p = findLocaleCharset(source);
787 
788 	if (p == 0) {
789 	    size_t have = strlen(source);
790 	    size_t n;
791 	    char target[MAX_KEYWORD_LENGTH + 80];
792 
793 	    for (n = 0; n < SizeOf(prefixes); ++n) {
794 		if (have > prefixes[n].source_len
795 		    && have < MAX_KEYWORD_LENGTH
796 		    && !compare1(source,
797 				 prefixes[n].source,
798 				 prefixes[n].source_len)) {
799 		    strcpy(target, prefixes[n].target);
800 		    strcpy(target + prefixes[n].target_len,
801 			   source + prefixes[n].source_len);
802 		    if ((p = findLocaleCharset(target)) != 0) {
803 			break;
804 		    }
805 		}
806 	    }
807 	}
808     }
809     return p;
810 }
811 
812 int
getLocaleState(const char * locale,const char * charset,int * gl_return,int * gr_return,const CharsetRec ** g0_return,const CharsetRec ** g1_return,const CharsetRec ** g2_return,const CharsetRec ** g3_return,const CharsetRec ** other_return)813 getLocaleState(const char *locale,
814 	       const char *charset,
815 	       int *gl_return, int *gr_return,
816 	       const CharsetRec * *g0_return,
817 	       const CharsetRec * *g1_return,
818 	       const CharsetRec * *g2_return,
819 	       const CharsetRec * *g3_return,
820 	       const CharsetRec * *other_return)
821 {
822     int result = 0;
823     char *resolved = 0;
824     const LocaleCharsetRec *p;
825 
826     TRACE(("getLocaleState(locale=%s, charset=%s)\n", locale, NonNull(charset)));
827     if (IsEmpty(charset)) {
828 	if (ignore_locale) {
829 	    charset = locale;
830 	} else {
831 	    resolved = resolveLocale(locale);
832 	    if (!resolved)
833 		return -1;
834 	    if ((charset = strrchr(resolved, '.')) != 0) {
835 		charset++;
836 	    } else {
837 		charset = resolved;
838 	    }
839 	}
840     }
841 
842     if ((p = matchLocaleCharset(charset)) != 0) {
843 	*gl_return = p->gl;
844 	*gr_return = p->gr;
845 	*g0_return = getCharsetByName(p->g0);
846 	*g1_return = getCharsetByName(p->g1);
847 	*g2_return = getCharsetByName(p->g2);
848 	*g3_return = getCharsetByName(p->g3);
849 	if (p->other)
850 	    *other_return = getCharsetByName(p->other);
851 	else
852 	    *other_return = NULL;
853     } else {
854 	result = -1;
855     }
856 
857     if (resolved != 0)
858 	free(resolved);
859 
860     TRACE(("...getLocaleState ->%d\n", result));
861     return result;
862 }
863 
864 #ifdef USE_ICONV
865 /*
866  * Given either a charset name, or the xlfd field (which is more likely to
867  * match iconv), return a pointer to the entry in fontencCharsets which matches.
868  */
869 const FontencCharsetRec *
getFontencByName(const char * encoding_name)870 getFontencByName(const char *encoding_name)
871 {
872     const FontencCharsetRec *result = 0;
873     const FontencCharsetRec *fc;
874     char *gr_special;
875 
876     for (fc = fontencCharsets; fc->name != 0; ++fc) {
877 	if (!lcStrCmp(encoding_name, fc->name)
878 	    || (strstr(fc->name, ":GL") == 0
879 		&& !lcStrCmp(encoding_name, fc->xlfd))) {
880 	    result = fc;
881 	    break;
882 	}
883     }
884 
885     /*
886      * Luit treats ":GR" specially in its charset tables, essentially to
887      * distinguish the case it uses for JIS X 201 from other possibilities.
888      */
889     if (result == 0
890 	&& strchr(encoding_name, ':') == 0
891 	&& (gr_special = malloc(strlen(encoding_name) + 4)) != 0) {
892 	sprintf(gr_special, "%s:GR", encoding_name);
893 	result = getFontencByName(gr_special);
894 	free(gr_special);
895     }
896     TRACE(("getFontencByName(%s) ->%s\n",
897 	   encoding_name,
898 	   result ? result->name : "?"));
899     return result;
900 }
901 
902 /*
903  * Check (for EUC-JP specifically, but generally...) for a charset which
904  * is part of a composite charset using G2/G3 via single-shifts.
905  */
906 const char *
getCompositeCharset(const char * encoding_name)907 getCompositeCharset(const char *encoding_name)
908 {
909     const char *result = 0;
910     const FontencCharsetRec *fc;
911     const LocaleCharsetRec *lc;
912 
913     if ((fc = getFontencByName(encoding_name)) != 0) {
914 	if ((lc = findLocaleByCharset(fc->name)) != 0) {
915 	    result = lc->name;
916 	    TRACE(("getCompositeCharset(%s) ->%s\n",
917 		   NonNull(encoding_name), NonNull(result)));
918 	}
919     }
920     return result;
921 }
922 
923 static const char *
selectPart(const LocaleCharsetRec * data,unsigned g)924 selectPart(const LocaleCharsetRec * data, unsigned g)
925 {
926     const char *result = 0;
927     switch (g) {
928     case 0:
929 	result = data->g0;
930 	break;
931     case 1:
932 	result = data->g1;
933 	break;
934     case 2:
935 	result = data->g2;
936 	break;
937     case 3:
938 	result = data->g3;
939 	break;
940     }
941     return result;
942 }
943 
944 /*
945  * Given a composite name returned by getCompositeCharset, return a pointer to
946  * the data which describes the encoding used for a given shift.
947  */
948 const FontencCharsetRec *
getCompositePart(const char * composite_name,unsigned g)949 getCompositePart(const char *composite_name, unsigned g)
950 {
951     const FontencCharsetRec *result = 0;
952     const LocaleCharsetRec *lc;
953     const char *part_name;
954 
955     for (lc = localeCharsets; lc->name; ++lc) {
956 	if (!lcStrCmp(composite_name, lc->name)) {
957 	    if ((part_name = selectPart(lc, g)) != 0) {
958 		const FontencCharsetRec *fc;
959 		for (fc = fontencCharsets; fc->name != 0; ++fc) {
960 		    if (!lcStrCmp(part_name, fc->name)) {
961 			result = fc;
962 			break;
963 		    }
964 		}
965 	    }
966 	    break;
967 	}
968     }
969     return result;
970 }
971 #endif
972 
973 #ifdef NO_LEAKS
974 static int
isUnknownCharsetPtr(CharsetPtr p)975 isUnknownCharsetPtr(CharsetPtr p)
976 {
977     return (p == &Unknown94Charset
978 	    || p == &Unknown96Charset
979 	    || p == &Unknown9494Charset
980 	    || p == &Unknown9696Charset);
981 }
982 
983 static void
destroyFontencCharsetPtr(FontencCharsetPtr p)984 destroyFontencCharsetPtr(FontencCharsetPtr p)
985 {
986 #ifdef USE_ICONV
987     if (p->reverse) {
988 	luitDestroyReverse(p->reverse);
989     }
990 #else
991     p->mapping = 0;
992 
993     /*
994      * This should, but does not work -
995      *     FontMapReverseFree(p->reverse)
996      *
997      * The iteration for map[] is based on reading the source of
998      * FontMapReverse().
999      */
1000     if (p->reverse) {
1001 	int n;
1002 	unsigned **map = p->reverse->data;
1003 	for (n = 0; n < 256; ++n) {
1004 	    if (map[n])
1005 		free(map[n]);
1006 	}
1007 	free(p->reverse->data);
1008 	free(p->reverse);
1009 	p->reverse = 0;
1010     }
1011 #endif
1012 }
1013 
1014 static void
destroyCharset(CharsetPtr p)1015 destroyCharset(CharsetPtr p)
1016 {
1017     if (!isUnknownCharsetPtr(p)) {
1018 	if (p->type == T_OTHER) {
1019 	    free(p->other_aux);
1020 	} else {
1021 	    destroyFontencCharsetPtr((FontencCharsetPtr) p->data);
1022 	}
1023 	free(p);
1024     }
1025 }
1026 
1027 void
charset_leaks(void)1028 charset_leaks(void)
1029 {
1030     while (cachedCharsets != 0) {
1031 	CharsetPtr next = cachedCharsets->next;
1032 	destroyCharset(cachedCharsets);
1033 	cachedCharsets = next;
1034     }
1035 #ifdef USE_ICONV
1036     if (fakeLocaleCharset.name != 0) {
1037 	free((void *) fakeLocaleCharset.name);
1038 	fakeLocaleCharset.name = 0;
1039     }
1040 #endif
1041 }
1042 #endif /* NO_LEAKS */
1043