1 /*-------------------------------------------------------------------------
2  *
3  * encnames.c
4  *	  Encoding names and routines for working with them.
5  *
6  * Portions Copyright (c) 2001-2020, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  *	  src/common/encnames.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "c.h"
14 
15 #include <ctype.h>
16 #include <unistd.h>
17 
18 #include "mb/pg_wchar.h"
19 
20 
21 /* ----------
22  * All encoding names, sorted:		 *** A L P H A B E T I C ***
23  *
24  * All names must be without irrelevant chars, search routines use
25  * isalnum() chars only. It means ISO-8859-1, iso_8859-1 and Iso8859_1
26  * are always converted to 'iso88591'. All must be lower case.
27  *
 * The table doesn't contain 'cs' aliases (like csISOLatin1). Are they needed?
29  *
30  * Karel Zak, Aug 2001
31  * ----------
32  */
33 typedef struct pg_encname
34 {
35 	const char *name;
36 	pg_enc		encoding;
37 } pg_encname;
38 
39 static const pg_encname pg_encname_tbl[] =
40 {
41 	{
42 		"abc", PG_WIN1258
43 	},							/* alias for WIN1258 */
44 	{
45 		"alt", PG_WIN866
46 	},							/* IBM866 */
47 	{
48 		"big5", PG_BIG5
49 	},							/* Big5; Chinese for Taiwan multibyte set */
50 	{
51 		"euccn", PG_EUC_CN
52 	},							/* EUC-CN; Extended Unix Code for simplified
53 								 * Chinese */
54 	{
55 		"eucjis2004", PG_EUC_JIS_2004
56 	},							/* EUC-JIS-2004; Extended UNIX Code fixed
57 								 * Width for Japanese, standard JIS X 0213 */
58 	{
59 		"eucjp", PG_EUC_JP
60 	},							/* EUC-JP; Extended UNIX Code fixed Width for
61 								 * Japanese, standard OSF */
62 	{
63 		"euckr", PG_EUC_KR
64 	},							/* EUC-KR; Extended Unix Code for Korean , KS
65 								 * X 1001 standard */
66 	{
67 		"euctw", PG_EUC_TW
68 	},							/* EUC-TW; Extended Unix Code for
69 								 *
70 								 * traditional Chinese */
71 	{
72 		"gb18030", PG_GB18030
73 	},							/* GB18030;GB18030 */
74 	{
75 		"gbk", PG_GBK
76 	},							/* GBK; Chinese Windows CodePage 936
77 								 * simplified Chinese */
78 	{
79 		"iso88591", PG_LATIN1
80 	},							/* ISO-8859-1; RFC1345,KXS2 */
81 	{
82 		"iso885910", PG_LATIN6
83 	},							/* ISO-8859-10; RFC1345,KXS2 */
84 	{
85 		"iso885913", PG_LATIN7
86 	},							/* ISO-8859-13; RFC1345,KXS2 */
87 	{
88 		"iso885914", PG_LATIN8
89 	},							/* ISO-8859-14; RFC1345,KXS2 */
90 	{
91 		"iso885915", PG_LATIN9
92 	},							/* ISO-8859-15; RFC1345,KXS2 */
93 	{
94 		"iso885916", PG_LATIN10
95 	},							/* ISO-8859-16; RFC1345,KXS2 */
96 	{
97 		"iso88592", PG_LATIN2
98 	},							/* ISO-8859-2; RFC1345,KXS2 */
99 	{
100 		"iso88593", PG_LATIN3
101 	},							/* ISO-8859-3; RFC1345,KXS2 */
102 	{
103 		"iso88594", PG_LATIN4
104 	},							/* ISO-8859-4; RFC1345,KXS2 */
105 	{
106 		"iso88595", PG_ISO_8859_5
107 	},							/* ISO-8859-5; RFC1345,KXS2 */
108 	{
109 		"iso88596", PG_ISO_8859_6
110 	},							/* ISO-8859-6; RFC1345,KXS2 */
111 	{
112 		"iso88597", PG_ISO_8859_7
113 	},							/* ISO-8859-7; RFC1345,KXS2 */
114 	{
115 		"iso88598", PG_ISO_8859_8
116 	},							/* ISO-8859-8; RFC1345,KXS2 */
117 	{
118 		"iso88599", PG_LATIN5
119 	},							/* ISO-8859-9; RFC1345,KXS2 */
120 	{
121 		"johab", PG_JOHAB
122 	},							/* JOHAB; Extended Unix Code for simplified
123 								 * Chinese */
124 	{
125 		"koi8", PG_KOI8R
126 	},							/* _dirty_ alias for KOI8-R (backward
127 								 * compatibility) */
128 	{
129 		"koi8r", PG_KOI8R
130 	},							/* KOI8-R; RFC1489 */
131 	{
132 		"koi8u", PG_KOI8U
133 	},							/* KOI8-U; RFC2319 */
134 	{
135 		"latin1", PG_LATIN1
136 	},							/* alias for ISO-8859-1 */
137 	{
138 		"latin10", PG_LATIN10
139 	},							/* alias for ISO-8859-16 */
140 	{
141 		"latin2", PG_LATIN2
142 	},							/* alias for ISO-8859-2 */
143 	{
144 		"latin3", PG_LATIN3
145 	},							/* alias for ISO-8859-3 */
146 	{
147 		"latin4", PG_LATIN4
148 	},							/* alias for ISO-8859-4 */
149 	{
150 		"latin5", PG_LATIN5
151 	},							/* alias for ISO-8859-9 */
152 	{
153 		"latin6", PG_LATIN6
154 	},							/* alias for ISO-8859-10 */
155 	{
156 		"latin7", PG_LATIN7
157 	},							/* alias for ISO-8859-13 */
158 	{
159 		"latin8", PG_LATIN8
160 	},							/* alias for ISO-8859-14 */
161 	{
162 		"latin9", PG_LATIN9
163 	},							/* alias for ISO-8859-15 */
164 	{
165 		"mskanji", PG_SJIS
166 	},							/* alias for Shift_JIS */
167 	{
168 		"muleinternal", PG_MULE_INTERNAL
169 	},
170 	{
171 		"shiftjis", PG_SJIS
172 	},							/* Shift_JIS; JIS X 0202-1991 */
173 
174 	{
175 		"shiftjis2004", PG_SHIFT_JIS_2004
176 	},							/* SHIFT-JIS-2004; Shift JIS for Japanese,
177 								 * standard JIS X 0213 */
178 	{
179 		"sjis", PG_SJIS
180 	},							/* alias for Shift_JIS */
181 	{
182 		"sqlascii", PG_SQL_ASCII
183 	},
184 	{
185 		"tcvn", PG_WIN1258
186 	},							/* alias for WIN1258 */
187 	{
188 		"tcvn5712", PG_WIN1258
189 	},							/* alias for WIN1258 */
190 	{
191 		"uhc", PG_UHC
192 	},							/* UHC; Korean Windows CodePage 949 */
193 	{
194 		"unicode", PG_UTF8
195 	},							/* alias for UTF8 */
196 	{
197 		"utf8", PG_UTF8
198 	},							/* alias for UTF8 */
199 	{
200 		"vscii", PG_WIN1258
201 	},							/* alias for WIN1258 */
202 	{
203 		"win", PG_WIN1251
204 	},							/* _dirty_ alias for windows-1251 (backward
205 								 * compatibility) */
206 	{
207 		"win1250", PG_WIN1250
208 	},							/* alias for Windows-1250 */
209 	{
210 		"win1251", PG_WIN1251
211 	},							/* alias for Windows-1251 */
212 	{
213 		"win1252", PG_WIN1252
214 	},							/* alias for Windows-1252 */
215 	{
216 		"win1253", PG_WIN1253
217 	},							/* alias for Windows-1253 */
218 	{
219 		"win1254", PG_WIN1254
220 	},							/* alias for Windows-1254 */
221 	{
222 		"win1255", PG_WIN1255
223 	},							/* alias for Windows-1255 */
224 	{
225 		"win1256", PG_WIN1256
226 	},							/* alias for Windows-1256 */
227 	{
228 		"win1257", PG_WIN1257
229 	},							/* alias for Windows-1257 */
230 	{
231 		"win1258", PG_WIN1258
232 	},							/* alias for Windows-1258 */
233 	{
234 		"win866", PG_WIN866
235 	},							/* IBM866 */
236 	{
237 		"win874", PG_WIN874
238 	},							/* alias for Windows-874 */
239 	{
240 		"win932", PG_SJIS
241 	},							/* alias for Shift_JIS */
242 	{
243 		"win936", PG_GBK
244 	},							/* alias for GBK */
245 	{
246 		"win949", PG_UHC
247 	},							/* alias for UHC */
248 	{
249 		"win950", PG_BIG5
250 	},							/* alias for BIG5 */
251 	{
252 		"windows1250", PG_WIN1250
253 	},							/* Windows-1251; Microsoft */
254 	{
255 		"windows1251", PG_WIN1251
256 	},							/* Windows-1251; Microsoft */
257 	{
258 		"windows1252", PG_WIN1252
259 	},							/* Windows-1252; Microsoft */
260 	{
261 		"windows1253", PG_WIN1253
262 	},							/* Windows-1253; Microsoft */
263 	{
264 		"windows1254", PG_WIN1254
265 	},							/* Windows-1254; Microsoft */
266 	{
267 		"windows1255", PG_WIN1255
268 	},							/* Windows-1255; Microsoft */
269 	{
270 		"windows1256", PG_WIN1256
271 	},							/* Windows-1256; Microsoft */
272 	{
273 		"windows1257", PG_WIN1257
274 	},							/* Windows-1257; Microsoft */
275 	{
276 		"windows1258", PG_WIN1258
277 	},							/* Windows-1258; Microsoft */
278 	{
279 		"windows866", PG_WIN866
280 	},							/* IBM866 */
281 	{
282 		"windows874", PG_WIN874
283 	},							/* Windows-874; Microsoft */
284 	{
285 		"windows932", PG_SJIS
286 	},							/* alias for Shift_JIS */
287 	{
288 		"windows936", PG_GBK
289 	},							/* alias for GBK */
290 	{
291 		"windows949", PG_UHC
292 	},							/* alias for UHC */
293 	{
294 		"windows950", PG_BIG5
295 	}							/* alias for BIG5 */
296 };
297 
298 /* ----------
299  * These are "official" encoding names.
300  * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
301  * ----------
302  */
303 #ifndef WIN32
304 #define DEF_ENC2NAME(name, codepage) { #name, PG_##name }
305 #else
306 #define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
307 #endif
308 
309 const pg_enc2name pg_enc2name_tbl[] =
310 {
311 	DEF_ENC2NAME(SQL_ASCII, 0),
312 	DEF_ENC2NAME(EUC_JP, 20932),
313 	DEF_ENC2NAME(EUC_CN, 20936),
314 	DEF_ENC2NAME(EUC_KR, 51949),
315 	DEF_ENC2NAME(EUC_TW, 0),
316 	DEF_ENC2NAME(EUC_JIS_2004, 20932),
317 	DEF_ENC2NAME(UTF8, 65001),
318 	DEF_ENC2NAME(MULE_INTERNAL, 0),
319 	DEF_ENC2NAME(LATIN1, 28591),
320 	DEF_ENC2NAME(LATIN2, 28592),
321 	DEF_ENC2NAME(LATIN3, 28593),
322 	DEF_ENC2NAME(LATIN4, 28594),
323 	DEF_ENC2NAME(LATIN5, 28599),
324 	DEF_ENC2NAME(LATIN6, 0),
325 	DEF_ENC2NAME(LATIN7, 0),
326 	DEF_ENC2NAME(LATIN8, 0),
327 	DEF_ENC2NAME(LATIN9, 28605),
328 	DEF_ENC2NAME(LATIN10, 0),
329 	DEF_ENC2NAME(WIN1256, 1256),
330 	DEF_ENC2NAME(WIN1258, 1258),
331 	DEF_ENC2NAME(WIN866, 866),
332 	DEF_ENC2NAME(WIN874, 874),
333 	DEF_ENC2NAME(KOI8R, 20866),
334 	DEF_ENC2NAME(WIN1251, 1251),
335 	DEF_ENC2NAME(WIN1252, 1252),
336 	DEF_ENC2NAME(ISO_8859_5, 28595),
337 	DEF_ENC2NAME(ISO_8859_6, 28596),
338 	DEF_ENC2NAME(ISO_8859_7, 28597),
339 	DEF_ENC2NAME(ISO_8859_8, 28598),
340 	DEF_ENC2NAME(WIN1250, 1250),
341 	DEF_ENC2NAME(WIN1253, 1253),
342 	DEF_ENC2NAME(WIN1254, 1254),
343 	DEF_ENC2NAME(WIN1255, 1255),
344 	DEF_ENC2NAME(WIN1257, 1257),
345 	DEF_ENC2NAME(KOI8U, 21866),
346 	DEF_ENC2NAME(SJIS, 932),
347 	DEF_ENC2NAME(BIG5, 950),
348 	DEF_ENC2NAME(GBK, 936),
349 	DEF_ENC2NAME(UHC, 949),
350 	DEF_ENC2NAME(GB18030, 54936),
351 	DEF_ENC2NAME(JOHAB, 0),
352 	DEF_ENC2NAME(SHIFT_JIS_2004, 932)
353 };
354 
355 /* ----------
356  * These are encoding names for gettext.
357  *
358  * This covers all encodings except MULE_INTERNAL, which is alien to gettext.
359  * ----------
360  */
361 const pg_enc2gettext pg_enc2gettext_tbl[] =
362 {
363 	{PG_SQL_ASCII, "US-ASCII"},
364 	{PG_UTF8, "UTF-8"},
365 	{PG_LATIN1, "LATIN1"},
366 	{PG_LATIN2, "LATIN2"},
367 	{PG_LATIN3, "LATIN3"},
368 	{PG_LATIN4, "LATIN4"},
369 	{PG_ISO_8859_5, "ISO-8859-5"},
370 	{PG_ISO_8859_6, "ISO_8859-6"},
371 	{PG_ISO_8859_7, "ISO-8859-7"},
372 	{PG_ISO_8859_8, "ISO-8859-8"},
373 	{PG_LATIN5, "LATIN5"},
374 	{PG_LATIN6, "LATIN6"},
375 	{PG_LATIN7, "LATIN7"},
376 	{PG_LATIN8, "LATIN8"},
377 	{PG_LATIN9, "LATIN-9"},
378 	{PG_LATIN10, "LATIN10"},
379 	{PG_KOI8R, "KOI8-R"},
380 	{PG_KOI8U, "KOI8-U"},
381 	{PG_WIN1250, "CP1250"},
382 	{PG_WIN1251, "CP1251"},
383 	{PG_WIN1252, "CP1252"},
384 	{PG_WIN1253, "CP1253"},
385 	{PG_WIN1254, "CP1254"},
386 	{PG_WIN1255, "CP1255"},
387 	{PG_WIN1256, "CP1256"},
388 	{PG_WIN1257, "CP1257"},
389 	{PG_WIN1258, "CP1258"},
390 	{PG_WIN866, "CP866"},
391 	{PG_WIN874, "CP874"},
392 	{PG_EUC_CN, "EUC-CN"},
393 	{PG_EUC_JP, "EUC-JP"},
394 	{PG_EUC_KR, "EUC-KR"},
395 	{PG_EUC_TW, "EUC-TW"},
396 	{PG_EUC_JIS_2004, "EUC-JP"},
397 	{PG_SJIS, "SHIFT-JIS"},
398 	{PG_BIG5, "BIG5"},
399 	{PG_GBK, "GBK"},
400 	{PG_UHC, "UHC"},
401 	{PG_GB18030, "GB18030"},
402 	{PG_JOHAB, "JOHAB"},
403 	{PG_SHIFT_JIS_2004, "SHIFT_JISX0213"},
404 	{0, NULL}
405 };
406 
407 
408 /*
409  * Table of encoding names for ICU (currently covers backend encodings only)
410  *
411  * Reference: <https://ssl.icu-project.org/icu-bin/convexp>
412  *
413  * NULL entries are not supported by ICU, or their mapping is unclear.
414  */
415 static const char *const pg_enc2icu_tbl[] =
416 {
417 	NULL,						/* PG_SQL_ASCII */
418 	"EUC-JP",					/* PG_EUC_JP */
419 	"EUC-CN",					/* PG_EUC_CN */
420 	"EUC-KR",					/* PG_EUC_KR */
421 	"EUC-TW",					/* PG_EUC_TW */
422 	NULL,						/* PG_EUC_JIS_2004 */
423 	"UTF-8",					/* PG_UTF8 */
424 	NULL,						/* PG_MULE_INTERNAL */
425 	"ISO-8859-1",				/* PG_LATIN1 */
426 	"ISO-8859-2",				/* PG_LATIN2 */
427 	"ISO-8859-3",				/* PG_LATIN3 */
428 	"ISO-8859-4",				/* PG_LATIN4 */
429 	"ISO-8859-9",				/* PG_LATIN5 */
430 	"ISO-8859-10",				/* PG_LATIN6 */
431 	"ISO-8859-13",				/* PG_LATIN7 */
432 	"ISO-8859-14",				/* PG_LATIN8 */
433 	"ISO-8859-15",				/* PG_LATIN9 */
434 	NULL,						/* PG_LATIN10 */
435 	"CP1256",					/* PG_WIN1256 */
436 	"CP1258",					/* PG_WIN1258 */
437 	"CP866",					/* PG_WIN866 */
438 	NULL,						/* PG_WIN874 */
439 	"KOI8-R",					/* PG_KOI8R */
440 	"CP1251",					/* PG_WIN1251 */
441 	"CP1252",					/* PG_WIN1252 */
442 	"ISO-8859-5",				/* PG_ISO_8859_5 */
443 	"ISO-8859-6",				/* PG_ISO_8859_6 */
444 	"ISO-8859-7",				/* PG_ISO_8859_7 */
445 	"ISO-8859-8",				/* PG_ISO_8859_8 */
446 	"CP1250",					/* PG_WIN1250 */
447 	"CP1253",					/* PG_WIN1253 */
448 	"CP1254",					/* PG_WIN1254 */
449 	"CP1255",					/* PG_WIN1255 */
450 	"CP1257",					/* PG_WIN1257 */
451 	"KOI8-U",					/* PG_KOI8U */
452 };
453 
454 
455 /*
456  * Is this encoding supported by ICU?
457  */
458 bool
is_encoding_supported_by_icu(int encoding)459 is_encoding_supported_by_icu(int encoding)
460 {
461 	if (!PG_VALID_BE_ENCODING(encoding))
462 		return false;
463 	return (pg_enc2icu_tbl[encoding] != NULL);
464 }
465 
466 /*
467  * Returns ICU's name for encoding, or NULL if not supported
468  */
469 const char *
get_encoding_name_for_icu(int encoding)470 get_encoding_name_for_icu(int encoding)
471 {
472 	StaticAssertStmt(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1,
473 					 "pg_enc2icu_tbl incomplete");
474 
475 	if (!PG_VALID_BE_ENCODING(encoding))
476 		return NULL;
477 	return pg_enc2icu_tbl[encoding];
478 }
479 
480 
/* ----------
 * Encoding checks, for error returns -1 else encoding id
 * ----------
 */

/*
 * Resolve "name" with pg_char_to_encoding() and accept the result only if
 * it passes PG_VALID_FE_ENCODING.  Returns the encoding ID, or -1 if the
 * name is unknown or the encoding is rejected by that check.
 */
int
pg_valid_client_encoding(const char *name)
{
	int			enc = pg_char_to_encoding(name);

	if (enc < 0 || !PG_VALID_FE_ENCODING(enc))
		return -1;

	return enc;
}
498 
/*
 * Resolve "name" with pg_char_to_encoding() and accept the result only if
 * it passes PG_VALID_BE_ENCODING.  Returns the encoding ID, or -1 if the
 * name is unknown or the encoding is rejected by that check.
 */
int
pg_valid_server_encoding(const char *name)
{
	int			enc = pg_char_to_encoding(name);

	if (enc < 0 || !PG_VALID_BE_ENCODING(enc))
		return -1;

	return enc;
}
512 
/*
 * Function wrapper around the PG_VALID_BE_ENCODING() macro: returns the
 * macro's result for "encoding" (nonzero when the ID is accepted).
 */
int
pg_valid_server_encoding_id(int encoding)
{
	int			is_valid = PG_VALID_BE_ENCODING(encoding);

	return is_valid;
}
518 
/*
 * Normalize an encoding name into *newkey.
 *
 * Copies only the characters of "key" accepted by isalnum(), folding ASCII
 * 'A'..'Z' to lower case; everything else is dropped.  The result is
 * NUL-terminated and never longer than the input.
 *
 * The caller must supply a buffer of at least strlen(key) + 1 bytes.
 * Returns newkey.
 */
static char *
clean_encoding_name(const char *key, char *newkey)
{
	const char *src = key;
	char	   *dst = newkey;

	while (*src != '\0')
	{
		unsigned char ch = (unsigned char) *src++;

		/* skip punctuation, whitespace and other irrelevant characters */
		if (!isalnum(ch))
			continue;

		/* ASCII-only case folding, independent of the locale */
		if (ch >= 'A' && ch <= 'Z')
			ch += 'a' - 'A';

		*dst++ = (char) ch;
	}
	*dst = '\0';

	return newkey;
}
543 
544 /*
545  * Search encoding by encoding name
546  *
547  * Returns encoding ID, or -1 if not recognized
548  */
549 int
pg_char_to_encoding(const char * name)550 pg_char_to_encoding(const char *name)
551 {
552 	unsigned int nel = lengthof(pg_encname_tbl);
553 	const pg_encname *base = pg_encname_tbl,
554 			   *last = base + nel - 1,
555 			   *position;
556 	int			result;
557 	char		buff[NAMEDATALEN],
558 			   *key;
559 
560 	if (name == NULL || *name == '\0')
561 		return -1;
562 
563 	if (strlen(name) >= NAMEDATALEN)
564 		return -1;				/* it's certainly not in the table */
565 
566 	key = clean_encoding_name(name, buff);
567 
568 	while (last >= base)
569 	{
570 		position = base + ((last - base) >> 1);
571 		result = key[0] - position->name[0];
572 
573 		if (result == 0)
574 		{
575 			result = strcmp(key, position->name);
576 			if (result == 0)
577 				return position->encoding;
578 		}
579 		if (result < 0)
580 			last = position - 1;
581 		else
582 			base = position + 1;
583 	}
584 	return -1;
585 }
586 
587 const char *
pg_encoding_to_char(int encoding)588 pg_encoding_to_char(int encoding)
589 {
590 	if (PG_VALID_ENCODING(encoding))
591 	{
592 		const pg_enc2name *p = &pg_enc2name_tbl[encoding];
593 
594 		Assert(encoding == p->encoding);
595 		return p->name;
596 	}
597 	return "";
598 }
599