/*
 * Encoding names and routines for working with them. Everything
 * in this file is shared between the frontend (FE) and the backend (BE).
 *
 * src/backend/utils/mb/encnames.c
 */
7 #ifdef FRONTEND
8 #include "postgres_fe.h"
9 #else
10 #include "postgres.h"
11 #include "utils/builtins.h"
12 #endif
13 
14 #include <ctype.h>
15 #include <unistd.h>
16 
17 #include "mb/pg_wchar.h"
18 
19 
/* ----------
 * All encoding names, sorted:		 *** A L P H A B E T I C ***
 *
 * Names must not contain irrelevant characters: the search routines
 * consider isalnum() characters only, so ISO-8859-1, iso_8859-1 and
 * Iso8859_1 are all converted to 'iso88591'. All names must be lower case.
 *
 * The table doesn't contain 'cs' aliases (like csISOLatin1). Are they needed?
 *
 * Karel Zak, Aug 2001
 * ----------
 */
32 typedef struct pg_encname
33 {
34 	const char *name;
35 	pg_enc		encoding;
36 } pg_encname;
37 
38 static const pg_encname pg_encname_tbl[] =
39 {
40 	{
41 		"abc", PG_WIN1258
42 	},							/* alias for WIN1258 */
43 	{
44 		"alt", PG_WIN866
45 	},							/* IBM866 */
46 	{
47 		"big5", PG_BIG5
48 	},							/* Big5; Chinese for Taiwan multibyte set */
49 	{
50 		"euccn", PG_EUC_CN
51 	},							/* EUC-CN; Extended Unix Code for simplified
52 								 * Chinese */
53 	{
54 		"eucjis2004", PG_EUC_JIS_2004
55 	},							/* EUC-JIS-2004; Extended UNIX Code fixed
56 								 * Width for Japanese, standard JIS X 0213 */
57 	{
58 		"eucjp", PG_EUC_JP
59 	},							/* EUC-JP; Extended UNIX Code fixed Width for
60 								 * Japanese, standard OSF */
61 	{
62 		"euckr", PG_EUC_KR
63 	},							/* EUC-KR; Extended Unix Code for Korean , KS
64 								 * X 1001 standard */
65 	{
66 		"euctw", PG_EUC_TW
67 	},							/* EUC-TW; Extended Unix Code for
68 								 *
69 								 * traditional Chinese */
70 	{
71 		"gb18030", PG_GB18030
72 	},							/* GB18030;GB18030 */
73 	{
74 		"gbk", PG_GBK
75 	},							/* GBK; Chinese Windows CodePage 936
76 								 * simplified Chinese */
77 	{
78 		"iso88591", PG_LATIN1
79 	},							/* ISO-8859-1; RFC1345,KXS2 */
80 	{
81 		"iso885910", PG_LATIN6
82 	},							/* ISO-8859-10; RFC1345,KXS2 */
83 	{
84 		"iso885913", PG_LATIN7
85 	},							/* ISO-8859-13; RFC1345,KXS2 */
86 	{
87 		"iso885914", PG_LATIN8
88 	},							/* ISO-8859-14; RFC1345,KXS2 */
89 	{
90 		"iso885915", PG_LATIN9
91 	},							/* ISO-8859-15; RFC1345,KXS2 */
92 	{
93 		"iso885916", PG_LATIN10
94 	},							/* ISO-8859-16; RFC1345,KXS2 */
95 	{
96 		"iso88592", PG_LATIN2
97 	},							/* ISO-8859-2; RFC1345,KXS2 */
98 	{
99 		"iso88593", PG_LATIN3
100 	},							/* ISO-8859-3; RFC1345,KXS2 */
101 	{
102 		"iso88594", PG_LATIN4
103 	},							/* ISO-8859-4; RFC1345,KXS2 */
104 	{
105 		"iso88595", PG_ISO_8859_5
106 	},							/* ISO-8859-5; RFC1345,KXS2 */
107 	{
108 		"iso88596", PG_ISO_8859_6
109 	},							/* ISO-8859-6; RFC1345,KXS2 */
110 	{
111 		"iso88597", PG_ISO_8859_7
112 	},							/* ISO-8859-7; RFC1345,KXS2 */
113 	{
114 		"iso88598", PG_ISO_8859_8
115 	},							/* ISO-8859-8; RFC1345,KXS2 */
116 	{
117 		"iso88599", PG_LATIN5
118 	},							/* ISO-8859-9; RFC1345,KXS2 */
119 	{
120 		"johab", PG_JOHAB
121 	},							/* JOHAB; Extended Unix Code for simplified
122 								 * Chinese */
123 	{
124 		"koi8", PG_KOI8R
125 	},							/* _dirty_ alias for KOI8-R (backward
126 								 * compatibility) */
127 	{
128 		"koi8r", PG_KOI8R
129 	},							/* KOI8-R; RFC1489 */
130 	{
131 		"koi8u", PG_KOI8U
132 	},							/* KOI8-U; RFC2319 */
133 	{
134 		"latin1", PG_LATIN1
135 	},							/* alias for ISO-8859-1 */
136 	{
137 		"latin10", PG_LATIN10
138 	},							/* alias for ISO-8859-16 */
139 	{
140 		"latin2", PG_LATIN2
141 	},							/* alias for ISO-8859-2 */
142 	{
143 		"latin3", PG_LATIN3
144 	},							/* alias for ISO-8859-3 */
145 	{
146 		"latin4", PG_LATIN4
147 	},							/* alias for ISO-8859-4 */
148 	{
149 		"latin5", PG_LATIN5
150 	},							/* alias for ISO-8859-9 */
151 	{
152 		"latin6", PG_LATIN6
153 	},							/* alias for ISO-8859-10 */
154 	{
155 		"latin7", PG_LATIN7
156 	},							/* alias for ISO-8859-13 */
157 	{
158 		"latin8", PG_LATIN8
159 	},							/* alias for ISO-8859-14 */
160 	{
161 		"latin9", PG_LATIN9
162 	},							/* alias for ISO-8859-15 */
163 	{
164 		"mskanji", PG_SJIS
165 	},							/* alias for Shift_JIS */
166 	{
167 		"muleinternal", PG_MULE_INTERNAL
168 	},
169 	{
170 		"shiftjis", PG_SJIS
171 	},							/* Shift_JIS; JIS X 0202-1991 */
172 
173 	{
174 		"shiftjis2004", PG_SHIFT_JIS_2004
175 	},							/* SHIFT-JIS-2004; Shift JIS for Japanese,
176 								 * standard JIS X 0213 */
177 	{
178 		"sjis", PG_SJIS
179 	},							/* alias for Shift_JIS */
180 	{
181 		"sqlascii", PG_SQL_ASCII
182 	},
183 	{
184 		"tcvn", PG_WIN1258
185 	},							/* alias for WIN1258 */
186 	{
187 		"tcvn5712", PG_WIN1258
188 	},							/* alias for WIN1258 */
189 	{
190 		"uhc", PG_UHC
191 	},							/* UHC; Korean Windows CodePage 949 */
192 	{
193 		"unicode", PG_UTF8
194 	},							/* alias for UTF8 */
195 	{
196 		"utf8", PG_UTF8
197 	},							/* alias for UTF8 */
198 	{
199 		"vscii", PG_WIN1258
200 	},							/* alias for WIN1258 */
201 	{
202 		"win", PG_WIN1251
203 	},							/* _dirty_ alias for windows-1251 (backward
204 								 * compatibility) */
205 	{
206 		"win1250", PG_WIN1250
207 	},							/* alias for Windows-1250 */
208 	{
209 		"win1251", PG_WIN1251
210 	},							/* alias for Windows-1251 */
211 	{
212 		"win1252", PG_WIN1252
213 	},							/* alias for Windows-1252 */
214 	{
215 		"win1253", PG_WIN1253
216 	},							/* alias for Windows-1253 */
217 	{
218 		"win1254", PG_WIN1254
219 	},							/* alias for Windows-1254 */
220 	{
221 		"win1255", PG_WIN1255
222 	},							/* alias for Windows-1255 */
223 	{
224 		"win1256", PG_WIN1256
225 	},							/* alias for Windows-1256 */
226 	{
227 		"win1257", PG_WIN1257
228 	},							/* alias for Windows-1257 */
229 	{
230 		"win1258", PG_WIN1258
231 	},							/* alias for Windows-1258 */
232 	{
233 		"win866", PG_WIN866
234 	},							/* IBM866 */
235 	{
236 		"win874", PG_WIN874
237 	},							/* alias for Windows-874 */
238 	{
239 		"win932", PG_SJIS
240 	},							/* alias for Shift_JIS */
241 	{
242 		"win936", PG_GBK
243 	},							/* alias for GBK */
244 	{
245 		"win949", PG_UHC
246 	},							/* alias for UHC */
247 	{
248 		"win950", PG_BIG5
249 	},							/* alias for BIG5 */
250 	{
251 		"windows1250", PG_WIN1250
252 	},							/* Windows-1251; Microsoft */
253 	{
254 		"windows1251", PG_WIN1251
255 	},							/* Windows-1251; Microsoft */
256 	{
257 		"windows1252", PG_WIN1252
258 	},							/* Windows-1252; Microsoft */
259 	{
260 		"windows1253", PG_WIN1253
261 	},							/* Windows-1253; Microsoft */
262 	{
263 		"windows1254", PG_WIN1254
264 	},							/* Windows-1254; Microsoft */
265 	{
266 		"windows1255", PG_WIN1255
267 	},							/* Windows-1255; Microsoft */
268 	{
269 		"windows1256", PG_WIN1256
270 	},							/* Windows-1256; Microsoft */
271 	{
272 		"windows1257", PG_WIN1257
273 	},							/* Windows-1257; Microsoft */
274 	{
275 		"windows1258", PG_WIN1258
276 	},							/* Windows-1258; Microsoft */
277 	{
278 		"windows866", PG_WIN866
279 	},							/* IBM866 */
280 	{
281 		"windows874", PG_WIN874
282 	},							/* Windows-874; Microsoft */
283 	{
284 		"windows932", PG_SJIS
285 	},							/* alias for Shift_JIS */
286 	{
287 		"windows936", PG_GBK
288 	},							/* alias for GBK */
289 	{
290 		"windows949", PG_UHC
291 	},							/* alias for UHC */
292 	{
293 		"windows950", PG_BIG5
294 	}							/* alias for BIG5 */
295 };
296 
/* ----------
 * These are the "official" encoding names.
 *
 * XXX Entries must appear in the same order as enum pg_enc (in
 * mb/pg_wchar.h): pg_encoding_to_char() indexes this table directly by
 * encoding ID and asserts that the entry it finds carries that ID.
 * ----------
 */

/*
 * DEF_ENC2NAME builds one table entry from a bare encoding name: the name is
 * stringified, and PG_ is pasted in front of it to form the encoding ID.
 * The second argument is stored in the entry only in WIN32 builds; in other
 * builds it is dropped.
 *
 * NOTE(review): the second argument looks like a Windows code page number,
 * with 0 presumably meaning "no matching code page" -- confirm against the
 * definition of pg_enc2name in mb/pg_wchar.h.
 */
#ifndef WIN32
#define DEF_ENC2NAME(name, codepage) { #name, PG_##name }
#else
#define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
#endif
const pg_enc2name pg_enc2name_tbl[] =
{
	DEF_ENC2NAME(SQL_ASCII, 0),
	DEF_ENC2NAME(EUC_JP, 20932),
	DEF_ENC2NAME(EUC_CN, 20936),
	DEF_ENC2NAME(EUC_KR, 51949),
	DEF_ENC2NAME(EUC_TW, 0),
	DEF_ENC2NAME(EUC_JIS_2004, 20932),
	DEF_ENC2NAME(UTF8, 65001),
	DEF_ENC2NAME(MULE_INTERNAL, 0),
	DEF_ENC2NAME(LATIN1, 28591),
	DEF_ENC2NAME(LATIN2, 28592),
	DEF_ENC2NAME(LATIN3, 28593),
	DEF_ENC2NAME(LATIN4, 28594),
	DEF_ENC2NAME(LATIN5, 28599),
	DEF_ENC2NAME(LATIN6, 0),
	DEF_ENC2NAME(LATIN7, 0),
	DEF_ENC2NAME(LATIN8, 0),
	DEF_ENC2NAME(LATIN9, 28605),
	DEF_ENC2NAME(LATIN10, 0),
	DEF_ENC2NAME(WIN1256, 1256),
	DEF_ENC2NAME(WIN1258, 1258),
	DEF_ENC2NAME(WIN866, 866),
	DEF_ENC2NAME(WIN874, 874),
	DEF_ENC2NAME(KOI8R, 20866),
	DEF_ENC2NAME(WIN1251, 1251),
	DEF_ENC2NAME(WIN1252, 1252),
	DEF_ENC2NAME(ISO_8859_5, 28595),
	DEF_ENC2NAME(ISO_8859_6, 28596),
	DEF_ENC2NAME(ISO_8859_7, 28597),
	DEF_ENC2NAME(ISO_8859_8, 28598),
	DEF_ENC2NAME(WIN1250, 1250),
	DEF_ENC2NAME(WIN1253, 1253),
	DEF_ENC2NAME(WIN1254, 1254),
	DEF_ENC2NAME(WIN1255, 1255),
	DEF_ENC2NAME(WIN1257, 1257),
	DEF_ENC2NAME(KOI8U, 21866),
	DEF_ENC2NAME(SJIS, 932),
	DEF_ENC2NAME(BIG5, 950),
	DEF_ENC2NAME(GBK, 936),
	DEF_ENC2NAME(UHC, 949),
	DEF_ENC2NAME(GB18030, 54936),
	DEF_ENC2NAME(JOHAB, 0),
	DEF_ENC2NAME(SHIFT_JIS_2004, 932)
};
352 
/* ----------
 * These are encoding names for gettext.
 *
 * Each entry pairs an encoding ID with the name to hand to gettext.  This
 * covers all encodings except MULE_INTERNAL, which is alien to gettext.
 *
 * The list ends with a sentinel entry whose name is NULL.
 * NOTE(review): the sentinel's encoding field is 0, which presumably is also
 * a real encoding ID, so readers should detect the end of the table by the
 * NULL name rather than by the ID -- confirm against the callers.
 * ----------
 */
const pg_enc2gettext pg_enc2gettext_tbl[] =
{
	{PG_SQL_ASCII, "US-ASCII"},
	{PG_UTF8, "UTF-8"},
	{PG_LATIN1, "LATIN1"},
	{PG_LATIN2, "LATIN2"},
	{PG_LATIN3, "LATIN3"},
	{PG_LATIN4, "LATIN4"},
	{PG_ISO_8859_5, "ISO-8859-5"},
	{PG_ISO_8859_6, "ISO_8859-6"},
	{PG_ISO_8859_7, "ISO-8859-7"},
	{PG_ISO_8859_8, "ISO-8859-8"},
	{PG_LATIN5, "LATIN5"},
	{PG_LATIN6, "LATIN6"},
	{PG_LATIN7, "LATIN7"},
	{PG_LATIN8, "LATIN8"},
	{PG_LATIN9, "LATIN-9"},
	{PG_LATIN10, "LATIN10"},
	{PG_KOI8R, "KOI8-R"},
	{PG_KOI8U, "KOI8-U"},
	{PG_WIN1250, "CP1250"},
	{PG_WIN1251, "CP1251"},
	{PG_WIN1252, "CP1252"},
	{PG_WIN1253, "CP1253"},
	{PG_WIN1254, "CP1254"},
	{PG_WIN1255, "CP1255"},
	{PG_WIN1256, "CP1256"},
	{PG_WIN1257, "CP1257"},
	{PG_WIN1258, "CP1258"},
	{PG_WIN866, "CP866"},
	{PG_WIN874, "CP874"},
	{PG_EUC_CN, "EUC-CN"},
	{PG_EUC_JP, "EUC-JP"},
	{PG_EUC_KR, "EUC-KR"},
	{PG_EUC_TW, "EUC-TW"},
	{PG_EUC_JIS_2004, "EUC-JP"},
	{PG_SJIS, "SHIFT-JIS"},
	{PG_BIG5, "BIG5"},
	{PG_GBK, "GBK"},
	{PG_UHC, "UHC"},
	{PG_GB18030, "GB18030"},
	{PG_JOHAB, "JOHAB"},
	{PG_SHIFT_JIS_2004, "SHIFT_JISX0213"},
	{0, NULL}
};
404 
405 
406 #ifndef FRONTEND
407 
/*
 * Table of encoding names for ICU
 *
 * Reference: <https://ssl.icu-project.org/icu-bin/convexp>
 *
 * The table is indexed directly by encoding ID, so the position of each
 * entry matters; the trailing comment on each line names the encoding that
 * slot belongs to.  get_encoding_name_for_icu() statically asserts that the
 * table has exactly PG_ENCODING_BE_LAST + 1 entries.
 *
 * A NULL entry means the encoding is not supported by ICU, or that its
 * mapping is unclear.
 */
static const char *const pg_enc2icu_tbl[] =
{
	NULL,						/* PG_SQL_ASCII */
	"EUC-JP",					/* PG_EUC_JP */
	"EUC-CN",					/* PG_EUC_CN */
	"EUC-KR",					/* PG_EUC_KR */
	"EUC-TW",					/* PG_EUC_TW */
	NULL,						/* PG_EUC_JIS_2004 */
	"UTF-8",					/* PG_UTF8 */
	NULL,						/* PG_MULE_INTERNAL */
	"ISO-8859-1",				/* PG_LATIN1 */
	"ISO-8859-2",				/* PG_LATIN2 */
	"ISO-8859-3",				/* PG_LATIN3 */
	"ISO-8859-4",				/* PG_LATIN4 */
	"ISO-8859-9",				/* PG_LATIN5 */
	"ISO-8859-10",				/* PG_LATIN6 */
	"ISO-8859-13",				/* PG_LATIN7 */
	"ISO-8859-14",				/* PG_LATIN8 */
	"ISO-8859-15",				/* PG_LATIN9 */
	NULL,						/* PG_LATIN10 */
	"CP1256",					/* PG_WIN1256 */
	"CP1258",					/* PG_WIN1258 */
	"CP866",					/* PG_WIN866 */
	NULL,						/* PG_WIN874 */
	"KOI8-R",					/* PG_KOI8R */
	"CP1251",					/* PG_WIN1251 */
	"CP1252",					/* PG_WIN1252 */
	"ISO-8859-5",				/* PG_ISO_8859_5 */
	"ISO-8859-6",				/* PG_ISO_8859_6 */
	"ISO-8859-7",				/* PG_ISO_8859_7 */
	"ISO-8859-8",				/* PG_ISO_8859_8 */
	"CP1250",					/* PG_WIN1250 */
	"CP1253",					/* PG_WIN1253 */
	"CP1254",					/* PG_WIN1254 */
	"CP1255",					/* PG_WIN1255 */
	"CP1257",					/* PG_WIN1257 */
	"KOI8-U",					/* PG_KOI8U */
};
453 
454 bool
is_encoding_supported_by_icu(int encoding)455 is_encoding_supported_by_icu(int encoding)
456 {
457 	return (pg_enc2icu_tbl[encoding] != NULL);
458 }
459 
460 const char *
get_encoding_name_for_icu(int encoding)461 get_encoding_name_for_icu(int encoding)
462 {
463 	const char *icu_encoding_name;
464 
465 	StaticAssertStmt(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1,
466 					 "pg_enc2icu_tbl incomplete");
467 
468 	icu_encoding_name = pg_enc2icu_tbl[encoding];
469 
470 	if (!icu_encoding_name)
471 		ereport(ERROR,
472 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
473 				 errmsg("encoding \"%s\" not supported by ICU",
474 						pg_encoding_to_char(encoding))));
475 
476 	return icu_encoding_name;
477 }
478 
479 #endif							/* not FRONTEND */
480 
481 
/* ----------
 * Encoding validity checks. The by-name checks return the encoding ID,
 * or -1 on error.
 * ----------
 */
/*
 * Resolve an encoding name and accept it only if the resulting ID passes
 * PG_VALID_FE_ENCODING.
 *
 * Returns the encoding ID, or -1 if the name is unknown or the ID fails
 * the check.
 */
int
pg_valid_client_encoding(const char *name)
{
	int			enc = pg_char_to_encoding(name);

	if (enc >= 0 && PG_VALID_FE_ENCODING(enc))
		return enc;

	return -1;
}
499 
/*
 * Resolve an encoding name and accept it only if the resulting ID passes
 * PG_VALID_BE_ENCODING.
 *
 * Returns the encoding ID, or -1 if the name is unknown or the ID fails
 * the check.
 */
int
pg_valid_server_encoding(const char *name)
{
	int			enc = pg_char_to_encoding(name);

	if (enc >= 0 && PG_VALID_BE_ENCODING(enc))
		return enc;

	return -1;
}
513 
/*
 * Function form of the PG_VALID_BE_ENCODING check: returns that macro's
 * result for the given encoding ID.
 */
int
pg_valid_server_encoding_id(int encoding)
{
	int			is_valid = PG_VALID_BE_ENCODING(encoding);

	return is_valid;
}
519 
/* ----------
 * Normalize an encoding name for table lookup.
 *
 * Copies into newkey only those characters of key for which isalnum() is
 * true, folding the ASCII letters 'A'..'Z' to lower case, and terminates
 * the result with '\0'.  The caller supplies newkey; the output is never
 * longer than the input, so a buffer of strlen(key) + 1 bytes suffices.
 *
 * Returns newkey.
 * ----------
 */
static char *
clean_encoding_name(const char *key, char *newkey)
{
	char	   *out = newkey;
	const char *in;

	for (in = key; *in != '\0'; in++)
	{
		char		c = *in;

		/* anything that is not a letter or digit is simply dropped */
		if (!isalnum((unsigned char) c))
			continue;

		/* fold ASCII upper case only; other characters pass through as-is */
		if (c >= 'A' && c <= 'Z')
			c += 'a' - 'A';

		*out++ = c;
	}
	*out = '\0';

	return newkey;
}
543 
544 /* ----------
545  * Search encoding by encoding name
546  *
547  * Returns encoding ID, or -1 for error
548  * ----------
549  */
550 int
pg_char_to_encoding(const char * name)551 pg_char_to_encoding(const char *name)
552 {
553 	unsigned int nel = lengthof(pg_encname_tbl);
554 	const pg_encname *base = pg_encname_tbl,
555 			   *last = base + nel - 1,
556 			   *position;
557 	int			result;
558 	char		buff[NAMEDATALEN],
559 			   *key;
560 
561 	if (name == NULL || *name == '\0')
562 		return -1;
563 
564 	if (strlen(name) >= NAMEDATALEN)
565 	{
566 #ifdef FRONTEND
567 		fprintf(stderr, "encoding name too long\n");
568 		return -1;
569 #else
570 		ereport(ERROR,
571 				(errcode(ERRCODE_NAME_TOO_LONG),
572 				 errmsg("encoding name too long")));
573 #endif
574 	}
575 	key = clean_encoding_name(name, buff);
576 
577 	while (last >= base)
578 	{
579 		position = base + ((last - base) >> 1);
580 		result = key[0] - position->name[0];
581 
582 		if (result == 0)
583 		{
584 			result = strcmp(key, position->name);
585 			if (result == 0)
586 				return position->encoding;
587 		}
588 		if (result < 0)
589 			last = position - 1;
590 		else
591 			base = position + 1;
592 	}
593 	return -1;
594 }
595 
596 #ifndef FRONTEND
597 Datum
PG_char_to_encoding(PG_FUNCTION_ARGS)598 PG_char_to_encoding(PG_FUNCTION_ARGS)
599 {
600 	Name		s = PG_GETARG_NAME(0);
601 
602 	PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
603 }
604 #endif
605 
606 const char *
pg_encoding_to_char(int encoding)607 pg_encoding_to_char(int encoding)
608 {
609 	if (PG_VALID_ENCODING(encoding))
610 	{
611 		const pg_enc2name *p = &pg_enc2name_tbl[encoding];
612 
613 		Assert(encoding == p->encoding);
614 		return p->name;
615 	}
616 	return "";
617 }
618 
619 #ifndef FRONTEND
620 Datum
PG_encoding_to_char(PG_FUNCTION_ARGS)621 PG_encoding_to_char(PG_FUNCTION_ARGS)
622 {
623 	int32		encoding = PG_GETARG_INT32(0);
624 	const char *encoding_name = pg_encoding_to_char(encoding);
625 
626 	return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
627 }
628 
629 #endif
630