1 /*-------------------------------------------------------------------------
2 *
3 * encnames.c
4 * Encoding names and routines for working with them.
5 *
6 * Portions Copyright (c) 2001-2020, PostgreSQL Global Development Group
7 *
8 * IDENTIFICATION
9 * src/common/encnames.c
10 *
11 *-------------------------------------------------------------------------
12 */
13 #include "c.h"
14
15 #include <ctype.h>
16 #include <unistd.h>
17
18 #include "mb/pg_wchar.h"
19
20
21 /* ----------
22 * All encoding names, sorted: *** A L P H A B E T I C ***
23 *
 * All names must be stored without irrelevant chars: the search routines
 * keep isalnum() chars only, so ISO-8859-1, iso_8859-1 and Iso8859_1 are
 * always converted to 'iso88591'.  All names must be lower case.
 *
 * The table doesn't contain 'cs' aliases (like csISOLatin1).  Are they needed?
29 *
30 * Karel Zak, Aug 2001
31 * ----------
32 */
/* One searchable encoding name (canonical name or alias) and its encoding. */
typedef struct pg_encname
{
	const char *name;			/* lookup key, compared with strcmp() */
	pg_enc		encoding;		/* encoding ID returned when name matches */
} pg_encname;
38
39 static const pg_encname pg_encname_tbl[] =
40 {
41 {
42 "abc", PG_WIN1258
43 }, /* alias for WIN1258 */
44 {
45 "alt", PG_WIN866
46 }, /* IBM866 */
47 {
48 "big5", PG_BIG5
49 }, /* Big5; Chinese for Taiwan multibyte set */
50 {
51 "euccn", PG_EUC_CN
52 }, /* EUC-CN; Extended Unix Code for simplified
53 * Chinese */
54 {
55 "eucjis2004", PG_EUC_JIS_2004
56 }, /* EUC-JIS-2004; Extended UNIX Code fixed
57 * Width for Japanese, standard JIS X 0213 */
58 {
59 "eucjp", PG_EUC_JP
60 }, /* EUC-JP; Extended UNIX Code fixed Width for
61 * Japanese, standard OSF */
62 {
63 "euckr", PG_EUC_KR
64 }, /* EUC-KR; Extended Unix Code for Korean , KS
65 * X 1001 standard */
66 {
67 "euctw", PG_EUC_TW
68 }, /* EUC-TW; Extended Unix Code for
69 *
70 * traditional Chinese */
71 {
72 "gb18030", PG_GB18030
73 }, /* GB18030;GB18030 */
74 {
75 "gbk", PG_GBK
76 }, /* GBK; Chinese Windows CodePage 936
77 * simplified Chinese */
78 {
79 "iso88591", PG_LATIN1
80 }, /* ISO-8859-1; RFC1345,KXS2 */
81 {
82 "iso885910", PG_LATIN6
83 }, /* ISO-8859-10; RFC1345,KXS2 */
84 {
85 "iso885913", PG_LATIN7
86 }, /* ISO-8859-13; RFC1345,KXS2 */
87 {
88 "iso885914", PG_LATIN8
89 }, /* ISO-8859-14; RFC1345,KXS2 */
90 {
91 "iso885915", PG_LATIN9
92 }, /* ISO-8859-15; RFC1345,KXS2 */
93 {
94 "iso885916", PG_LATIN10
95 }, /* ISO-8859-16; RFC1345,KXS2 */
96 {
97 "iso88592", PG_LATIN2
98 }, /* ISO-8859-2; RFC1345,KXS2 */
99 {
100 "iso88593", PG_LATIN3
101 }, /* ISO-8859-3; RFC1345,KXS2 */
102 {
103 "iso88594", PG_LATIN4
104 }, /* ISO-8859-4; RFC1345,KXS2 */
105 {
106 "iso88595", PG_ISO_8859_5
107 }, /* ISO-8859-5; RFC1345,KXS2 */
108 {
109 "iso88596", PG_ISO_8859_6
110 }, /* ISO-8859-6; RFC1345,KXS2 */
111 {
112 "iso88597", PG_ISO_8859_7
113 }, /* ISO-8859-7; RFC1345,KXS2 */
114 {
115 "iso88598", PG_ISO_8859_8
116 }, /* ISO-8859-8; RFC1345,KXS2 */
117 {
118 "iso88599", PG_LATIN5
119 }, /* ISO-8859-9; RFC1345,KXS2 */
120 {
121 "johab", PG_JOHAB
122 }, /* JOHAB; Extended Unix Code for simplified
123 * Chinese */
124 {
125 "koi8", PG_KOI8R
126 }, /* _dirty_ alias for KOI8-R (backward
127 * compatibility) */
128 {
129 "koi8r", PG_KOI8R
130 }, /* KOI8-R; RFC1489 */
131 {
132 "koi8u", PG_KOI8U
133 }, /* KOI8-U; RFC2319 */
134 {
135 "latin1", PG_LATIN1
136 }, /* alias for ISO-8859-1 */
137 {
138 "latin10", PG_LATIN10
139 }, /* alias for ISO-8859-16 */
140 {
141 "latin2", PG_LATIN2
142 }, /* alias for ISO-8859-2 */
143 {
144 "latin3", PG_LATIN3
145 }, /* alias for ISO-8859-3 */
146 {
147 "latin4", PG_LATIN4
148 }, /* alias for ISO-8859-4 */
149 {
150 "latin5", PG_LATIN5
151 }, /* alias for ISO-8859-9 */
152 {
153 "latin6", PG_LATIN6
154 }, /* alias for ISO-8859-10 */
155 {
156 "latin7", PG_LATIN7
157 }, /* alias for ISO-8859-13 */
158 {
159 "latin8", PG_LATIN8
160 }, /* alias for ISO-8859-14 */
161 {
162 "latin9", PG_LATIN9
163 }, /* alias for ISO-8859-15 */
164 {
165 "mskanji", PG_SJIS
166 }, /* alias for Shift_JIS */
167 {
168 "muleinternal", PG_MULE_INTERNAL
169 },
170 {
171 "shiftjis", PG_SJIS
172 }, /* Shift_JIS; JIS X 0202-1991 */
173
174 {
175 "shiftjis2004", PG_SHIFT_JIS_2004
176 }, /* SHIFT-JIS-2004; Shift JIS for Japanese,
177 * standard JIS X 0213 */
178 {
179 "sjis", PG_SJIS
180 }, /* alias for Shift_JIS */
181 {
182 "sqlascii", PG_SQL_ASCII
183 },
184 {
185 "tcvn", PG_WIN1258
186 }, /* alias for WIN1258 */
187 {
188 "tcvn5712", PG_WIN1258
189 }, /* alias for WIN1258 */
190 {
191 "uhc", PG_UHC
192 }, /* UHC; Korean Windows CodePage 949 */
193 {
194 "unicode", PG_UTF8
195 }, /* alias for UTF8 */
196 {
197 "utf8", PG_UTF8
198 }, /* alias for UTF8 */
199 {
200 "vscii", PG_WIN1258
201 }, /* alias for WIN1258 */
202 {
203 "win", PG_WIN1251
204 }, /* _dirty_ alias for windows-1251 (backward
205 * compatibility) */
206 {
207 "win1250", PG_WIN1250
208 }, /* alias for Windows-1250 */
209 {
210 "win1251", PG_WIN1251
211 }, /* alias for Windows-1251 */
212 {
213 "win1252", PG_WIN1252
214 }, /* alias for Windows-1252 */
215 {
216 "win1253", PG_WIN1253
217 }, /* alias for Windows-1253 */
218 {
219 "win1254", PG_WIN1254
220 }, /* alias for Windows-1254 */
221 {
222 "win1255", PG_WIN1255
223 }, /* alias for Windows-1255 */
224 {
225 "win1256", PG_WIN1256
226 }, /* alias for Windows-1256 */
227 {
228 "win1257", PG_WIN1257
229 }, /* alias for Windows-1257 */
230 {
231 "win1258", PG_WIN1258
232 }, /* alias for Windows-1258 */
233 {
234 "win866", PG_WIN866
235 }, /* IBM866 */
236 {
237 "win874", PG_WIN874
238 }, /* alias for Windows-874 */
239 {
240 "win932", PG_SJIS
241 }, /* alias for Shift_JIS */
242 {
243 "win936", PG_GBK
244 }, /* alias for GBK */
245 {
246 "win949", PG_UHC
247 }, /* alias for UHC */
248 {
249 "win950", PG_BIG5
250 }, /* alias for BIG5 */
251 {
252 "windows1250", PG_WIN1250
253 }, /* Windows-1251; Microsoft */
254 {
255 "windows1251", PG_WIN1251
256 }, /* Windows-1251; Microsoft */
257 {
258 "windows1252", PG_WIN1252
259 }, /* Windows-1252; Microsoft */
260 {
261 "windows1253", PG_WIN1253
262 }, /* Windows-1253; Microsoft */
263 {
264 "windows1254", PG_WIN1254
265 }, /* Windows-1254; Microsoft */
266 {
267 "windows1255", PG_WIN1255
268 }, /* Windows-1255; Microsoft */
269 {
270 "windows1256", PG_WIN1256
271 }, /* Windows-1256; Microsoft */
272 {
273 "windows1257", PG_WIN1257
274 }, /* Windows-1257; Microsoft */
275 {
276 "windows1258", PG_WIN1258
277 }, /* Windows-1258; Microsoft */
278 {
279 "windows866", PG_WIN866
280 }, /* IBM866 */
281 {
282 "windows874", PG_WIN874
283 }, /* Windows-874; Microsoft */
284 {
285 "windows932", PG_SJIS
286 }, /* alias for Shift_JIS */
287 {
288 "windows936", PG_GBK
289 }, /* alias for GBK */
290 {
291 "windows949", PG_UHC
292 }, /* alias for UHC */
293 {
294 "windows950", PG_BIG5
295 } /* alias for BIG5 */
296 };
297
/* ----------
 * These are "official" encoding names.
 *
 * pg_encoding_to_char() indexes this array directly with the encoding ID
 * and asserts that the slot's encoding field equals that ID.
 * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
 * ----------
 */

/*
 * DEF_ENC2NAME(name, codepage) builds one table entry: the stringified
 * name and the matching PG_<name> constant.  On WIN32 builds the entry
 * also carries the codepage argument; on other platforms that argument is
 * discarded.
 * NOTE(review): a codepage of 0 presumably means "no matching Windows code
 * page" -- confirm against the WIN32 consumers of this table.
 */
#ifndef WIN32
#define DEF_ENC2NAME(name, codepage) { #name, PG_##name }
#else
#define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
#endif

const pg_enc2name pg_enc2name_tbl[] =
{
	DEF_ENC2NAME(SQL_ASCII, 0),
	DEF_ENC2NAME(EUC_JP, 20932),
	DEF_ENC2NAME(EUC_CN, 20936),
	DEF_ENC2NAME(EUC_KR, 51949),
	DEF_ENC2NAME(EUC_TW, 0),
	DEF_ENC2NAME(EUC_JIS_2004, 20932),
	DEF_ENC2NAME(UTF8, 65001),
	DEF_ENC2NAME(MULE_INTERNAL, 0),
	DEF_ENC2NAME(LATIN1, 28591),
	DEF_ENC2NAME(LATIN2, 28592),
	DEF_ENC2NAME(LATIN3, 28593),
	DEF_ENC2NAME(LATIN4, 28594),
	DEF_ENC2NAME(LATIN5, 28599),
	DEF_ENC2NAME(LATIN6, 0),
	DEF_ENC2NAME(LATIN7, 0),
	DEF_ENC2NAME(LATIN8, 0),
	DEF_ENC2NAME(LATIN9, 28605),
	DEF_ENC2NAME(LATIN10, 0),
	DEF_ENC2NAME(WIN1256, 1256),
	DEF_ENC2NAME(WIN1258, 1258),
	DEF_ENC2NAME(WIN866, 866),
	DEF_ENC2NAME(WIN874, 874),
	DEF_ENC2NAME(KOI8R, 20866),
	DEF_ENC2NAME(WIN1251, 1251),
	DEF_ENC2NAME(WIN1252, 1252),
	DEF_ENC2NAME(ISO_8859_5, 28595),
	DEF_ENC2NAME(ISO_8859_6, 28596),
	DEF_ENC2NAME(ISO_8859_7, 28597),
	DEF_ENC2NAME(ISO_8859_8, 28598),
	DEF_ENC2NAME(WIN1250, 1250),
	DEF_ENC2NAME(WIN1253, 1253),
	DEF_ENC2NAME(WIN1254, 1254),
	DEF_ENC2NAME(WIN1255, 1255),
	DEF_ENC2NAME(WIN1257, 1257),
	DEF_ENC2NAME(KOI8U, 21866),
	DEF_ENC2NAME(SJIS, 932),
	DEF_ENC2NAME(BIG5, 950),
	DEF_ENC2NAME(GBK, 936),
	DEF_ENC2NAME(UHC, 949),
	DEF_ENC2NAME(GB18030, 54936),
	DEF_ENC2NAME(JOHAB, 0),
	DEF_ENC2NAME(SHIFT_JIS_2004, 932)
};
354
/* ----------
 * These are encoding names for gettext.
 *
 * This covers all encodings except MULE_INTERNAL, which is alien to gettext.
 *
 * Each entry pairs an encoding ID with the name string to use for gettext.
 * Unlike pg_enc2name_tbl, the entries are not in enum order, so the table
 * cannot be indexed by encoding ID.  The list ends with a {0, NULL}
 * sentinel.
 * NOTE(review): presumably consumers detect the end by the NULL name, since
 * the first real entry's ID may also be 0 -- verify against callers.
 * ----------
 */
const pg_enc2gettext pg_enc2gettext_tbl[] =
{
	{PG_SQL_ASCII, "US-ASCII"},
	{PG_UTF8, "UTF-8"},
	{PG_LATIN1, "LATIN1"},
	{PG_LATIN2, "LATIN2"},
	{PG_LATIN3, "LATIN3"},
	{PG_LATIN4, "LATIN4"},
	{PG_ISO_8859_5, "ISO-8859-5"},
	{PG_ISO_8859_6, "ISO_8859-6"},
	{PG_ISO_8859_7, "ISO-8859-7"},
	{PG_ISO_8859_8, "ISO-8859-8"},
	{PG_LATIN5, "LATIN5"},
	{PG_LATIN6, "LATIN6"},
	{PG_LATIN7, "LATIN7"},
	{PG_LATIN8, "LATIN8"},
	{PG_LATIN9, "LATIN-9"},
	{PG_LATIN10, "LATIN10"},
	{PG_KOI8R, "KOI8-R"},
	{PG_KOI8U, "KOI8-U"},
	{PG_WIN1250, "CP1250"},
	{PG_WIN1251, "CP1251"},
	{PG_WIN1252, "CP1252"},
	{PG_WIN1253, "CP1253"},
	{PG_WIN1254, "CP1254"},
	{PG_WIN1255, "CP1255"},
	{PG_WIN1256, "CP1256"},
	{PG_WIN1257, "CP1257"},
	{PG_WIN1258, "CP1258"},
	{PG_WIN866, "CP866"},
	{PG_WIN874, "CP874"},
	{PG_EUC_CN, "EUC-CN"},
	{PG_EUC_JP, "EUC-JP"},
	{PG_EUC_KR, "EUC-KR"},
	{PG_EUC_TW, "EUC-TW"},
	{PG_EUC_JIS_2004, "EUC-JP"},
	{PG_SJIS, "SHIFT-JIS"},
	{PG_BIG5, "BIG5"},
	{PG_GBK, "GBK"},
	{PG_UHC, "UHC"},
	{PG_GB18030, "GB18030"},
	{PG_JOHAB, "JOHAB"},
	{PG_SHIFT_JIS_2004, "SHIFT_JISX0213"},
	{0, NULL}
};
406
407
/*
 * Table of encoding names for ICU (currently covers backend encodings only)
 *
 * Reference: <https://ssl.icu-project.org/icu-bin/convexp>
 *
 * NULL entries are not supported by ICU, or their mapping is unclear.
 *
 * The array is indexed directly by encoding ID, so each slot's position
 * must match the encoding named in its trailing comment.
 * get_encoding_name_for_icu() statically asserts that the array has exactly
 * PG_ENCODING_BE_LAST + 1 entries.
 */
static const char *const pg_enc2icu_tbl[] =
{
	NULL,						/* PG_SQL_ASCII */
	"EUC-JP",					/* PG_EUC_JP */
	"EUC-CN",					/* PG_EUC_CN */
	"EUC-KR",					/* PG_EUC_KR */
	"EUC-TW",					/* PG_EUC_TW */
	NULL,						/* PG_EUC_JIS_2004 */
	"UTF-8",					/* PG_UTF8 */
	NULL,						/* PG_MULE_INTERNAL */
	"ISO-8859-1",				/* PG_LATIN1 */
	"ISO-8859-2",				/* PG_LATIN2 */
	"ISO-8859-3",				/* PG_LATIN3 */
	"ISO-8859-4",				/* PG_LATIN4 */
	"ISO-8859-9",				/* PG_LATIN5 */
	"ISO-8859-10",				/* PG_LATIN6 */
	"ISO-8859-13",				/* PG_LATIN7 */
	"ISO-8859-14",				/* PG_LATIN8 */
	"ISO-8859-15",				/* PG_LATIN9 */
	NULL,						/* PG_LATIN10 */
	"CP1256",					/* PG_WIN1256 */
	"CP1258",					/* PG_WIN1258 */
	"CP866",					/* PG_WIN866 */
	NULL,						/* PG_WIN874 */
	"KOI8-R",					/* PG_KOI8R */
	"CP1251",					/* PG_WIN1251 */
	"CP1252",					/* PG_WIN1252 */
	"ISO-8859-5",				/* PG_ISO_8859_5 */
	"ISO-8859-6",				/* PG_ISO_8859_6 */
	"ISO-8859-7",				/* PG_ISO_8859_7 */
	"ISO-8859-8",				/* PG_ISO_8859_8 */
	"CP1250",					/* PG_WIN1250 */
	"CP1253",					/* PG_WIN1253 */
	"CP1254",					/* PG_WIN1254 */
	"CP1255",					/* PG_WIN1255 */
	"CP1257",					/* PG_WIN1257 */
	"KOI8-U",					/* PG_KOI8U */
};
453
454
455 /*
456 * Is this encoding supported by ICU?
457 */
458 bool
is_encoding_supported_by_icu(int encoding)459 is_encoding_supported_by_icu(int encoding)
460 {
461 if (!PG_VALID_BE_ENCODING(encoding))
462 return false;
463 return (pg_enc2icu_tbl[encoding] != NULL);
464 }
465
466 /*
467 * Returns ICU's name for encoding, or NULL if not supported
468 */
469 const char *
get_encoding_name_for_icu(int encoding)470 get_encoding_name_for_icu(int encoding)
471 {
472 StaticAssertStmt(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1,
473 "pg_enc2icu_tbl incomplete");
474
475 if (!PG_VALID_BE_ENCODING(encoding))
476 return NULL;
477 return pg_enc2icu_tbl[encoding];
478 }
479
480
481 /* ----------
482 * Encoding checks, for error returns -1 else encoding id
483 * ----------
484 */
/*
 * Resolve an encoding name that is to be used as a client encoding.
 *
 * Returns the encoding ID, or -1 if pg_char_to_encoding() does not
 * recognize the name or the ID fails PG_VALID_FE_ENCODING.
 */
int
pg_valid_client_encoding(const char *name)
{
	int			enc = pg_char_to_encoding(name);

	if (enc < 0 || !PG_VALID_FE_ENCODING(enc))
		return -1;

	return enc;
}
498
/*
 * Resolve an encoding name that is to be used as a server encoding.
 *
 * Returns the encoding ID, or -1 if pg_char_to_encoding() does not
 * recognize the name or the ID fails PG_VALID_BE_ENCODING.
 */
int
pg_valid_server_encoding(const char *name)
{
	int			enc = pg_char_to_encoding(name);

	if (enc < 0 || !PG_VALID_BE_ENCODING(enc))
		return -1;

	return enc;
}
512
/*
 * Check a numeric encoding ID (rather than a name) for use as a server
 * encoding.
 *
 * Unlike the by-name checks, this does not return the ID or -1: it returns
 * the value of PG_VALID_BE_ENCODING(encoding) itself.
 */
int
pg_valid_server_encoding_id(int encoding)
{
	int			is_valid = PG_VALID_BE_ENCODING(encoding);

	return is_valid;
}
518
/*
 * Normalize an encoding name into its table-lookup form.
 *
 * Copies key into newkey, keeping only characters for which isalnum() is
 * true and folding ASCII 'A'..'Z' to lower case.  Everything else (dashes,
 * underscores, blanks, ...) is dropped.  The result is NUL-terminated.
 *
 * No bounds checking is done: the caller must supply a buffer of at least
 * strlen(key) + 1 bytes.  Returns newkey.
 */
static char *
clean_encoding_name(const char *key, char *newkey)
{
	char	   *out = newkey;
	char		c;

	while ((c = *key++) != '\0')
	{
		/* cast keeps negative char values out of isalnum() */
		if (!isalnum((unsigned char) c))
			continue;

		/* ASCII-only case folding, done by hand */
		*out++ = (c >= 'A' && c <= 'Z') ? (char) (c + 'a' - 'A') : c;
	}
	*out = '\0';

	return newkey;
}
543
544 /*
545 * Search encoding by encoding name
546 *
547 * Returns encoding ID, or -1 if not recognized
548 */
549 int
pg_char_to_encoding(const char * name)550 pg_char_to_encoding(const char *name)
551 {
552 unsigned int nel = lengthof(pg_encname_tbl);
553 const pg_encname *base = pg_encname_tbl,
554 *last = base + nel - 1,
555 *position;
556 int result;
557 char buff[NAMEDATALEN],
558 *key;
559
560 if (name == NULL || *name == '\0')
561 return -1;
562
563 if (strlen(name) >= NAMEDATALEN)
564 return -1; /* it's certainly not in the table */
565
566 key = clean_encoding_name(name, buff);
567
568 while (last >= base)
569 {
570 position = base + ((last - base) >> 1);
571 result = key[0] - position->name[0];
572
573 if (result == 0)
574 {
575 result = strcmp(key, position->name);
576 if (result == 0)
577 return position->encoding;
578 }
579 if (result < 0)
580 last = position - 1;
581 else
582 base = position + 1;
583 }
584 return -1;
585 }
586
587 const char *
pg_encoding_to_char(int encoding)588 pg_encoding_to_char(int encoding)
589 {
590 if (PG_VALID_ENCODING(encoding))
591 {
592 const pg_enc2name *p = &pg_enc2name_tbl[encoding];
593
594 Assert(encoding == p->encoding);
595 return p->name;
596 }
597 return "";
598 }
599