1 /*------------------------------------------------------------------------- 2 * 3 * encnames.c 4 * Encoding names and routines for working with them. 5 * 6 * Portions Copyright (c) 2001-2021, PostgreSQL Global Development Group 7 * 8 * IDENTIFICATION 9 * src/common/encnames.c 10 * 11 *------------------------------------------------------------------------- 12 */ 13 #include "c.h" 14 15 #include <ctype.h> 16 #include <unistd.h> 17 18 #include "mb/pg_wchar.h" 19 20 21 /* ---------- 22 * All encoding names, sorted: *** A L P H A B E T I C *** 23 * 24 * All names must be without irrelevant chars, search routines use 25 * isalnum() chars only. It means ISO-8859-1, iso_8859-1 and Iso8859_1 26 * are always converted to 'iso88591'. All must be lower case. 27 * 28 * The table doesn't contain 'cs' aliases (like csISOLatin1). It's needed? 29 * 30 * Karel Zak, Aug 2001 31 * ---------- 32 */ 33 typedef struct pg_encname 34 { 35 const char *name; 36 pg_enc encoding; 37 } pg_encname; 38 39 static const pg_encname pg_encname_tbl[] = 40 { 41 { 42 "abc", PG_WIN1258 43 }, /* alias for WIN1258 */ 44 { 45 "alt", PG_WIN866 46 }, /* IBM866 */ 47 { 48 "big5", PG_BIG5 49 }, /* Big5; Chinese for Taiwan multibyte set */ 50 { 51 "euccn", PG_EUC_CN 52 }, /* EUC-CN; Extended Unix Code for simplified 53 * Chinese */ 54 { 55 "eucjis2004", PG_EUC_JIS_2004 56 }, /* EUC-JIS-2004; Extended UNIX Code fixed 57 * Width for Japanese, standard JIS X 0213 */ 58 { 59 "eucjp", PG_EUC_JP 60 }, /* EUC-JP; Extended UNIX Code fixed Width for 61 * Japanese, standard OSF */ 62 { 63 "euckr", PG_EUC_KR 64 }, /* EUC-KR; Extended Unix Code for Korean , KS 65 * X 1001 standard */ 66 { 67 "euctw", PG_EUC_TW 68 }, /* EUC-TW; Extended Unix Code for 69 * 70 * traditional Chinese */ 71 { 72 "gb18030", PG_GB18030 73 }, /* GB18030;GB18030 */ 74 { 75 "gbk", PG_GBK 76 }, /* GBK; Chinese Windows CodePage 936 77 * simplified Chinese */ 78 { 79 "iso88591", PG_LATIN1 80 }, /* ISO-8859-1; RFC1345,KXS2 */ 81 { 82 "iso885910", PG_LATIN6 83 }, /* ISO-8859-10; RFC1345,KXS2 */ 84 { 85 "iso885913", PG_LATIN7 86 }, /* ISO-8859-13; RFC1345,KXS2 */ 87 { 88 "iso885914", PG_LATIN8 89 }, /* ISO-8859-14; RFC1345,KXS2 */ 90 { 91 "iso885915", PG_LATIN9 92 }, /* ISO-8859-15; RFC1345,KXS2 */ 93 { 94 "iso885916", PG_LATIN10 95 }, /* ISO-8859-16; RFC1345,KXS2 */ 96 { 97 "iso88592", PG_LATIN2 98 }, /* ISO-8859-2; RFC1345,KXS2 */ 99 { 100 "iso88593", PG_LATIN3 101 }, /* ISO-8859-3; RFC1345,KXS2 */ 102 { 103 "iso88594", PG_LATIN4 104 }, /* ISO-8859-4; RFC1345,KXS2 */ 105 { 106 "iso88595", PG_ISO_8859_5 107 }, /* ISO-8859-5; RFC1345,KXS2 */ 108 { 109 "iso88596", PG_ISO_8859_6 110 }, /* ISO-8859-6; RFC1345,KXS2 */ 111 { 112 "iso88597", PG_ISO_8859_7 113 }, /* ISO-8859-7; RFC1345,KXS2 */ 114 { 115 "iso88598", PG_ISO_8859_8 116 }, /* ISO-8859-8; RFC1345,KXS2 */ 117 { 118 "iso88599", PG_LATIN5 119 }, /* ISO-8859-9; RFC1345,KXS2 */ 120 { 121 "johab", PG_JOHAB 122 }, /* JOHAB; Extended Unix Code for simplified 123 * Chinese */ 124 { 125 "koi8", PG_KOI8R 126 }, /* _dirty_ alias for KOI8-R (backward 127 * compatibility) */ 128 { 129 "koi8r", PG_KOI8R 130 }, /* KOI8-R; RFC1489 */ 131 { 132 "koi8u", PG_KOI8U 133 }, /* KOI8-U; RFC2319 */ 134 { 135 "latin1", PG_LATIN1 136 }, /* alias for ISO-8859-1 */ 137 { 138 "latin10", PG_LATIN10 139 }, /* alias for ISO-8859-16 */ 140 { 141 "latin2", PG_LATIN2 142 }, /* alias for ISO-8859-2 */ 143 { 144 "latin3", PG_LATIN3 145 }, /* alias for ISO-8859-3 */ 146 { 147 "latin4", PG_LATIN4 148 }, /* alias for ISO-8859-4 */ 149 { 150 "latin5", PG_LATIN5 151 }, /* alias for ISO-8859-9 */ 152 { 153 "latin6", PG_LATIN6 154 }, /* alias for ISO-8859-10 */ 155 { 156 "latin7", PG_LATIN7 157 }, /* alias for ISO-8859-13 */ 158 { 159 "latin8", PG_LATIN8 160 }, /* alias for ISO-8859-14 */ 161 { 162 "latin9", PG_LATIN9 163 }, /* alias for ISO-8859-15 */ 164 { 165 "mskanji", PG_SJIS 166 }, /* alias for Shift_JIS */ 167 { 168 "muleinternal", PG_MULE_INTERNAL 169 }, 170 { 171 "shiftjis", PG_SJIS 172 }, /* Shift_JIS; JIS X 0202-1991 */ 173 174 { 175 "shiftjis2004", PG_SHIFT_JIS_2004 176 }, /* SHIFT-JIS-2004; Shift JIS for Japanese, 177 * standard JIS X 0213 */ 178 { 179 "sjis", PG_SJIS 180 }, /* alias for Shift_JIS */ 181 { 182 "sqlascii", PG_SQL_ASCII 183 }, 184 { 185 "tcvn", PG_WIN1258 186 }, /* alias for WIN1258 */ 187 { 188 "tcvn5712", PG_WIN1258 189 }, /* alias for WIN1258 */ 190 { 191 "uhc", PG_UHC 192 }, /* UHC; Korean Windows CodePage 949 */ 193 { 194 "unicode", PG_UTF8 195 }, /* alias for UTF8 */ 196 { 197 "utf8", PG_UTF8 198 }, /* alias for UTF8 */ 199 { 200 "vscii", PG_WIN1258 201 }, /* alias for WIN1258 */ 202 { 203 "win", PG_WIN1251 204 }, /* _dirty_ alias for windows-1251 (backward 205 * compatibility) */ 206 { 207 "win1250", PG_WIN1250 208 }, /* alias for Windows-1250 */ 209 { 210 "win1251", PG_WIN1251 211 }, /* alias for Windows-1251 */ 212 { 213 "win1252", PG_WIN1252 214 }, /* alias for Windows-1252 */ 215 { 216 "win1253", PG_WIN1253 217 }, /* alias for Windows-1253 */ 218 { 219 "win1254", PG_WIN1254 220 }, /* alias for Windows-1254 */ 221 { 222 "win1255", PG_WIN1255 223 }, /* alias for Windows-1255 */ 224 { 225 "win1256", PG_WIN1256 226 }, /* alias for Windows-1256 */ 227 { 228 "win1257", PG_WIN1257 229 }, /* alias for Windows-1257 */ 230 { 231 "win1258", PG_WIN1258 232 }, /* alias for Windows-1258 */ 233 { 234 "win866", PG_WIN866 235 }, /* IBM866 */ 236 { 237 "win874", PG_WIN874 238 }, /* alias for Windows-874 */ 239 { 240 "win932", PG_SJIS 241 }, /* alias for Shift_JIS */ 242 { 243 "win936", PG_GBK 244 }, /* alias for GBK */ 245 { 246 "win949", PG_UHC 247 }, /* alias for UHC */ 248 { 249 "win950", PG_BIG5 250 }, /* alias for BIG5 */ 251 { 252 "windows1250", PG_WIN1250 253 }, /* Windows-1251; Microsoft */ 254 { 255 "windows1251", PG_WIN1251 256 }, /* Windows-1251; Microsoft */ 257 { 258 "windows1252", PG_WIN1252 259 }, /* Windows-1252; Microsoft */ 260 { 261 "windows1253", PG_WIN1253 262 }, /* Windows-1253; Microsoft */ 263 { 264 "windows1254", PG_WIN1254 265 }, /* Windows-1254; Microsoft */ 266 { 267 "windows1255", PG_WIN1255 268 }, /* Windows-1255; Microsoft */ 269 { 270 "windows1256", PG_WIN1256 271 }, /* Windows-1256; Microsoft */ 272 { 273 "windows1257", PG_WIN1257 274 }, /* Windows-1257; Microsoft */ 275 { 276 "windows1258", PG_WIN1258 277 }, /* Windows-1258; Microsoft */ 278 { 279 "windows866", PG_WIN866 280 }, /* IBM866 */ 281 { 282 "windows874", PG_WIN874 283 }, /* Windows-874; Microsoft */ 284 { 285 "windows932", PG_SJIS 286 }, /* alias for Shift_JIS */ 287 { 288 "windows936", PG_GBK 289 }, /* alias for GBK */ 290 { 291 "windows949", PG_UHC 292 }, /* alias for UHC */ 293 { 294 "windows950", PG_BIG5 295 } /* alias for BIG5 */ 296 }; 297 298 /* ---------- 299 * These are "official" encoding names. 300 * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h) 301 * ---------- 302 */ 303 #ifndef WIN32 304 #define DEF_ENC2NAME(name, codepage) { #name, PG_##name } 305 #else 306 #define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage } 307 #endif 308 309 const pg_enc2name pg_enc2name_tbl[] = 310 { 311 DEF_ENC2NAME(SQL_ASCII, 0), 312 DEF_ENC2NAME(EUC_JP, 20932), 313 DEF_ENC2NAME(EUC_CN, 20936), 314 DEF_ENC2NAME(EUC_KR, 51949), 315 DEF_ENC2NAME(EUC_TW, 0), 316 DEF_ENC2NAME(EUC_JIS_2004, 20932), 317 DEF_ENC2NAME(UTF8, 65001), 318 DEF_ENC2NAME(MULE_INTERNAL, 0), 319 DEF_ENC2NAME(LATIN1, 28591), 320 DEF_ENC2NAME(LATIN2, 28592), 321 DEF_ENC2NAME(LATIN3, 28593), 322 DEF_ENC2NAME(LATIN4, 28594), 323 DEF_ENC2NAME(LATIN5, 28599), 324 DEF_ENC2NAME(LATIN6, 0), 325 DEF_ENC2NAME(LATIN7, 0), 326 DEF_ENC2NAME(LATIN8, 0), 327 DEF_ENC2NAME(LATIN9, 28605), 328 DEF_ENC2NAME(LATIN10, 0), 329 DEF_ENC2NAME(WIN1256, 1256), 330 DEF_ENC2NAME(WIN1258, 1258), 331 DEF_ENC2NAME(WIN866, 866), 332 DEF_ENC2NAME(WIN874, 874), 333 DEF_ENC2NAME(KOI8R, 20866), 334 DEF_ENC2NAME(WIN1251, 1251), 335 DEF_ENC2NAME(WIN1252, 1252), 336 DEF_ENC2NAME(ISO_8859_5, 28595), 337 DEF_ENC2NAME(ISO_8859_6, 28596), 338 DEF_ENC2NAME(ISO_8859_7, 28597), 339 DEF_ENC2NAME(ISO_8859_8, 28598), 340 DEF_ENC2NAME(WIN1250, 1250), 341 DEF_ENC2NAME(WIN1253, 1253), 342 DEF_ENC2NAME(WIN1254, 1254), 343 DEF_ENC2NAME(WIN1255, 1255), 344 DEF_ENC2NAME(WIN1257, 1257), 345 DEF_ENC2NAME(KOI8U, 21866), 346 DEF_ENC2NAME(SJIS, 932), 347 DEF_ENC2NAME(BIG5, 950), 348 DEF_ENC2NAME(GBK, 936), 349 DEF_ENC2NAME(UHC, 949), 350 DEF_ENC2NAME(GB18030, 54936), 351 DEF_ENC2NAME(JOHAB, 0), 352 DEF_ENC2NAME(SHIFT_JIS_2004, 932) 353 }; 354 355 /* ---------- 356 * These are encoding names for gettext. 357 * 358 * This covers all encodings except MULE_INTERNAL, which is alien to gettext. 359 * ---------- 360 */ 361 const pg_enc2gettext pg_enc2gettext_tbl[] = 362 { 363 {PG_SQL_ASCII, "US-ASCII"}, 364 {PG_UTF8, "UTF-8"}, 365 {PG_LATIN1, "LATIN1"}, 366 {PG_LATIN2, "LATIN2"}, 367 {PG_LATIN3, "LATIN3"}, 368 {PG_LATIN4, "LATIN4"}, 369 {PG_ISO_8859_5, "ISO-8859-5"}, 370 {PG_ISO_8859_6, "ISO_8859-6"}, 371 {PG_ISO_8859_7, "ISO-8859-7"}, 372 {PG_ISO_8859_8, "ISO-8859-8"}, 373 {PG_LATIN5, "LATIN5"}, 374 {PG_LATIN6, "LATIN6"}, 375 {PG_LATIN7, "LATIN7"}, 376 {PG_LATIN8, "LATIN8"}, 377 {PG_LATIN9, "LATIN-9"}, 378 {PG_LATIN10, "LATIN10"}, 379 {PG_KOI8R, "KOI8-R"}, 380 {PG_KOI8U, "KOI8-U"}, 381 {PG_WIN1250, "CP1250"}, 382 {PG_WIN1251, "CP1251"}, 383 {PG_WIN1252, "CP1252"}, 384 {PG_WIN1253, "CP1253"}, 385 {PG_WIN1254, "CP1254"}, 386 {PG_WIN1255, "CP1255"}, 387 {PG_WIN1256, "CP1256"}, 388 {PG_WIN1257, "CP1257"}, 389 {PG_WIN1258, "CP1258"}, 390 {PG_WIN866, "CP866"}, 391 {PG_WIN874, "CP874"}, 392 {PG_EUC_CN, "EUC-CN"}, 393 {PG_EUC_JP, "EUC-JP"}, 394 {PG_EUC_KR, "EUC-KR"}, 395 {PG_EUC_TW, "EUC-TW"}, 396 {PG_EUC_JIS_2004, "EUC-JP"}, 397 {PG_SJIS, "SHIFT-JIS"}, 398 {PG_BIG5, "BIG5"}, 399 {PG_GBK, "GBK"}, 400 {PG_UHC, "UHC"}, 401 {PG_GB18030, "GB18030"}, 402 {PG_JOHAB, "JOHAB"}, 403 {PG_SHIFT_JIS_2004, "SHIFT_JISX0213"}, 404 {0, NULL} 405 }; 406 407 408 /* 409 * Table of encoding names for ICU (currently covers backend encodings only) 410 * 411 * Reference: <https://ssl.icu-project.org/icu-bin/convexp> 412 * 413 * NULL entries are not supported by ICU, or their mapping is unclear. 414 */ 415 static const char *const pg_enc2icu_tbl[] = 416 { 417 NULL, /* PG_SQL_ASCII */ 418 "EUC-JP", /* PG_EUC_JP */ 419 "EUC-CN", /* PG_EUC_CN */ 420 "EUC-KR", /* PG_EUC_KR */ 421 "EUC-TW", /* PG_EUC_TW */ 422 NULL, /* PG_EUC_JIS_2004 */ 423 "UTF-8", /* PG_UTF8 */ 424 NULL, /* PG_MULE_INTERNAL */ 425 "ISO-8859-1", /* PG_LATIN1 */ 426 "ISO-8859-2", /* PG_LATIN2 */ 427 "ISO-8859-3", /* PG_LATIN3 */ 428 "ISO-8859-4", /* PG_LATIN4 */ 429 "ISO-8859-9", /* PG_LATIN5 */ 430 "ISO-8859-10", /* PG_LATIN6 */ 431 "ISO-8859-13", /* PG_LATIN7 */ 432 "ISO-8859-14", /* PG_LATIN8 */ 433 "ISO-8859-15", /* PG_LATIN9 */ 434 NULL, /* PG_LATIN10 */ 435 "CP1256", /* PG_WIN1256 */ 436 "CP1258", /* PG_WIN1258 */ 437 "CP866", /* PG_WIN866 */ 438 NULL, /* PG_WIN874 */ 439 "KOI8-R", /* PG_KOI8R */ 440 "CP1251", /* PG_WIN1251 */ 441 "CP1252", /* PG_WIN1252 */ 442 "ISO-8859-5", /* PG_ISO_8859_5 */ 443 "ISO-8859-6", /* PG_ISO_8859_6 */ 444 "ISO-8859-7", /* PG_ISO_8859_7 */ 445 "ISO-8859-8", /* PG_ISO_8859_8 */ 446 "CP1250", /* PG_WIN1250 */ 447 "CP1253", /* PG_WIN1253 */ 448 "CP1254", /* PG_WIN1254 */ 449 "CP1255", /* PG_WIN1255 */ 450 "CP1257", /* PG_WIN1257 */ 451 "KOI8-U", /* PG_KOI8U */ 452 }; 453 454 455 /* 456 * Is this encoding supported by ICU? 457 */ 458 bool 459 is_encoding_supported_by_icu(int encoding) 460 { 461 if (!PG_VALID_BE_ENCODING(encoding)) 462 return false; 463 return (pg_enc2icu_tbl[encoding] != NULL); 464 } 465 466 /* 467 * Returns ICU's name for encoding, or NULL if not supported 468 */ 469 const char * 470 get_encoding_name_for_icu(int encoding) 471 { 472 StaticAssertStmt(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1, 473 "pg_enc2icu_tbl incomplete"); 474 475 if (!PG_VALID_BE_ENCODING(encoding)) 476 return NULL; 477 return pg_enc2icu_tbl[encoding]; 478 } 479 480 481 /* ---------- 482 * Encoding checks, for error returns -1 else encoding id 483 * ---------- 484 */ 485 int 486 pg_valid_client_encoding(const char *name) 487 { 488 int enc; 489 490 if ((enc = pg_char_to_encoding(name)) < 0) 491 return -1; 492 493 if (!PG_VALID_FE_ENCODING(enc)) 494 return -1; 495 496 return enc; 497 } 498 499 int 500 pg_valid_server_encoding(const char *name) 501 { 502 int enc; 503 504 if ((enc = pg_char_to_encoding(name)) < 0) 505 return -1; 506 507 if (!PG_VALID_BE_ENCODING(enc)) 508 return -1; 509 510 return enc; 511 } 512 513 int 514 pg_valid_server_encoding_id(int encoding) 515 { 516 return PG_VALID_BE_ENCODING(encoding); 517 } 518 519 /* 520 * Remove irrelevant chars from encoding name, store at *newkey 521 * 522 * (Caller's responsibility to provide a large enough buffer) 523 */ 524 static char * 525 clean_encoding_name(const char *key, char *newkey) 526 { 527 const char *p; 528 char *np; 529 530 for (p = key, np = newkey; *p != '\0'; p++) 531 { 532 if (isalnum((unsigned char) *p)) 533 { 534 if (*p >= 'A' && *p <= 'Z') 535 *np++ = *p + 'a' - 'A'; 536 else 537 *np++ = *p; 538 } 539 } 540 *np = '\0'; 541 return newkey; 542 } 543 544 /* 545 * Search encoding by encoding name 546 * 547 * Returns encoding ID, or -1 if not recognized 548 */ 549 int 550 pg_char_to_encoding(const char *name) 551 { 552 unsigned int nel = lengthof(pg_encname_tbl); 553 const pg_encname *base = pg_encname_tbl, 554 *last = base + nel - 1, 555 *position; 556 int result; 557 char buff[NAMEDATALEN], 558 *key; 559 560 if (name == NULL || *name == '\0') 561 return -1; 562 563 if (strlen(name) >= NAMEDATALEN) 564 return -1; /* it's certainly not in the table */ 565 566 key = clean_encoding_name(name, buff); 567 568 while (last >= base) 569 { 570 position = base + ((last - base) >> 1); 571 result = key[0] - position->name[0]; 572 573 if (result == 0) 574 { 575 result = strcmp(key, position->name); 576 if (result == 0) 577 return position->encoding; 578 } 579 if (result < 0) 580 last = position - 1; 581 else 582 base = position + 1; 583 } 584 return -1; 585 } 586 587 const char * 588 pg_encoding_to_char(int encoding) 589 { 590 if (PG_VALID_ENCODING(encoding)) 591 { 592 const pg_enc2name *p = &pg_enc2name_tbl[encoding]; 593 594 Assert(encoding == p->encoding); 595 return p->name; 596 } 597 return ""; 598 } 599