1 /* Determine a canonical name for the current locale's character encoding. 2 3 Copyright (C) 2000-2006, 2008-2015 Free Software Foundation, Inc. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License along 16 with this program; if not, see <http://www.gnu.org/licenses/>. */ 17 18 /* Written by Bruno Haible <bruno@clisp.org>. */ 19 20 #include <config.h> 21 22 /* Specification. */ 23 #include "localcharset.h" 24 25 #include <fcntl.h> 26 #include <stddef.h> 27 #include <stdio.h> 28 #include <string.h> 29 #include <stdlib.h> 30 31 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET 32 # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */ 33 #endif 34 35 #if defined _WIN32 || defined __WIN32__ 36 # define WINDOWS_NATIVE 37 # include <locale.h> 38 #endif 39 40 #if defined __EMX__ 41 /* Assume EMX program runs on OS/2, even if compiled under DOS. */ 42 # ifndef OS2 43 # define OS2 44 # endif 45 #endif 46 47 #if !defined WINDOWS_NATIVE 48 # include <unistd.h> 49 # if HAVE_LANGINFO_CODESET 50 # include <langinfo.h> 51 # else 52 # if 0 /* see comment below */ 53 # include <locale.h> 54 # endif 55 # endif 56 # ifdef __CYGWIN__ 57 # define WIN32_LEAN_AND_MEAN 58 # include <windows.h> 59 # endif 60 #elif defined WINDOWS_NATIVE 61 # define WIN32_LEAN_AND_MEAN 62 # include <windows.h> 63 #endif 64 #if defined OS2 65 # define INCL_DOS 66 # include <os2.h> 67 #endif 68 69 /* For MB_CUR_MAX_L */ 70 #if defined DARWIN7 71 # include <xlocale.h> 72 #endif 73 74 #if ENABLE_RELOCATABLE 75 # include "relocatable.h" 76 #else 77 # define relocate(pathname) (pathname) 78 #endif 79 80 /* Get LIBDIR. */ 81 #ifndef LIBDIR 82 # include "configmake.h" 83 #endif 84 85 /* Define O_NOFOLLOW to 0 on platforms where it does not exist. */ 86 #ifndef O_NOFOLLOW 87 # define O_NOFOLLOW 0 88 #endif 89 90 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__ 91 /* Native Windows, Cygwin, OS/2, DOS */ 92 # define ISSLASH(C) ((C) == '/' || (C) == '\\') 93 #endif 94 95 #ifndef DIRECTORY_SEPARATOR 96 # define DIRECTORY_SEPARATOR '/' 97 #endif 98 99 #ifndef ISSLASH 100 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR) 101 #endif 102 103 #if HAVE_DECL_GETC_UNLOCKED 104 # undef getc 105 # define getc getc_unlocked 106 #endif 107 108 /* The following static variable is declared 'volatile' to avoid a 109 possible multithread problem in the function get_charset_aliases. If we 110 are running in a threaded environment, and if two threads initialize 111 'charset_aliases' simultaneously, both will produce the same value, 112 and everything will be ok if the two assignments to 'charset_aliases' 113 are atomic. But I don't know what will happen if the two assignments mix. */ 114 #if __STDC__ != 1 115 # define volatile /* empty */ 116 #endif 117 /* Pointer to the contents of the charset.alias file, if it has already been 118 read, else NULL. Its format is: 119 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */ 120 static const char * volatile charset_aliases; 121 122 /* Return a pointer to the contents of the charset.alias file. */ 123 static const char * 124 get_charset_aliases (void) 125 { 126 const char *cp; 127 128 cp = charset_aliases; 129 if (cp == NULL) 130 { 131 #if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__ || defined OS2) 132 const char *dir; 133 const char *base = "charset.alias"; 134 char *file_name; 135 136 /* Make it possible to override the charset.alias location. This is 137 necessary for running the testsuite before "make install". */ 138 dir = getenv ("CHARSETALIASDIR"); 139 if (dir == NULL || dir[0] == '\0') 140 dir = relocate (LIBDIR); 141 142 /* Concatenate dir and base into freshly allocated file_name. */ 143 { 144 size_t dir_len = strlen (dir); 145 size_t base_len = strlen (base); 146 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1])); 147 file_name = (char *) malloc (dir_len + add_slash + base_len + 1); 148 if (file_name != NULL) 149 { 150 memcpy (file_name, dir, dir_len); 151 if (add_slash) 152 file_name[dir_len] = DIRECTORY_SEPARATOR; 153 memcpy (file_name + dir_len + add_slash, base, base_len + 1); 154 } 155 } 156 157 if (file_name == NULL) 158 /* Out of memory. Treat the file as empty. */ 159 cp = ""; 160 else 161 { 162 int fd; 163 164 /* Open the file. Reject symbolic links on platforms that support 165 O_NOFOLLOW. This is a security feature. Without it, an attacker 166 could retrieve parts of the contents (namely, the tail of the 167 first line that starts with "* ") of an arbitrary file by placing 168 a symbolic link to that file under the name "charset.alias" in 169 some writable directory and defining the environment variable 170 CHARSETALIASDIR to point to that directory. */ 171 fd = open (file_name, 172 O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0)); 173 if (fd < 0) 174 /* File not found. Treat it as empty. */ 175 cp = ""; 176 else 177 { 178 FILE *fp; 179 180 fp = fdopen (fd, "r"); 181 if (fp == NULL) 182 { 183 /* Out of memory. Treat the file as empty. */ 184 close (fd); 185 cp = ""; 186 } 187 else 188 { 189 /* Parse the file's contents. */ 190 char *res_ptr = NULL; 191 size_t res_size = 0; 192 193 for (;;) 194 { 195 int c; 196 char buf1[50+1]; 197 char buf2[50+1]; 198 size_t l1, l2; 199 char *old_res_ptr; 200 201 c = getc (fp); 202 if (c == EOF) 203 break; 204 if (c == '\n' || c == ' ' || c == '\t') 205 continue; 206 if (c == '#') 207 { 208 /* Skip comment, to end of line. */ 209 do 210 c = getc (fp); 211 while (!(c == EOF || c == '\n')); 212 if (c == EOF) 213 break; 214 continue; 215 } 216 ungetc (c, fp); 217 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2) 218 break; 219 l1 = strlen (buf1); 220 l2 = strlen (buf2); 221 old_res_ptr = res_ptr; 222 if (res_size == 0) 223 { 224 res_size = l1 + 1 + l2 + 1; 225 res_ptr = (char *) malloc (res_size + 1); 226 } 227 else 228 { 229 res_size += l1 + 1 + l2 + 1; 230 res_ptr = (char *) realloc (res_ptr, res_size + 1); 231 } 232 if (res_ptr == NULL) 233 { 234 /* Out of memory. */ 235 res_size = 0; 236 free (old_res_ptr); 237 break; 238 } 239 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1); 240 strcpy (res_ptr + res_size - (l2 + 1), buf2); 241 } 242 fclose (fp); 243 if (res_size == 0) 244 cp = ""; 245 else 246 { 247 *(res_ptr + res_size) = '\0'; 248 cp = res_ptr; 249 } 250 } 251 } 252 253 free (file_name); 254 } 255 256 #else 257 258 # if defined DARWIN7 259 /* To avoid the trouble of installing a file that is shared by many 260 GNU packages -- many packaging systems have problems with this --, 261 simply inline the aliases here. */ 262 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 263 "ISO8859-2" "\0" "ISO-8859-2" "\0" 264 "ISO8859-4" "\0" "ISO-8859-4" "\0" 265 "ISO8859-5" "\0" "ISO-8859-5" "\0" 266 "ISO8859-7" "\0" "ISO-8859-7" "\0" 267 "ISO8859-9" "\0" "ISO-8859-9" "\0" 268 "ISO8859-13" "\0" "ISO-8859-13" "\0" 269 "ISO8859-15" "\0" "ISO-8859-15" "\0" 270 "KOI8-R" "\0" "KOI8-R" "\0" 271 "KOI8-U" "\0" "KOI8-U" "\0" 272 "CP866" "\0" "CP866" "\0" 273 "CP949" "\0" "CP949" "\0" 274 "CP1131" "\0" "CP1131" "\0" 275 "CP1251" "\0" "CP1251" "\0" 276 "eucCN" "\0" "GB2312" "\0" 277 "GB2312" "\0" "GB2312" "\0" 278 "eucJP" "\0" "EUC-JP" "\0" 279 "eucKR" "\0" "EUC-KR" "\0" 280 "Big5" "\0" "BIG5" "\0" 281 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0" 282 "GBK" "\0" "GBK" "\0" 283 "GB18030" "\0" "GB18030" "\0" 284 "SJIS" "\0" "SHIFT_JIS" "\0" 285 "ARMSCII-8" "\0" "ARMSCII-8" "\0" 286 "PT154" "\0" "PT154" "\0" 287 /*"ISCII-DEV" "\0" "?" "\0"*/ 288 "*" "\0" "UTF-8" "\0"; 289 # endif 290 291 # if defined VMS 292 /* To avoid the troubles of an extra file charset.alias_vms in the 293 sources of many GNU packages, simply inline the aliases here. */ 294 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation 295 "Compaq C Run-Time Library Reference Manual for OpenVMS systems" 296 section 10.7 "Handling Different Character Sets". */ 297 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 298 "ISO8859-2" "\0" "ISO-8859-2" "\0" 299 "ISO8859-5" "\0" "ISO-8859-5" "\0" 300 "ISO8859-7" "\0" "ISO-8859-7" "\0" 301 "ISO8859-8" "\0" "ISO-8859-8" "\0" 302 "ISO8859-9" "\0" "ISO-8859-9" "\0" 303 /* Japanese */ 304 "eucJP" "\0" "EUC-JP" "\0" 305 "SJIS" "\0" "SHIFT_JIS" "\0" 306 "DECKANJI" "\0" "DEC-KANJI" "\0" 307 "SDECKANJI" "\0" "EUC-JP" "\0" 308 /* Chinese */ 309 "eucTW" "\0" "EUC-TW" "\0" 310 "DECHANYU" "\0" "DEC-HANYU" "\0" 311 "DECHANZI" "\0" "GB2312" "\0" 312 /* Korean */ 313 "DECKOREAN" "\0" "EUC-KR" "\0"; 314 # endif 315 316 # if defined WINDOWS_NATIVE || defined __CYGWIN__ 317 /* To avoid the troubles of installing a separate file in the same 318 directory as the DLL and of retrieving the DLL's directory at 319 runtime, simply inline the aliases here. */ 320 321 cp = "CP936" "\0" "GBK" "\0" 322 "CP1361" "\0" "JOHAB" "\0" 323 "CP20127" "\0" "ASCII" "\0" 324 "CP20866" "\0" "KOI8-R" "\0" 325 "CP20936" "\0" "GB2312" "\0" 326 "CP21866" "\0" "KOI8-RU" "\0" 327 "CP28591" "\0" "ISO-8859-1" "\0" 328 "CP28592" "\0" "ISO-8859-2" "\0" 329 "CP28593" "\0" "ISO-8859-3" "\0" 330 "CP28594" "\0" "ISO-8859-4" "\0" 331 "CP28595" "\0" "ISO-8859-5" "\0" 332 "CP28596" "\0" "ISO-8859-6" "\0" 333 "CP28597" "\0" "ISO-8859-7" "\0" 334 "CP28598" "\0" "ISO-8859-8" "\0" 335 "CP28599" "\0" "ISO-8859-9" "\0" 336 "CP28605" "\0" "ISO-8859-15" "\0" 337 "CP38598" "\0" "ISO-8859-8" "\0" 338 "CP51932" "\0" "EUC-JP" "\0" 339 "CP51936" "\0" "GB2312" "\0" 340 "CP51949" "\0" "EUC-KR" "\0" 341 "CP51950" "\0" "EUC-TW" "\0" 342 "CP54936" "\0" "GB18030" "\0" 343 "CP65001" "\0" "UTF-8" "\0"; 344 # endif 345 # if defined OS2 346 /* To avoid the troubles of installing a separate file in the same 347 directory as the DLL and of retrieving the DLL's directory at 348 runtime, simply inline the aliases here. */ 349 350 /* The list of encodings is taken from "List of OS/2 Codepages" 351 by Alex Taylor: 352 <http://altsan.org/os2/toolkits/uls/index.html#codepages>. 353 See also "IBM Globalization - Code page identifiers": 354 <http://www-01.ibm.com/software/globalization/cp/cp_cpgid.html>. */ 355 cp = "CP813" "\0" "ISO-8859-7" "\0" 356 "CP878" "\0" "KOI8-R" "\0" 357 "CP819" "\0" "ISO-8859-1" "\0" 358 "CP912" "\0" "ISO-8859-2" "\0" 359 "CP913" "\0" "ISO-8859-3" "\0" 360 "CP914" "\0" "ISO-8859-4" "\0" 361 "CP915" "\0" "ISO-8859-5" "\0" 362 "CP916" "\0" "ISO-8859-8" "\0" 363 "CP920" "\0" "ISO-8859-9" "\0" 364 "CP921" "\0" "ISO-8859-13" "\0" 365 "CP923" "\0" "ISO-8859-15" "\0" 366 "CP954" "\0" "EUC-JP" "\0" 367 "CP964" "\0" "EUC-TW" "\0" 368 "CP970" "\0" "EUC-KR" "\0" 369 "CP1089" "\0" "ISO-8859-6" "\0" 370 "CP1208" "\0" "UTF-8" "\0" 371 "CP1381" "\0" "GB2312" "\0" 372 "CP1386" "\0" "GBK" "\0" 373 "CP3372" "\0" "EUC-JP" "\0"; 374 # endif 375 #endif 376 377 charset_aliases = cp; 378 } 379 380 return cp; 381 } 382 383 /* Determine the current locale's character encoding, and canonicalize it 384 into one of the canonical names listed in config.charset. 385 The result must not be freed; it is statically allocated. 386 If the canonical name cannot be determined, the result is a non-canonical 387 name. */ 388 389 #ifdef STATIC 390 STATIC 391 #endif 392 const char * 393 locale_charset (void) 394 { 395 const char *codeset; 396 const char *aliases; 397 398 #if !(defined WINDOWS_NATIVE || defined OS2) 399 400 # if HAVE_LANGINFO_CODESET 401 402 /* Most systems support nl_langinfo (CODESET) nowadays. */ 403 codeset = nl_langinfo (CODESET); 404 405 # ifdef __CYGWIN__ 406 /* Cygwin < 1.7 does not have locales. nl_langinfo (CODESET) always 407 returns "US-ASCII". Return the suffix of the locale name from the 408 environment variables (if present) or the codepage as a number. */ 409 if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0) 410 { 411 const char *locale; 412 static char buf[2 + 10 + 1]; 413 414 locale = getenv ("LC_ALL"); 415 if (locale == NULL || locale[0] == '\0') 416 { 417 locale = getenv ("LC_CTYPE"); 418 if (locale == NULL || locale[0] == '\0') 419 locale = getenv ("LANG"); 420 } 421 if (locale != NULL && locale[0] != '\0') 422 { 423 /* If the locale name contains an encoding after the dot, return 424 it. */ 425 const char *dot = strchr (locale, '.'); 426 427 if (dot != NULL) 428 { 429 const char *modifier; 430 431 dot++; 432 /* Look for the possible @... trailer and remove it, if any. */ 433 modifier = strchr (dot, '@'); 434 if (modifier == NULL) 435 return dot; 436 if (modifier - dot < sizeof (buf)) 437 { 438 memcpy (buf, dot, modifier - dot); 439 buf [modifier - dot] = '\0'; 440 return buf; 441 } 442 } 443 } 444 445 /* The Windows API has a function returning the locale's codepage as a 446 number: GetACP(). This encoding is used by Cygwin, unless the user 447 has set the environment variable CYGWIN=codepage:oem (which very few 448 people do). 449 Output directed to console windows needs to be converted (to 450 GetOEMCP() if the console is using a raster font, or to 451 GetConsoleOutputCP() if it is using a TrueType font). Cygwin does 452 this conversion transparently (see winsup/cygwin/fhandler_console.cc), 453 converting to GetConsoleOutputCP(). This leads to correct results, 454 except when SetConsoleOutputCP has been called and a raster font is 455 in use. */ 456 sprintf (buf, "CP%u", GetACP ()); 457 codeset = buf; 458 } 459 # endif 460 461 # else 462 463 /* On old systems which lack it, use setlocale or getenv. */ 464 const char *locale = NULL; 465 466 /* But most old systems don't have a complete set of locales. Some 467 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't 468 use setlocale here; it would return "C" when it doesn't support the 469 locale name the user has set. */ 470 # if 0 471 locale = setlocale (LC_CTYPE, NULL); 472 # endif 473 if (locale == NULL || locale[0] == '\0') 474 { 475 locale = getenv ("LC_ALL"); 476 if (locale == NULL || locale[0] == '\0') 477 { 478 locale = getenv ("LC_CTYPE"); 479 if (locale == NULL || locale[0] == '\0') 480 locale = getenv ("LANG"); 481 } 482 } 483 484 /* On some old systems, one used to set locale = "iso8859_1". On others, 485 you set it to "language_COUNTRY.charset". In any case, we resolve it 486 through the charset.alias file. */ 487 codeset = locale; 488 489 # endif 490 491 #elif defined WINDOWS_NATIVE 492 493 static char buf[2 + 10 + 1]; 494 495 /* The Windows API has a function returning the locale's codepage as 496 a number, but the value doesn't change according to what the 497 'setlocale' call specified. So we use it as a last resort, in 498 case the string returned by 'setlocale' doesn't specify the 499 codepage. */ 500 char *current_locale = setlocale (LC_ALL, NULL); 501 char *pdot; 502 503 /* If they set different locales for different categories, 504 'setlocale' will return a semi-colon separated list of locale 505 values. To make sure we use the correct one, we choose LC_CTYPE. */ 506 if (strchr (current_locale, ';')) 507 current_locale = setlocale (LC_CTYPE, NULL); 508 509 pdot = strrchr (current_locale, '.'); 510 if (pdot) 511 sprintf (buf, "CP%s", pdot + 1); 512 else 513 { 514 /* The Windows API has a function returning the locale's codepage as a 515 number: GetACP(). 516 When the output goes to a console window, it needs to be provided in 517 GetOEMCP() encoding if the console is using a raster font, or in 518 GetConsoleOutputCP() encoding if it is using a TrueType font. 519 But in GUI programs and for output sent to files and pipes, GetACP() 520 encoding is the best bet. */ 521 sprintf (buf, "CP%u", GetACP ()); 522 } 523 codeset = buf; 524 525 #elif defined OS2 526 527 const char *locale; 528 static char buf[2 + 10 + 1]; 529 ULONG cp[3]; 530 ULONG cplen; 531 532 codeset = NULL; 533 534 /* Allow user to override the codeset, as set in the operating system, 535 with standard language environment variables. */ 536 locale = getenv ("LC_ALL"); 537 if (locale == NULL || locale[0] == '\0') 538 { 539 locale = getenv ("LC_CTYPE"); 540 if (locale == NULL || locale[0] == '\0') 541 locale = getenv ("LANG"); 542 } 543 if (locale != NULL && locale[0] != '\0') 544 { 545 /* If the locale name contains an encoding after the dot, return it. */ 546 const char *dot = strchr (locale, '.'); 547 548 if (dot != NULL) 549 { 550 const char *modifier; 551 552 dot++; 553 /* Look for the possible @... trailer and remove it, if any. */ 554 modifier = strchr (dot, '@'); 555 if (modifier == NULL) 556 return dot; 557 if (modifier - dot < sizeof (buf)) 558 { 559 memcpy (buf, dot, modifier - dot); 560 buf [modifier - dot] = '\0'; 561 return buf; 562 } 563 } 564 565 /* For the POSIX locale, don't use the system's codepage. */ 566 if (strcmp (locale, "C") == 0 || strcmp (locale, "POSIX") == 0) 567 codeset = ""; 568 } 569 570 if (codeset == NULL) 571 { 572 /* OS/2 has a function returning the locale's codepage as a number. */ 573 if (DosQueryCp (sizeof (cp), cp, &cplen)) 574 codeset = ""; 575 else 576 { 577 sprintf (buf, "CP%u", cp[0]); 578 codeset = buf; 579 } 580 } 581 582 #endif 583 584 if (codeset == NULL) 585 /* The canonical name cannot be determined. */ 586 codeset = ""; 587 588 /* Resolve alias. */ 589 for (aliases = get_charset_aliases (); 590 *aliases != '\0'; 591 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) 592 if (strcmp (codeset, aliases) == 0 593 || (aliases[0] == '*' && aliases[1] == '\0')) 594 { 595 codeset = aliases + strlen (aliases) + 1; 596 break; 597 } 598 599 /* Don't return an empty string. GNU libc and GNU libiconv interpret 600 the empty string as denoting "the locale's character encoding", 601 thus GNU libiconv would call this function a second time. */ 602 if (codeset[0] == '\0') 603 codeset = "ASCII"; 604 605 #ifdef DARWIN7 606 /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8" 607 (the default codeset) does not work when MB_CUR_MAX is 1. */ 608 if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1) 609 codeset = "ASCII"; 610 #endif 611 612 return codeset; 613 } 614