1 /* Determine a canonical name for the current locale's character encoding. 2 3 Copyright (C) 2000-2006, 2008-2014 Free Software Foundation, Inc. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License along 16 with this program; if not, see <http://www.gnu.org/licenses/>. */ 17 18 /* Written by Bruno Haible <bruno@clisp.org>. */ 19 20 #include <config.h> 21 22 /* Specification. */ 23 #include "localcharset.h" 24 25 #include <fcntl.h> 26 #include <stddef.h> 27 #include <stdio.h> 28 #include <string.h> 29 #include <stdlib.h> 30 31 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET 32 # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */ 33 #endif 34 35 #if defined _WIN32 || defined __WIN32__ 36 # define WINDOWS_NATIVE 37 #endif 38 39 #if defined __EMX__ 40 /* Assume EMX program runs on OS/2, even if compiled under DOS. */ 41 # ifndef OS2 42 # define OS2 43 # endif 44 #endif 45 46 #if !defined WINDOWS_NATIVE 47 # include <unistd.h> 48 # if HAVE_LANGINFO_CODESET 49 # include <langinfo.h> 50 # else 51 # if 0 /* see comment below */ 52 # include <locale.h> 53 # endif 54 # endif 55 # ifdef __CYGWIN__ 56 # define WIN32_LEAN_AND_MEAN 57 # include <windows.h> 58 # endif 59 #elif defined WINDOWS_NATIVE 60 # define WIN32_LEAN_AND_MEAN 61 # include <windows.h> 62 #endif 63 #if defined OS2 64 # define INCL_DOS 65 # include <os2.h> 66 #endif 67 68 /* For MB_CUR_MAX_L */ 69 #if defined DARWIN7 70 # include <xlocale.h> 71 #endif 72 73 #if ENABLE_RELOCATABLE 74 # include "relocatable.h" 75 #else 76 # define relocate(pathname) (pathname) 77 #endif 78 79 /* Get LIBDIR. */ 80 #ifndef LIBDIR 81 # include "configmake.h" 82 #endif 83 84 /* Define O_NOFOLLOW to 0 on platforms where it does not exist. */ 85 #ifndef O_NOFOLLOW 86 # define O_NOFOLLOW 0 87 #endif 88 89 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__ 90 /* Native Windows, Cygwin, OS/2, DOS */ 91 # define ISSLASH(C) ((C) == '/' || (C) == '\\') 92 #endif 93 94 #ifndef DIRECTORY_SEPARATOR 95 # define DIRECTORY_SEPARATOR '/' 96 #endif 97 98 #ifndef ISSLASH 99 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR) 100 #endif 101 102 #if HAVE_DECL_GETC_UNLOCKED 103 # undef getc 104 # define getc getc_unlocked 105 #endif 106 107 /* The following static variable is declared 'volatile' to avoid a 108 possible multithread problem in the function get_charset_aliases. If we 109 are running in a threaded environment, and if two threads initialize 110 'charset_aliases' simultaneously, both will produce the same value, 111 and everything will be ok if the two assignments to 'charset_aliases' 112 are atomic. But I don't know what will happen if the two assignments mix. */ 113 #if __STDC__ != 1 114 # define volatile /* empty */ 115 #endif 116 /* Pointer to the contents of the charset.alias file, if it has already been 117 read, else NULL. Its format is: 118 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */ 119 static const char * volatile charset_aliases; 120 121 /* Return a pointer to the contents of the charset.alias file. */ 122 static const char * 123 get_charset_aliases (void) 124 { 125 const char *cp; 126 127 cp = charset_aliases; 128 if (cp == NULL) 129 { 130 #if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__) 131 const char *dir; 132 const char *base = "charset.alias"; 133 char *file_name; 134 135 /* Make it possible to override the charset.alias location. This is 136 necessary for running the testsuite before "make install". */ 137 dir = getenv ("CHARSETALIASDIR"); 138 if (dir == NULL || dir[0] == '\0') 139 dir = relocate (LIBDIR); 140 141 /* Concatenate dir and base into freshly allocated file_name. */ 142 { 143 size_t dir_len = strlen (dir); 144 size_t base_len = strlen (base); 145 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1])); 146 file_name = (char *) malloc (dir_len + add_slash + base_len + 1); 147 if (file_name != NULL) 148 { 149 memcpy (file_name, dir, dir_len); 150 if (add_slash) 151 file_name[dir_len] = DIRECTORY_SEPARATOR; 152 memcpy (file_name + dir_len + add_slash, base, base_len + 1); 153 } 154 } 155 156 if (file_name == NULL) 157 /* Out of memory. Treat the file as empty. */ 158 cp = ""; 159 else 160 { 161 int fd; 162 163 /* Open the file. Reject symbolic links on platforms that support 164 O_NOFOLLOW. This is a security feature. Without it, an attacker 165 could retrieve parts of the contents (namely, the tail of the 166 first line that starts with "* ") of an arbitrary file by placing 167 a symbolic link to that file under the name "charset.alias" in 168 some writable directory and defining the environment variable 169 CHARSETALIASDIR to point to that directory. */ 170 fd = open (file_name, 171 O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0)); 172 if (fd < 0) 173 /* File not found. Treat it as empty. */ 174 cp = ""; 175 else 176 { 177 FILE *fp; 178 179 fp = fdopen (fd, "r"); 180 if (fp == NULL) 181 { 182 /* Out of memory. Treat the file as empty. */ 183 close (fd); 184 cp = ""; 185 } 186 else 187 { 188 /* Parse the file's contents. */ 189 char *res_ptr = NULL; 190 size_t res_size = 0; 191 192 for (;;) 193 { 194 int c; 195 char buf1[50+1]; 196 char buf2[50+1]; 197 size_t l1, l2; 198 char *old_res_ptr; 199 200 c = getc (fp); 201 if (c == EOF) 202 break; 203 if (c == '\n' || c == ' ' || c == '\t') 204 continue; 205 if (c == '#') 206 { 207 /* Skip comment, to end of line. */ 208 do 209 c = getc (fp); 210 while (!(c == EOF || c == '\n')); 211 if (c == EOF) 212 break; 213 continue; 214 } 215 ungetc (c, fp); 216 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2) 217 break; 218 l1 = strlen (buf1); 219 l2 = strlen (buf2); 220 old_res_ptr = res_ptr; 221 if (res_size == 0) 222 { 223 res_size = l1 + 1 + l2 + 1; 224 res_ptr = (char *) malloc (res_size + 1); 225 } 226 else 227 { 228 res_size += l1 + 1 + l2 + 1; 229 res_ptr = (char *) realloc (res_ptr, res_size + 1); 230 } 231 if (res_ptr == NULL) 232 { 233 /* Out of memory. */ 234 res_size = 0; 235 free (old_res_ptr); 236 break; 237 } 238 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1); 239 strcpy (res_ptr + res_size - (l2 + 1), buf2); 240 } 241 fclose (fp); 242 if (res_size == 0) 243 cp = ""; 244 else 245 { 246 *(res_ptr + res_size) = '\0'; 247 cp = res_ptr; 248 } 249 } 250 } 251 252 free (file_name); 253 } 254 255 #else 256 257 # if defined DARWIN7 258 /* To avoid the trouble of installing a file that is shared by many 259 GNU packages -- many packaging systems have problems with this --, 260 simply inline the aliases here. */ 261 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 262 "ISO8859-2" "\0" "ISO-8859-2" "\0" 263 "ISO8859-4" "\0" "ISO-8859-4" "\0" 264 "ISO8859-5" "\0" "ISO-8859-5" "\0" 265 "ISO8859-7" "\0" "ISO-8859-7" "\0" 266 "ISO8859-9" "\0" "ISO-8859-9" "\0" 267 "ISO8859-13" "\0" "ISO-8859-13" "\0" 268 "ISO8859-15" "\0" "ISO-8859-15" "\0" 269 "KOI8-R" "\0" "KOI8-R" "\0" 270 "KOI8-U" "\0" "KOI8-U" "\0" 271 "CP866" "\0" "CP866" "\0" 272 "CP949" "\0" "CP949" "\0" 273 "CP1131" "\0" "CP1131" "\0" 274 "CP1251" "\0" "CP1251" "\0" 275 "eucCN" "\0" "GB2312" "\0" 276 "GB2312" "\0" "GB2312" "\0" 277 "eucJP" "\0" "EUC-JP" "\0" 278 "eucKR" "\0" "EUC-KR" "\0" 279 "Big5" "\0" "BIG5" "\0" 280 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0" 281 "GBK" "\0" "GBK" "\0" 282 "GB18030" "\0" "GB18030" "\0" 283 "SJIS" "\0" "SHIFT_JIS" "\0" 284 "ARMSCII-8" "\0" "ARMSCII-8" "\0" 285 "PT154" "\0" "PT154" "\0" 286 /*"ISCII-DEV" "\0" "?" "\0"*/ 287 "*" "\0" "UTF-8" "\0"; 288 # endif 289 290 # if defined VMS 291 /* To avoid the troubles of an extra file charset.alias_vms in the 292 sources of many GNU packages, simply inline the aliases here. */ 293 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation 294 "Compaq C Run-Time Library Reference Manual for OpenVMS systems" 295 section 10.7 "Handling Different Character Sets". */ 296 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 297 "ISO8859-2" "\0" "ISO-8859-2" "\0" 298 "ISO8859-5" "\0" "ISO-8859-5" "\0" 299 "ISO8859-7" "\0" "ISO-8859-7" "\0" 300 "ISO8859-8" "\0" "ISO-8859-8" "\0" 301 "ISO8859-9" "\0" "ISO-8859-9" "\0" 302 /* Japanese */ 303 "eucJP" "\0" "EUC-JP" "\0" 304 "SJIS" "\0" "SHIFT_JIS" "\0" 305 "DECKANJI" "\0" "DEC-KANJI" "\0" 306 "SDECKANJI" "\0" "EUC-JP" "\0" 307 /* Chinese */ 308 "eucTW" "\0" "EUC-TW" "\0" 309 "DECHANYU" "\0" "DEC-HANYU" "\0" 310 "DECHANZI" "\0" "GB2312" "\0" 311 /* Korean */ 312 "DECKOREAN" "\0" "EUC-KR" "\0"; 313 # endif 314 315 # if defined WINDOWS_NATIVE || defined __CYGWIN__ 316 /* To avoid the troubles of installing a separate file in the same 317 directory as the DLL and of retrieving the DLL's directory at 318 runtime, simply inline the aliases here. */ 319 320 cp = "CP936" "\0" "GBK" "\0" 321 "CP1361" "\0" "JOHAB" "\0" 322 "CP20127" "\0" "ASCII" "\0" 323 "CP20866" "\0" "KOI8-R" "\0" 324 "CP20936" "\0" "GB2312" "\0" 325 "CP21866" "\0" "KOI8-RU" "\0" 326 "CP28591" "\0" "ISO-8859-1" "\0" 327 "CP28592" "\0" "ISO-8859-2" "\0" 328 "CP28593" "\0" "ISO-8859-3" "\0" 329 "CP28594" "\0" "ISO-8859-4" "\0" 330 "CP28595" "\0" "ISO-8859-5" "\0" 331 "CP28596" "\0" "ISO-8859-6" "\0" 332 "CP28597" "\0" "ISO-8859-7" "\0" 333 "CP28598" "\0" "ISO-8859-8" "\0" 334 "CP28599" "\0" "ISO-8859-9" "\0" 335 "CP28605" "\0" "ISO-8859-15" "\0" 336 "CP38598" "\0" "ISO-8859-8" "\0" 337 "CP51932" "\0" "EUC-JP" "\0" 338 "CP51936" "\0" "GB2312" "\0" 339 "CP51949" "\0" "EUC-KR" "\0" 340 "CP51950" "\0" "EUC-TW" "\0" 341 "CP54936" "\0" "GB18030" "\0" 342 "CP65001" "\0" "UTF-8" "\0"; 343 # endif 344 #endif 345 346 charset_aliases = cp; 347 } 348 349 return cp; 350 } 351 352 /* Determine the current locale's character encoding, and canonicalize it 353 into one of the canonical names listed in config.charset. 354 The result must not be freed; it is statically allocated. 355 If the canonical name cannot be determined, the result is a non-canonical 356 name. */ 357 358 #ifdef STATIC 359 STATIC 360 #endif 361 const char * 362 locale_charset (void) 363 { 364 const char *codeset; 365 const char *aliases; 366 367 #if !(defined WINDOWS_NATIVE || defined OS2) 368 369 # if HAVE_LANGINFO_CODESET 370 371 /* Most systems support nl_langinfo (CODESET) nowadays. */ 372 codeset = nl_langinfo (CODESET); 373 374 # ifdef __CYGWIN__ 375 /* Cygwin < 1.7 does not have locales. nl_langinfo (CODESET) always 376 returns "US-ASCII". Return the suffix of the locale name from the 377 environment variables (if present) or the codepage as a number. */ 378 if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0) 379 { 380 const char *locale; 381 static char buf[2 + 10 + 1]; 382 383 locale = getenv ("LC_ALL"); 384 if (locale == NULL || locale[0] == '\0') 385 { 386 locale = getenv ("LC_CTYPE"); 387 if (locale == NULL || locale[0] == '\0') 388 locale = getenv ("LANG"); 389 } 390 if (locale != NULL && locale[0] != '\0') 391 { 392 /* If the locale name contains an encoding after the dot, return 393 it. */ 394 const char *dot = strchr (locale, '.'); 395 396 if (dot != NULL) 397 { 398 const char *modifier; 399 400 dot++; 401 /* Look for the possible @... trailer and remove it, if any. */ 402 modifier = strchr (dot, '@'); 403 if (modifier == NULL) 404 return dot; 405 if (modifier - dot < sizeof (buf)) 406 { 407 memcpy (buf, dot, modifier - dot); 408 buf [modifier - dot] = '\0'; 409 return buf; 410 } 411 } 412 } 413 414 /* The Windows API has a function returning the locale's codepage as a 415 number: GetACP(). This encoding is used by Cygwin, unless the user 416 has set the environment variable CYGWIN=codepage:oem (which very few 417 people do). 418 Output directed to console windows needs to be converted (to 419 GetOEMCP() if the console is using a raster font, or to 420 GetConsoleOutputCP() if it is using a TrueType font). Cygwin does 421 this conversion transparently (see winsup/cygwin/fhandler_console.cc), 422 converting to GetConsoleOutputCP(). This leads to correct results, 423 except when SetConsoleOutputCP has been called and a raster font is 424 in use. */ 425 sprintf (buf, "CP%u", GetACP ()); 426 codeset = buf; 427 } 428 # endif 429 430 # else 431 432 /* On old systems which lack it, use setlocale or getenv. */ 433 const char *locale = NULL; 434 435 /* But most old systems don't have a complete set of locales. Some 436 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't 437 use setlocale here; it would return "C" when it doesn't support the 438 locale name the user has set. */ 439 # if 0 440 locale = setlocale (LC_CTYPE, NULL); 441 # endif 442 if (locale == NULL || locale[0] == '\0') 443 { 444 locale = getenv ("LC_ALL"); 445 if (locale == NULL || locale[0] == '\0') 446 { 447 locale = getenv ("LC_CTYPE"); 448 if (locale == NULL || locale[0] == '\0') 449 locale = getenv ("LANG"); 450 } 451 } 452 453 /* On some old systems, one used to set locale = "iso8859_1". On others, 454 you set it to "language_COUNTRY.charset". In any case, we resolve it 455 through the charset.alias file. */ 456 codeset = locale; 457 458 # endif 459 460 #elif defined WINDOWS_NATIVE 461 462 static char buf[2 + 10 + 1]; 463 464 /* The Windows API has a function returning the locale's codepage as a 465 number: GetACP(). 466 When the output goes to a console window, it needs to be provided in 467 GetOEMCP() encoding if the console is using a raster font, or in 468 GetConsoleOutputCP() encoding if it is using a TrueType font. 469 But in GUI programs and for output sent to files and pipes, GetACP() 470 encoding is the best bet. */ 471 sprintf (buf, "CP%u", GetACP ()); 472 codeset = buf; 473 474 #elif defined OS2 475 476 const char *locale; 477 static char buf[2 + 10 + 1]; 478 ULONG cp[3]; 479 ULONG cplen; 480 481 /* Allow user to override the codeset, as set in the operating system, 482 with standard language environment variables. */ 483 locale = getenv ("LC_ALL"); 484 if (locale == NULL || locale[0] == '\0') 485 { 486 locale = getenv ("LC_CTYPE"); 487 if (locale == NULL || locale[0] == '\0') 488 locale = getenv ("LANG"); 489 } 490 if (locale != NULL && locale[0] != '\0') 491 { 492 /* If the locale name contains an encoding after the dot, return it. */ 493 const char *dot = strchr (locale, '.'); 494 495 if (dot != NULL) 496 { 497 const char *modifier; 498 499 dot++; 500 /* Look for the possible @... trailer and remove it, if any. */ 501 modifier = strchr (dot, '@'); 502 if (modifier == NULL) 503 return dot; 504 if (modifier - dot < sizeof (buf)) 505 { 506 memcpy (buf, dot, modifier - dot); 507 buf [modifier - dot] = '\0'; 508 return buf; 509 } 510 } 511 512 /* Resolve through the charset.alias file. */ 513 codeset = locale; 514 } 515 else 516 { 517 /* OS/2 has a function returning the locale's codepage as a number. */ 518 if (DosQueryCp (sizeof (cp), cp, &cplen)) 519 codeset = ""; 520 else 521 { 522 sprintf (buf, "CP%u", cp[0]); 523 codeset = buf; 524 } 525 } 526 527 #endif 528 529 if (codeset == NULL) 530 /* The canonical name cannot be determined. */ 531 codeset = ""; 532 533 /* Resolve alias. */ 534 for (aliases = get_charset_aliases (); 535 *aliases != '\0'; 536 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) 537 if (strcmp (codeset, aliases) == 0 538 || (aliases[0] == '*' && aliases[1] == '\0')) 539 { 540 codeset = aliases + strlen (aliases) + 1; 541 break; 542 } 543 544 /* Don't return an empty string. GNU libc and GNU libiconv interpret 545 the empty string as denoting "the locale's character encoding", 546 thus GNU libiconv would call this function a second time. */ 547 if (codeset[0] == '\0') 548 codeset = "ASCII"; 549 550 #ifdef DARWIN7 551 /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8" 552 (the default codeset) does not work when MB_CUR_MAX is 1. */ 553 if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1) 554 codeset = "ASCII"; 555 #endif 556 557 return codeset; 558 } 559