1 /* Determine a canonical name for the current locale's character encoding. 2 3 Copyright (C) 2000-2006, 2008-2018 Free Software Foundation, Inc. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License along 16 with this program; if not, see <https://www.gnu.org/licenses/>. */ 17 18 /* Written by Bruno Haible <bruno@clisp.org>. */ 19 20 #include <config.h> 21 22 /* Specification. */ 23 #include "localcharset.h" 24 25 #include <stddef.h> 26 #include <stdio.h> 27 #include <string.h> 28 #include <stdlib.h> 29 30 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET 31 # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */ 32 #endif 33 34 #if defined _WIN32 && !defined __CYGWIN__ 35 # define WINDOWS_NATIVE 36 # include <locale.h> 37 #endif 38 39 #if defined __EMX__ 40 /* Assume EMX program runs on OS/2, even if compiled under DOS. 
*/ 41 # ifndef OS2 42 # define OS2 43 # endif 44 #endif 45 46 #if !defined WINDOWS_NATIVE 47 # if HAVE_LANGINFO_CODESET 48 # include <langinfo.h> 49 # else 50 # if 0 /* see comment regarding use of setlocale(), below */ 51 # include <locale.h> 52 # endif 53 # endif 54 # ifdef __CYGWIN__ 55 # define WIN32_LEAN_AND_MEAN 56 # include <windows.h> 57 # endif 58 #elif defined WINDOWS_NATIVE 59 # define WIN32_LEAN_AND_MEAN 60 # include <windows.h> 61 #endif 62 #if defined OS2 63 # define INCL_DOS 64 # include <os2.h> 65 #endif 66 67 /* For MB_CUR_MAX_L */ 68 #if defined DARWIN7 69 # include <xlocale.h> 70 #endif 71 72 73 #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2 74 75 /* On these platforms, we use a mapping from non-canonical encoding name 76 to GNU canonical encoding name. */ 77 78 /* With glibc-2.1 or newer, we don't need any canonicalization, 79 because glibc has iconv and both glibc and libiconv support all 80 GNU canonical names directly. */ 81 # if !((defined __GNU_LIBRARY__ && __GLIBC__ >= 2) || defined __UCLIBC__) 82 83 struct table_entry 84 { 85 const char alias[11+1]; 86 const char canonical[11+1]; 87 }; 88 89 /* Table of platform-dependent mappings, sorted in ascending order. */ 90 static const struct table_entry alias_table[] = 91 { 92 # if defined __FreeBSD__ /* FreeBSD */ 93 /*{ "ARMSCII-8", "ARMSCII-8" },*/ 94 { "Big5", "BIG5" }, 95 { "C", "ASCII" }, 96 /*{ "CP1131", "CP1131" },*/ 97 /*{ "CP1251", "CP1251" },*/ 98 /*{ "CP866", "CP866" },*/ 99 /*{ "GB18030", "GB18030" },*/ 100 /*{ "GB2312", "GB2312" },*/ 101 /*{ "GBK", "GBK" },*/ 102 /*{ "ISCII-DEV", "?" 
},*/ 103 { "ISO8859-1", "ISO-8859-1" }, 104 { "ISO8859-13", "ISO-8859-13" }, 105 { "ISO8859-15", "ISO-8859-15" }, 106 { "ISO8859-2", "ISO-8859-2" }, 107 { "ISO8859-5", "ISO-8859-5" }, 108 { "ISO8859-7", "ISO-8859-7" }, 109 { "ISO8859-9", "ISO-8859-9" }, 110 /*{ "KOI8-R", "KOI8-R" },*/ 111 /*{ "KOI8-U", "KOI8-U" },*/ 112 { "SJIS", "SHIFT_JIS" }, 113 { "US-ASCII", "ASCII" }, 114 { "eucCN", "GB2312" }, 115 { "eucJP", "EUC-JP" }, 116 { "eucKR", "EUC-KR" } 117 # define alias_table_defined 118 # endif 119 # if defined __NetBSD__ /* NetBSD */ 120 { "646", "ASCII" }, 121 /*{ "ARMSCII-8", "ARMSCII-8" },*/ 122 /*{ "BIG5", "BIG5" },*/ 123 { "Big5-HKSCS", "BIG5-HKSCS" }, 124 /*{ "CP1251", "CP1251" },*/ 125 /*{ "CP866", "CP866" },*/ 126 /*{ "GB18030", "GB18030" },*/ 127 /*{ "GB2312", "GB2312" },*/ 128 { "ISO8859-1", "ISO-8859-1" }, 129 { "ISO8859-13", "ISO-8859-13" }, 130 { "ISO8859-15", "ISO-8859-15" }, 131 { "ISO8859-2", "ISO-8859-2" }, 132 { "ISO8859-4", "ISO-8859-4" }, 133 { "ISO8859-5", "ISO-8859-5" }, 134 { "ISO8859-7", "ISO-8859-7" }, 135 /*{ "KOI8-R", "KOI8-R" },*/ 136 /*{ "KOI8-U", "KOI8-U" },*/ 137 /*{ "PT154", "PT154" },*/ 138 { "SJIS", "SHIFT_JIS" }, 139 { "eucCN", "GB2312" }, 140 { "eucJP", "EUC-JP" }, 141 { "eucKR", "EUC-KR" }, 142 { "eucTW", "EUC-TW" } 143 # define alias_table_defined 144 # endif 145 # if defined __OpenBSD__ /* OpenBSD */ 146 { "646", "ASCII" }, 147 { "ISO8859-1", "ISO-8859-1" }, 148 { "ISO8859-13", "ISO-8859-13" }, 149 { "ISO8859-15", "ISO-8859-15" }, 150 { "ISO8859-2", "ISO-8859-2" }, 151 { "ISO8859-4", "ISO-8859-4" }, 152 { "ISO8859-5", "ISO-8859-5" }, 153 { "ISO8859-7", "ISO-8859-7" } 154 # define alias_table_defined 155 # endif 156 # if defined __APPLE__ && defined __MACH__ /* Mac OS X */ 157 /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is 158 useless: 159 - It returns the empty string when LANG is set to a locale of the 160 form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8 161 LC_CTYPE file. 
162 - The environment variables LANG, LC_CTYPE, LC_ALL are not set by 163 the system; nl_langinfo(CODESET) returns "US-ASCII" in this case. 164 - The documentation says: 165 "... all code that calls BSD system routines should ensure 166 that the const *char parameters of these routines are in UTF-8 167 encoding. All BSD system functions expect their string 168 parameters to be in UTF-8 encoding and nothing else." 169 It also says 170 "An additional caveat is that string parameters for files, 171 paths, and other file-system entities must be in canonical 172 UTF-8. In a canonical UTF-8 Unicode string, all decomposable 173 characters are decomposed ..." 174 but this is not true: You can pass non-decomposed UTF-8 strings 175 to file system functions, and it is the OS which will convert 176 them to decomposed UTF-8 before accessing the file system. 177 - The Apple Terminal application displays UTF-8 by default. 178 - However, other applications are free to use different encodings: 179 - xterm uses ISO-8859-1 by default. 180 - TextEdit uses MacRoman by default. 181 We prefer UTF-8 over decomposed UTF-8-MAC because one should 182 minimize the use of decomposed Unicode. Unfortunately, through the 183 Darwin file system, decomposed UTF-8 strings are leaked into user 184 space nevertheless. 185 Then there are also the locales with encodings other than US-ASCII 186 and UTF-8. These locales can be occasionally useful to users (e.g. 187 when grepping through ISO-8859-1 encoded text files), when all their 188 file names are in US-ASCII. 189 */ 190 { "ARMSCII-8", "ARMSCII-8" }, 191 { "Big5", "BIG5" }, 192 { "Big5HKSCS", "BIG5-HKSCS" }, 193 { "CP1131", "CP1131" }, 194 { "CP1251", "CP1251" }, 195 { "CP866", "CP866" }, 196 { "CP949", "CP949" }, 197 { "GB18030", "GB18030" }, 198 { "GB2312", "GB2312" }, 199 { "GBK", "GBK" }, 200 /*{ "ISCII-DEV", "?" 
},*/ 201 { "ISO8859-1", "ISO-8859-1" }, 202 { "ISO8859-13", "ISO-8859-13" }, 203 { "ISO8859-15", "ISO-8859-15" }, 204 { "ISO8859-2", "ISO-8859-2" }, 205 { "ISO8859-4", "ISO-8859-4" }, 206 { "ISO8859-5", "ISO-8859-5" }, 207 { "ISO8859-7", "ISO-8859-7" }, 208 { "ISO8859-9", "ISO-8859-9" }, 209 { "KOI8-R", "KOI8-R" }, 210 { "KOI8-U", "KOI8-U" }, 211 { "PT154", "PT154" }, 212 { "SJIS", "SHIFT_JIS" }, 213 { "eucCN", "GB2312" }, 214 { "eucJP", "EUC-JP" }, 215 { "eucKR", "EUC-KR" } 216 # define alias_table_defined 217 # endif 218 # if defined _AIX /* AIX */ 219 /*{ "GBK", "GBK" },*/ 220 { "IBM-1046", "CP1046" }, 221 { "IBM-1124", "CP1124" }, 222 { "IBM-1129", "CP1129" }, 223 { "IBM-1252", "CP1252" }, 224 { "IBM-850", "CP850" }, 225 { "IBM-856", "CP856" }, 226 { "IBM-921", "ISO-8859-13" }, 227 { "IBM-922", "CP922" }, 228 { "IBM-932", "CP932" }, 229 { "IBM-943", "CP943" }, 230 { "IBM-eucCN", "GB2312" }, 231 { "IBM-eucJP", "EUC-JP" }, 232 { "IBM-eucKR", "EUC-KR" }, 233 { "IBM-eucTW", "EUC-TW" }, 234 { "ISO8859-1", "ISO-8859-1" }, 235 { "ISO8859-15", "ISO-8859-15" }, 236 { "ISO8859-2", "ISO-8859-2" }, 237 { "ISO8859-5", "ISO-8859-5" }, 238 { "ISO8859-6", "ISO-8859-6" }, 239 { "ISO8859-7", "ISO-8859-7" }, 240 { "ISO8859-8", "ISO-8859-8" }, 241 { "ISO8859-9", "ISO-8859-9" }, 242 { "TIS-620", "TIS-620" }, 243 /*{ "UTF-8", "UTF-8" },*/ 244 { "big5", "BIG5" } 245 # define alias_table_defined 246 # endif 247 # if defined __hpux /* HP-UX */ 248 { "SJIS", "SHIFT_JIS" }, 249 { "arabic8", "HP-ARABIC8" }, 250 { "big5", "BIG5" }, 251 { "cp1251", "CP1251" }, 252 { "eucJP", "EUC-JP" }, 253 { "eucKR", "EUC-KR" }, 254 { "eucTW", "EUC-TW" }, 255 { "gb18030", "GB18030" }, 256 { "greek8", "HP-GREEK8" }, 257 { "hebrew8", "HP-HEBREW8" }, 258 { "hkbig5", "BIG5-HKSCS" }, 259 { "hp15CN", "GB2312" }, 260 { "iso88591", "ISO-8859-1" }, 261 { "iso885913", "ISO-8859-13" }, 262 { "iso885915", "ISO-8859-15" }, 263 { "iso88592", "ISO-8859-2" }, 264 { "iso88594", "ISO-8859-4" }, 265 { "iso88595", 
"ISO-8859-5" }, 266 { "iso88596", "ISO-8859-6" }, 267 { "iso88597", "ISO-8859-7" }, 268 { "iso88598", "ISO-8859-8" }, 269 { "iso88599", "ISO-8859-9" }, 270 { "kana8", "HP-KANA8" }, 271 { "koi8r", "KOI8-R" }, 272 { "roman8", "HP-ROMAN8" }, 273 { "tis620", "TIS-620" }, 274 { "turkish8", "HP-TURKISH8" }, 275 { "utf8", "UTF-8" } 276 # define alias_table_defined 277 # endif 278 # if defined __sgi /* IRIX */ 279 { "ISO8859-1", "ISO-8859-1" }, 280 { "ISO8859-15", "ISO-8859-15" }, 281 { "ISO8859-2", "ISO-8859-2" }, 282 { "ISO8859-5", "ISO-8859-5" }, 283 { "ISO8859-7", "ISO-8859-7" }, 284 { "ISO8859-9", "ISO-8859-9" }, 285 { "eucCN", "GB2312" }, 286 { "eucJP", "EUC-JP" }, 287 { "eucKR", "EUC-KR" }, 288 { "eucTW", "EUC-TW" } 289 # define alias_table_defined 290 # endif 291 # if defined __osf__ /* OSF/1 */ 292 /*{ "GBK", "GBK" },*/ 293 { "ISO8859-1", "ISO-8859-1" }, 294 { "ISO8859-15", "ISO-8859-15" }, 295 { "ISO8859-2", "ISO-8859-2" }, 296 { "ISO8859-4", "ISO-8859-4" }, 297 { "ISO8859-5", "ISO-8859-5" }, 298 { "ISO8859-7", "ISO-8859-7" }, 299 { "ISO8859-8", "ISO-8859-8" }, 300 { "ISO8859-9", "ISO-8859-9" }, 301 { "KSC5601", "CP949" }, 302 { "SJIS", "SHIFT_JIS" }, 303 { "TACTIS", "TIS-620" }, 304 /*{ "UTF-8", "UTF-8" },*/ 305 { "big5", "BIG5" }, 306 { "cp850", "CP850" }, 307 { "dechanyu", "DEC-HANYU" }, 308 { "dechanzi", "GB2312" }, 309 { "deckanji", "DEC-KANJI" }, 310 { "deckorean", "EUC-KR" }, 311 { "eucJP", "EUC-JP" }, 312 { "eucKR", "EUC-KR" }, 313 { "eucTW", "EUC-TW" }, 314 { "sdeckanji", "EUC-JP" } 315 # define alias_table_defined 316 # endif 317 # if defined __sun /* Solaris */ 318 { "5601", "EUC-KR" }, 319 { "646", "ASCII" }, 320 /*{ "BIG5", "BIG5" },*/ 321 { "Big5-HKSCS", "BIG5-HKSCS" }, 322 { "GB18030", "GB18030" }, 323 /*{ "GBK", "GBK" },*/ 324 { "ISO8859-1", "ISO-8859-1" }, 325 { "ISO8859-11", "TIS-620" }, 326 { "ISO8859-13", "ISO-8859-13" }, 327 { "ISO8859-15", "ISO-8859-15" }, 328 { "ISO8859-2", "ISO-8859-2" }, 329 { "ISO8859-3", "ISO-8859-3" }, 330 { 
"ISO8859-4", "ISO-8859-4" }, 331 { "ISO8859-5", "ISO-8859-5" }, 332 { "ISO8859-6", "ISO-8859-6" }, 333 { "ISO8859-7", "ISO-8859-7" }, 334 { "ISO8859-8", "ISO-8859-8" }, 335 { "ISO8859-9", "ISO-8859-9" }, 336 { "PCK", "SHIFT_JIS" }, 337 { "TIS620.2533", "TIS-620" }, 338 /*{ "UTF-8", "UTF-8" },*/ 339 { "ansi-1251", "CP1251" }, 340 { "cns11643", "EUC-TW" }, 341 { "eucJP", "EUC-JP" }, 342 { "gb2312", "GB2312" }, 343 { "koi8-r", "KOI8-R" } 344 # define alias_table_defined 345 # endif 346 # if defined __minix /* Minix */ 347 { "646", "ASCII" } 348 # define alias_table_defined 349 # endif 350 # if defined WINDOWS_NATIVE || defined __CYGWIN__ /* Windows */ 351 { "CP1361", "JOHAB" }, 352 { "CP20127", "ASCII" }, 353 { "CP20866", "KOI8-R" }, 354 { "CP20936", "GB2312" }, 355 { "CP21866", "KOI8-RU" }, 356 { "CP28591", "ISO-8859-1" }, 357 { "CP28592", "ISO-8859-2" }, 358 { "CP28593", "ISO-8859-3" }, 359 { "CP28594", "ISO-8859-4" }, 360 { "CP28595", "ISO-8859-5" }, 361 { "CP28596", "ISO-8859-6" }, 362 { "CP28597", "ISO-8859-7" }, 363 { "CP28598", "ISO-8859-8" }, 364 { "CP28599", "ISO-8859-9" }, 365 { "CP28605", "ISO-8859-15" }, 366 { "CP38598", "ISO-8859-8" }, 367 { "CP51932", "EUC-JP" }, 368 { "CP51936", "GB2312" }, 369 { "CP51949", "EUC-KR" }, 370 { "CP51950", "EUC-TW" }, 371 { "CP54936", "GB18030" }, 372 { "CP65001", "UTF-8" }, 373 { "CP936", "GBK" } 374 # define alias_table_defined 375 # endif 376 # if defined OS2 /* OS/2 */ 377 /* The list of encodings is taken from "List of OS/2 Codepages" 378 by Alex Taylor: 379 <http://altsan.org/os2/toolkits/uls/index.html#codepages>. 380 See also "IBM Globalization - Code page identifiers": 381 <https://www-01.ibm.com/software/globalization/cp/cp_cpgid.html>. 
*/ 382 { "CP1089", "ISO-8859-6" }, 383 { "CP1208", "UTF-8" }, 384 { "CP1381", "GB2312" }, 385 { "CP1386", "GBK" }, 386 { "CP3372", "EUC-JP" }, 387 { "CP813", "ISO-8859-7" }, 388 { "CP819", "ISO-8859-1" }, 389 { "CP878", "KOI8-R" }, 390 { "CP912", "ISO-8859-2" }, 391 { "CP913", "ISO-8859-3" }, 392 { "CP914", "ISO-8859-4" }, 393 { "CP915", "ISO-8859-5" }, 394 { "CP916", "ISO-8859-8" }, 395 { "CP920", "ISO-8859-9" }, 396 { "CP921", "ISO-8859-13" }, 397 { "CP923", "ISO-8859-15" }, 398 { "CP954", "EUC-JP" }, 399 { "CP964", "EUC-TW" }, 400 { "CP970", "EUC-KR" } 401 # define alias_table_defined 402 # endif 403 # if defined VMS /* OpenVMS */ 404 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation 405 "Compaq C Run-Time Library Reference Manual for OpenVMS systems" 406 section 10.7 "Handling Different Character Sets". */ 407 { "DECHANYU", "DEC-HANYU" }, 408 { "DECHANZI", "GB2312" }, 409 { "DECKANJI", "DEC-KANJI" }, 410 { "DECKOREAN", "EUC-KR" }, 411 { "ISO8859-1", "ISO-8859-1" }, 412 { "ISO8859-2", "ISO-8859-2" }, 413 { "ISO8859-5", "ISO-8859-5" }, 414 { "ISO8859-7", "ISO-8859-7" }, 415 { "ISO8859-8", "ISO-8859-8" }, 416 { "ISO8859-9", "ISO-8859-9" }, 417 { "SDECKANJI", "EUC-JP" }, 418 { "SJIS", "SHIFT_JIS" }, 419 { "eucJP", "EUC-JP" }, 420 { "eucTW", "EUC-TW" } 421 # define alias_table_defined 422 # endif 423 # ifndef alias_table_defined 424 /* Just a dummy entry, to avoid a C syntax error. */ 425 { "", "" } 426 # endif 427 }; 428 429 # endif 430 431 #else 432 433 /* On these platforms, we use a mapping from locale name to GNU canonical 434 encoding name. */ 435 436 struct table_entry 437 { 438 const char locale[17+1]; 439 const char canonical[11+1]; 440 }; 441 442 /* Table of platform-dependent mappings, sorted in ascending order. 
*/ 443 static const struct table_entry locale_table[] = 444 { 445 # if defined __FreeBSD__ /* FreeBSD 4.2 */ 446 { "cs_CZ.ISO_8859-2", "ISO-8859-2" }, 447 { "da_DK.DIS_8859-15", "ISO-8859-15" }, 448 { "da_DK.ISO_8859-1", "ISO-8859-1" }, 449 { "de_AT.DIS_8859-15", "ISO-8859-15" }, 450 { "de_AT.ISO_8859-1", "ISO-8859-1" }, 451 { "de_CH.DIS_8859-15", "ISO-8859-15" }, 452 { "de_CH.ISO_8859-1", "ISO-8859-1" }, 453 { "de_DE.DIS_8859-15", "ISO-8859-15" }, 454 { "de_DE.ISO_8859-1", "ISO-8859-1" }, 455 { "en_AU.DIS_8859-15", "ISO-8859-15" }, 456 { "en_AU.ISO_8859-1", "ISO-8859-1" }, 457 { "en_CA.DIS_8859-15", "ISO-8859-15" }, 458 { "en_CA.ISO_8859-1", "ISO-8859-1" }, 459 { "en_GB.DIS_8859-15", "ISO-8859-15" }, 460 { "en_GB.ISO_8859-1", "ISO-8859-1" }, 461 { "en_US.DIS_8859-15", "ISO-8859-15" }, 462 { "en_US.ISO_8859-1", "ISO-8859-1" }, 463 { "es_ES.DIS_8859-15", "ISO-8859-15" }, 464 { "es_ES.ISO_8859-1", "ISO-8859-1" }, 465 { "fi_FI.DIS_8859-15", "ISO-8859-15" }, 466 { "fi_FI.ISO_8859-1", "ISO-8859-1" }, 467 { "fr_BE.DIS_8859-15", "ISO-8859-15" }, 468 { "fr_BE.ISO_8859-1", "ISO-8859-1" }, 469 { "fr_CA.DIS_8859-15", "ISO-8859-15" }, 470 { "fr_CA.ISO_8859-1", "ISO-8859-1" }, 471 { "fr_CH.DIS_8859-15", "ISO-8859-15" }, 472 { "fr_CH.ISO_8859-1", "ISO-8859-1" }, 473 { "fr_FR.DIS_8859-15", "ISO-8859-15" }, 474 { "fr_FR.ISO_8859-1", "ISO-8859-1" }, 475 { "hr_HR.ISO_8859-2", "ISO-8859-2" }, 476 { "hu_HU.ISO_8859-2", "ISO-8859-2" }, 477 { "is_IS.DIS_8859-15", "ISO-8859-15" }, 478 { "is_IS.ISO_8859-1", "ISO-8859-1" }, 479 { "it_CH.DIS_8859-15", "ISO-8859-15" }, 480 { "it_CH.ISO_8859-1", "ISO-8859-1" }, 481 { "it_IT.DIS_8859-15", "ISO-8859-15" }, 482 { "it_IT.ISO_8859-1", "ISO-8859-1" }, 483 { "ja_JP.EUC", "EUC-JP" }, 484 { "ja_JP.SJIS", "SHIFT_JIS" }, 485 { "ja_JP.Shift_JIS", "SHIFT_JIS" }, 486 { "ko_KR.EUC", "EUC-KR" }, 487 { "la_LN.ASCII", "ASCII" }, 488 { "la_LN.DIS_8859-15", "ISO-8859-15" }, 489 { "la_LN.ISO_8859-1", "ISO-8859-1" }, 490 { "la_LN.ISO_8859-2", "ISO-8859-2" }, 491 { 
"la_LN.ISO_8859-4", "ISO-8859-4" }, 492 { "lt_LN.ASCII", "ASCII" }, 493 { "lt_LN.DIS_8859-15", "ISO-8859-15" }, 494 { "lt_LN.ISO_8859-1", "ISO-8859-1" }, 495 { "lt_LN.ISO_8859-2", "ISO-8859-2" }, 496 { "lt_LT.ISO_8859-4", "ISO-8859-4" }, 497 { "nl_BE.DIS_8859-15", "ISO-8859-15" }, 498 { "nl_BE.ISO_8859-1", "ISO-8859-1" }, 499 { "nl_NL.DIS_8859-15", "ISO-8859-15" }, 500 { "nl_NL.ISO_8859-1", "ISO-8859-1" }, 501 { "no_NO.DIS_8859-15", "ISO-8859-15" }, 502 { "no_NO.ISO_8859-1", "ISO-8859-1" }, 503 { "pl_PL.ISO_8859-2", "ISO-8859-2" }, 504 { "pt_PT.DIS_8859-15", "ISO-8859-15" }, 505 { "pt_PT.ISO_8859-1", "ISO-8859-1" }, 506 { "ru_RU.CP866", "CP866" }, 507 { "ru_RU.ISO_8859-5", "ISO-8859-5" }, 508 { "ru_RU.KOI8-R", "KOI8-R" }, 509 { "ru_SU.CP866", "CP866" }, 510 { "ru_SU.ISO_8859-5", "ISO-8859-5" }, 511 { "ru_SU.KOI8-R", "KOI8-R" }, 512 { "sl_SI.ISO_8859-2", "ISO-8859-2" }, 513 { "sv_SE.DIS_8859-15", "ISO-8859-15" }, 514 { "sv_SE.ISO_8859-1", "ISO-8859-1" }, 515 { "uk_UA.KOI8-U", "KOI8-U" }, 516 { "zh_CN.EUC", "GB2312" }, 517 { "zh_TW.BIG5", "BIG5" }, 518 { "zh_TW.Big5", "BIG5" } 519 # define locale_table_defined 520 # endif 521 # if defined __DJGPP__ /* DOS / DJGPP 2.03 */ 522 /* The encodings given here may not all be correct. 523 If you find that the encoding given for your language and 524 country is not the one your DOS machine actually uses, just 525 correct it in this file, and send a mail to 526 Juan Manuel Guerrero <juan.guerrero@gmx.de> 527 and <bug-gnulib@gnu.org>. */ 528 { "C", "ASCII" }, 529 { "ar", "CP864" }, 530 { "ar_AE", "CP864" }, 531 { "ar_DZ", "CP864" }, 532 { "ar_EG", "CP864" }, 533 { "ar_IQ", "CP864" }, 534 { "ar_IR", "CP864" }, 535 { "ar_JO", "CP864" }, 536 { "ar_KW", "CP864" }, 537 { "ar_MA", "CP864" }, 538 { "ar_OM", "CP864" }, 539 { "ar_QA", "CP864" }, 540 { "ar_SA", "CP864" }, 541 { "ar_SY", "CP864" }, 542 { "be", "CP866" }, 543 { "be_BE", "CP866" }, 544 { "bg", "CP866" }, /* not CP855 ?? */ 545 { "bg_BG", "CP866" }, /* not CP855 ?? 
*/ 546 { "ca", "CP850" }, 547 { "ca_ES", "CP850" }, 548 { "cs", "CP852" }, 549 { "cs_CZ", "CP852" }, 550 { "da", "CP865" }, /* not CP850 ?? */ 551 { "da_DK", "CP865" }, /* not CP850 ?? */ 552 { "de", "CP850" }, 553 { "de_AT", "CP850" }, 554 { "de_CH", "CP850" }, 555 { "de_DE", "CP850" }, 556 { "el", "CP869" }, 557 { "el_GR", "CP869" }, 558 { "en", "CP850" }, 559 { "en_AU", "CP850" }, /* not CP437 ?? */ 560 { "en_CA", "CP850" }, 561 { "en_GB", "CP850" }, 562 { "en_NZ", "CP437" }, 563 { "en_US", "CP437" }, 564 { "en_ZA", "CP850" }, /* not CP437 ?? */ 565 { "eo", "CP850" }, 566 { "eo_EO", "CP850" }, 567 { "es", "CP850" }, 568 { "es_AR", "CP850" }, 569 { "es_BO", "CP850" }, 570 { "es_CL", "CP850" }, 571 { "es_CO", "CP850" }, 572 { "es_CR", "CP850" }, 573 { "es_CU", "CP850" }, 574 { "es_DO", "CP850" }, 575 { "es_EC", "CP850" }, 576 { "es_ES", "CP850" }, 577 { "es_GT", "CP850" }, 578 { "es_HN", "CP850" }, 579 { "es_MX", "CP850" }, 580 { "es_NI", "CP850" }, 581 { "es_PA", "CP850" }, 582 { "es_PE", "CP850" }, 583 { "es_PY", "CP850" }, 584 { "es_SV", "CP850" }, 585 { "es_UY", "CP850" }, 586 { "es_VE", "CP850" }, 587 { "et", "CP850" }, 588 { "et_EE", "CP850" }, 589 { "eu", "CP850" }, 590 { "eu_ES", "CP850" }, 591 { "fi", "CP850" }, 592 { "fi_FI", "CP850" }, 593 { "fr", "CP850" }, 594 { "fr_BE", "CP850" }, 595 { "fr_CA", "CP850" }, 596 { "fr_CH", "CP850" }, 597 { "fr_FR", "CP850" }, 598 { "ga", "CP850" }, 599 { "ga_IE", "CP850" }, 600 { "gd", "CP850" }, 601 { "gd_GB", "CP850" }, 602 { "gl", "CP850" }, 603 { "gl_ES", "CP850" }, 604 { "he", "CP862" }, 605 { "he_IL", "CP862" }, 606 { "hr", "CP852" }, 607 { "hr_HR", "CP852" }, 608 { "hu", "CP852" }, 609 { "hu_HU", "CP852" }, 610 { "id", "CP850" }, /* not CP437 ?? */ 611 { "id_ID", "CP850" }, /* not CP437 ?? */ 612 { "is", "CP861" }, /* not CP850 ?? */ 613 { "is_IS", "CP861" }, /* not CP850 ?? 
*/ 614 { "it", "CP850" }, 615 { "it_CH", "CP850" }, 616 { "it_IT", "CP850" }, 617 { "ja", "CP932" }, 618 { "ja_JP", "CP932" }, 619 { "kr", "CP949" }, /* not CP934 ?? */ 620 { "kr_KR", "CP949" }, /* not CP934 ?? */ 621 { "lt", "CP775" }, 622 { "lt_LT", "CP775" }, 623 { "lv", "CP775" }, 624 { "lv_LV", "CP775" }, 625 { "mk", "CP866" }, /* not CP855 ?? */ 626 { "mk_MK", "CP866" }, /* not CP855 ?? */ 627 { "mt", "CP850" }, 628 { "mt_MT", "CP850" }, 629 { "nb", "CP865" }, /* not CP850 ?? */ 630 { "nb_NO", "CP865" }, /* not CP850 ?? */ 631 { "nl", "CP850" }, 632 { "nl_BE", "CP850" }, 633 { "nl_NL", "CP850" }, 634 { "nn", "CP865" }, /* not CP850 ?? */ 635 { "nn_NO", "CP865" }, /* not CP850 ?? */ 636 { "no", "CP865" }, /* not CP850 ?? */ 637 { "no_NO", "CP865" }, /* not CP850 ?? */ 638 { "pl", "CP852" }, 639 { "pl_PL", "CP852" }, 640 { "pt", "CP850" }, 641 { "pt_BR", "CP850" }, 642 { "pt_PT", "CP850" }, 643 { "ro", "CP852" }, 644 { "ro_RO", "CP852" }, 645 { "ru", "CP866" }, 646 { "ru_RU", "CP866" }, 647 { "sk", "CP852" }, 648 { "sk_SK", "CP852" }, 649 { "sl", "CP852" }, 650 { "sl_SI", "CP852" }, 651 { "sq", "CP852" }, 652 { "sq_AL", "CP852" }, 653 { "sr", "CP852" }, /* CP852 or CP866 or CP855 ?? */ 654 { "sr_CS", "CP852" }, /* CP852 or CP866 or CP855 ?? */ 655 { "sr_YU", "CP852" }, /* CP852 or CP866 or CP855 ?? */ 656 { "sv", "CP850" }, 657 { "sv_SE", "CP850" }, 658 { "th", "CP874" }, 659 { "th_TH", "CP874" }, 660 { "tr", "CP857" }, 661 { "tr_TR", "CP857" }, 662 { "uk", "CP1125" }, 663 { "uk_UA", "CP1125" }, 664 { "zh_CN", "GBK" }, 665 { "zh_TW", "CP950" } /* not CP938 ?? */ 666 # define locale_table_defined 667 # endif 668 # ifndef locale_table_defined 669 /* Just a dummy entry, to avoid a C syntax error. */ 670 { "", "" } 671 # endif 672 }; 673 674 #endif 675 676 677 /* Determine the current locale's character encoding, and canonicalize it 678 into one of the canonical names listed in localcharset.h. 679 The result must not be freed; it is statically allocated. 
// If the canonical name cannot be determined, the result is a non-canonical
// name.  */

/* locale_charset takes no arguments and returns a NUL-terminated, non-empty
   encoding name.  The caller must not free or modify the result.

   How the name is obtained depends on the platform:
     - With nl_langinfo: from nl_langinfo (CODESET).  On Cygwin, when that
       reports "US-ASCII", from the encoding part of LC_ALL / LC_CTYPE /
       LANG, or else from GetACP ().
     - On native Windows: from the code page suffix of the setlocale ()
       result, or else from GetACP ().
     - On OS/2: from the encoding part of LC_ALL / LC_CTYPE / LANG, or
       else from DosQueryCp ().
     - Elsewhere: from the locale name in LC_ALL / LC_CTYPE / LANG, looked
       up in locale_table.
   Except for the environment-derived names on Cygwin and OS/2, which are
   returned as they are, the name is then canonicalized through alias_table
   where such a table is compiled in.

   On Cygwin, native Windows and OS/2 the result may point into a
   function-local static buffer that the next call overwrites; on Cygwin
   and OS/2 it may also point directly into the string returned by
   getenv ().  */
#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;

#if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  Return the suffix of the locale name from the
     environment variables (if present) or the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  An empty encoding part ("xx_YY." or "xx_YY.@mod") is
             treated as absent, so that the result is never the empty
             string.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL && dot[1] != '\0' && dot[1] != '@')
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                return dot;
              if ((size_t) (modifier - dot) < sizeof (buf))
                {
                  memcpy (buf, dot, modifier - dot);
                  buf[modifier - dot] = '\0';
                  return buf;
                }
            }
        }

      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().  This encoding is used by Cygwin, unless the user
         has set the environment variable CYGWIN=codepage:oem (which very few
         people do).
         Output directed to console windows needs to be converted (to
         GetOEMCP() if the console is using a raster font, or to
         GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
         this conversion transparently (see winsup/cygwin/fhandler_console.cc),
         converting to GetConsoleOutputCP().  This leads to correct results,
         except when SetConsoleOutputCP has been called and a raster font is
         in use.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

# elif defined WINDOWS_NATIVE

  static char buf[2 + 10 + 1];

  /* The Windows API has a function returning the locale's codepage as
     a number, but the value doesn't change according to what the
     'setlocale' call specified.  So we use it as a last resort, in
     case the string returned by 'setlocale' doesn't specify the
     codepage.  */
  const char *current_locale = setlocale (LC_ALL, NULL);
  const char *pdot;

  /* If they set different locales for different categories,
     'setlocale' will return a semi-colon separated list of locale
     values.  To make sure we use the correct one, we choose LC_CTYPE.  */
  if (current_locale != NULL && strchr (current_locale, ';') != NULL)
    current_locale = setlocale (LC_CTYPE, NULL);

  /* Be defensive: a null result is handled like a name without suffix.  */
  pdot = (current_locale != NULL ? strrchr (current_locale, '.') : NULL);
  if (pdot != NULL && 2 + strlen (pdot + 1) + 1 <= sizeof (buf))
    sprintf (buf, "CP%s", pdot + 1);
  else
    {
      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().
         When the output goes to a console window, it needs to be provided in
         GetOEMCP() encoding if the console is using a raster font, or in
         GetConsoleOutputCP() encoding if it is using a TrueType font.
         But in GUI programs and for output sent to files and pipes, GetACP()
         encoding is the best bet.  */
      sprintf (buf, "CP%u", GetACP ());
    }

  /* For a UTF-8 locale, newer Windows versions make setlocale return a
     suffix of "utf8" or "UTF-8" instead of the code page number 65001.
     "CPutf8" is not a valid encoding name, so map these directly.  (The
     numeric form "CP65001" is handled by alias_table.)  */
  if (strcmp (buf + 2, "utf8") == 0 || strcmp (buf + 2, "UTF-8") == 0)
    codeset = "UTF-8";
  else
    codeset = buf;

# elif defined OS2

  const char *locale;
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  codeset = NULL;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.
         An empty encoding part ("xx_YY." or "xx_YY.@mod") is treated as
         absent, so that the result is never the empty string.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL && dot[1] != '\0' && dot[1] != '@')
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          if ((size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf[modifier - dot] = '\0';
              return buf;
            }
        }

      /* For the POSIX locale, don't use the system's codepage.  */
      if (strcmp (locale, "C") == 0 || strcmp (locale, "POSIX") == 0)
        codeset = "";
    }

  if (codeset == NULL)
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* ULONG is 'unsigned long', so it must be printed with %lu.  */
          sprintf (buf, "CP%lu", (unsigned long) cp[0]);
          codeset = buf;
        }
    }

# else

#  error "Add code for other platforms here."

# endif

  /* Resolve alias.  */
  {
    /* Set once codeset is known to be final.  */
    int resolved = 0;

# ifdef alias_table_defined
    /* On some platforms, UTF-8 locales are the most frequently used ones.
       Speed up the common case and slow down the less common cases by
       testing for this case first.  */
#  if defined __OpenBSD__ || (defined __APPLE__ && defined __MACH__) || defined __sun || defined __CYGWIN__
    if (strcmp (codeset, "UTF-8") == 0)
      resolved = 1;
    else
#  endif
      {
        /* The table is sorted.  Perform a binary search.
           Invariant:
             for i < lo, strcmp (alias_table[i].alias, codeset) < 0,
             for i >= hi, strcmp (alias_table[i].alias, codeset) > 0.  */
        size_t lo = 0;
        size_t hi = sizeof (alias_table) / sizeof (alias_table[0]);

        while (lo < hi)
          {
            size_t mid = lo + ((hi - lo) >> 1); /* >= lo, < hi */
            int cmp = strcmp (alias_table[mid].alias, codeset);

            if (cmp < 0)
              lo = mid + 1;
            else if (cmp > 0)
              hi = mid;
            else
              {
                codeset = alias_table[mid].canonical;
                resolved = 1;
                break;
              }
          }
      }
# endif

    if (!resolved)
      {
        /* Did not find it in the table.  */
        /* On Mac OS X, all modern locales use the UTF-8 encoding.
           BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
# if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
        codeset = "UTF-8";
# else
        /* Don't return an empty string.  GNU libc and GNU libiconv interpret
           the empty string as denoting "the locale's character encoding",
           thus GNU libiconv would call this function a second time.  */
        if (codeset[0] == '\0')
          codeset = "ASCII";
# endif
      }
  }

#else

  /* On old systems which lack nl_langinfo, use getenv.
     setlocale (LC_CTYPE, NULL) is deliberately not used here: most old
     systems don't have a complete set of locales.  Some (like DJGPP) have
     only the C locale, so setlocale would return "C" when it doesn't
     support the locale name the user has set.  */
  const char *locale = getenv ("LC_ALL");

  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
      if (locale == NULL)
        locale = "";
    }

  /* Map locale name to canonical encoding name.  */
  {
    /* Set once codeset has been taken from the table.  */
    int resolved = 0;

# ifdef locale_table_defined
    /* The table is sorted.  Perform a binary search.
       Invariant:
         for i < lo, strcmp (locale_table[i].locale, locale) < 0,
         for i >= hi, strcmp (locale_table[i].locale, locale) > 0.  */
    size_t lo = 0;
    size_t hi = sizeof (locale_table) / sizeof (locale_table[0]);

    while (lo < hi)
      {
        size_t mid = lo + ((hi - lo) >> 1); /* >= lo, < hi */
        int cmp = strcmp (locale_table[mid].locale, locale);

        if (cmp < 0)
          lo = mid + 1;
        else if (cmp > 0)
          hi = mid;
        else
          {
            codeset = locale_table[mid].canonical;
            resolved = 1;
            break;
          }
      }
# endif

    if (!resolved)
      {
        /* Did not find it in the table.  */
        /* On Mac OS X, all modern locales use the UTF-8 encoding.
           BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
# if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
        codeset = "UTF-8";
# else
        /* The canonical name cannot be determined.  */
        /* Don't return an empty string.  GNU libc and GNU libiconv interpret
           the empty string as denoting "the locale's character encoding",
           thus GNU libiconv would call this function a second time.  */
        codeset = "ASCII";
# endif
      }
  }

#endif

#ifdef DARWIN7
  /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
     (the default codeset) does not work when MB_CUR_MAX is 1.  */
  if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1)
    codeset = "ASCII";
#endif

  return codeset;
}