1 /* Determine a canonical name for the current locale's character encoding.
2 
3    Copyright (C) 2000-2006, 2008-2018 Free Software Foundation, Inc.
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3, or (at your option)
8    any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License along
16    with this program; if not, see <https://www.gnu.org/licenses/>.  */
17 
18 /* Written by Bruno Haible <bruno@clisp.org>.  */
19 
20 #include <config.h>
21 
22 /* Specification.  */
23 #include "localcharset.h"
24 
25 #include <stddef.h>
26 #include <stdio.h>
27 #include <string.h>
28 #include <stdlib.h>
29 
30 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
31 # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
32 #endif
33 
34 #if defined _WIN32 && !defined __CYGWIN__
35 # define WINDOWS_NATIVE
36 # include <locale.h>
37 #endif
38 
39 #if defined __EMX__
40 /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
41 # ifndef OS2
42 #  define OS2
43 # endif
44 #endif
45 
46 #if !defined WINDOWS_NATIVE
47 # if HAVE_LANGINFO_CODESET
48 #  include <langinfo.h>
49 # else
50 #  if 0 /* see comment regarding use of setlocale(), below */
51 #   include <locale.h>
52 #  endif
53 # endif
54 # ifdef __CYGWIN__
55 #  define WIN32_LEAN_AND_MEAN
56 #  include <windows.h>
57 # endif
58 #elif defined WINDOWS_NATIVE
59 # define WIN32_LEAN_AND_MEAN
60 # include <windows.h>
61 #endif
62 #if defined OS2
63 # define INCL_DOS
64 # include <os2.h>
65 #endif
66 
67 /* For MB_CUR_MAX_L */
68 #if defined DARWIN7
69 # include <xlocale.h>
70 #endif
71 
72 
73 #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2
74 
75 /* On these platforms, we use a mapping from non-canonical encoding name
76    to GNU canonical encoding name.  */
77 
78 /* With glibc-2.1 or newer, we don't need any canonicalization,
79    because glibc has iconv and both glibc and libiconv support all
80    GNU canonical names directly.  */
81 # if !((defined __GNU_LIBRARY__ && __GLIBC__ >= 2) || defined __UCLIBC__)
82 
/* One row of alias_table: a platform-specific encoding name paired with
   the GNU canonical name it stands for.  Both members are fixed-size
   arrays, so every name is limited to 11 characters plus the
   terminating NUL.  */
83 struct table_entry
84 {
85   const char alias[11+1];      /* Name as reported by the platform; the search key.  */
86   const char canonical[11+1];  /* GNU canonical encoding name substituted for it.  */
87 };
88 
89 /* Table of platform-dependent mappings, sorted in ascending order.
   Each platform section below maps the encoding names which that
   platform reports (first column) to GNU canonical encoding names
   (second column).

   Within a section the aliases must stay in strcmp() order (in ASCII:
   digits before upper case letters, upper case before lower case),
   because locale_charset() looks entries up with a binary search that
   compares with strcmp().

   Both strings are stored in fixed 11+1 byte arrays (see struct
   table_entry), so no name may be longer than 11 characters; names such
   as "TIS620.2533", "HP-TURKISH8" and "ISO-8859-15" are exactly at that
   limit.

   The last entry of every section has no trailing comma and every
   section defines alias_table_defined, so the sections are expected to
   be mutually exclusive: two of them being active at once would place
   two initializers next to each other without a separating comma.
   Entries that are commented out are kept for reference only and are
   not compiled in.  */
90 static const struct table_entry alias_table[] =
91   {
92 #  if defined __FreeBSD__                                   /* FreeBSD */
93   /*{ "ARMSCII-8",  "ARMSCII-8" },*/
94     { "Big5",       "BIG5" },
95     { "C",          "ASCII" },
96   /*{ "CP1131",     "CP1131" },*/
97   /*{ "CP1251",     "CP1251" },*/
98   /*{ "CP866",      "CP866" },*/
99   /*{ "GB18030",    "GB18030" },*/
100   /*{ "GB2312",     "GB2312" },*/
101   /*{ "GBK",        "GBK" },*/
102   /*{ "ISCII-DEV",  "?" },*/
103     { "ISO8859-1",  "ISO-8859-1" },
104     { "ISO8859-13", "ISO-8859-13" },
105     { "ISO8859-15", "ISO-8859-15" },
106     { "ISO8859-2",  "ISO-8859-2" },
107     { "ISO8859-5",  "ISO-8859-5" },
108     { "ISO8859-7",  "ISO-8859-7" },
109     { "ISO8859-9",  "ISO-8859-9" },
110   /*{ "KOI8-R",     "KOI8-R" },*/
111   /*{ "KOI8-U",     "KOI8-U" },*/
112     { "SJIS",       "SHIFT_JIS" },
113     { "US-ASCII",   "ASCII" },
114     { "eucCN",      "GB2312" },
115     { "eucJP",      "EUC-JP" },
116     { "eucKR",      "EUC-KR" }
117 #   define alias_table_defined
118 #  endif
119 #  if defined __NetBSD__                                    /* NetBSD */
120     { "646",        "ASCII" },
121   /*{ "ARMSCII-8",  "ARMSCII-8" },*/
122   /*{ "BIG5",       "BIG5" },*/
123     { "Big5-HKSCS", "BIG5-HKSCS" },
124   /*{ "CP1251",     "CP1251" },*/
125   /*{ "CP866",      "CP866" },*/
126   /*{ "GB18030",    "GB18030" },*/
127   /*{ "GB2312",     "GB2312" },*/
128     { "ISO8859-1",  "ISO-8859-1" },
129     { "ISO8859-13", "ISO-8859-13" },
130     { "ISO8859-15", "ISO-8859-15" },
131     { "ISO8859-2",  "ISO-8859-2" },
132     { "ISO8859-4",  "ISO-8859-4" },
133     { "ISO8859-5",  "ISO-8859-5" },
134     { "ISO8859-7",  "ISO-8859-7" },
135   /*{ "KOI8-R",     "KOI8-R" },*/
136   /*{ "KOI8-U",     "KOI8-U" },*/
137   /*{ "PT154",      "PT154" },*/
138     { "SJIS",       "SHIFT_JIS" },
139     { "eucCN",      "GB2312" },
140     { "eucJP",      "EUC-JP" },
141     { "eucKR",      "EUC-KR" },
142     { "eucTW",      "EUC-TW" }
143 #   define alias_table_defined
144 #  endif
145 #  if defined __OpenBSD__                                   /* OpenBSD */
146     { "646",        "ASCII" },
147     { "ISO8859-1",  "ISO-8859-1" },
148     { "ISO8859-13", "ISO-8859-13" },
149     { "ISO8859-15", "ISO-8859-15" },
150     { "ISO8859-2",  "ISO-8859-2" },
151     { "ISO8859-4",  "ISO-8859-4" },
152     { "ISO8859-5",  "ISO-8859-5" },
153     { "ISO8859-7",  "ISO-8859-7" }
154 #   define alias_table_defined
155 #  endif
156 #  if defined __APPLE__ && defined __MACH__                 /* Mac OS X */
157     /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is
158        useless:
159        - It returns the empty string when LANG is set to a locale of the
160          form ll_CC, although ll_CC/LC_CTYPE is a symlink to a UTF-8
161          LC_CTYPE file.
162        - The environment variables LANG, LC_CTYPE, LC_ALL are not set by
163          the system; nl_langinfo(CODESET) returns "US-ASCII" in this case.
164        - The documentation says:
165            "... all code that calls BSD system routines should ensure
166             that the const *char parameters of these routines are in UTF-8
167             encoding. All BSD system functions expect their string
168             parameters to be in UTF-8 encoding and nothing else."
169          It also says
170            "An additional caveat is that string parameters for files,
171             paths, and other file-system entities must be in canonical
172             UTF-8. In a canonical UTF-8 Unicode string, all decomposable
173             characters are decomposed ..."
174          but this is not true: You can pass non-decomposed UTF-8 strings
175          to file system functions, and it is the OS which will convert
176          them to decomposed UTF-8 before accessing the file system.
177        - The Apple Terminal application displays UTF-8 by default.
178        - However, other applications are free to use different encodings:
179          - xterm uses ISO-8859-1 by default.
180          - TextEdit uses MacRoman by default.
181        We prefer UTF-8 over decomposed UTF-8-MAC because one should
182        minimize the use of decomposed Unicode. Unfortunately, through the
183        Darwin file system, decomposed UTF-8 strings are leaked into user
184        space nevertheless.
185        Then there are also the locales with encodings other than US-ASCII
186        and UTF-8. These locales can be occasionally useful to users (e.g.
187        when grepping through ISO-8859-1 encoded text files), when all their
188        file names are in US-ASCII.
       Unlike in the other sections, identity mappings such as "CP866" are
       active entries here rather than commented out.
       NOTE(review): presumably this is so that these names count as
       "found in the table" and are not replaced by the platform's
       not-found fallback in locale_charset() -- confirm there.
189      */
190     { "ARMSCII-8",  "ARMSCII-8" },
191     { "Big5",       "BIG5" },
192     { "Big5HKSCS",  "BIG5-HKSCS" },
193     { "CP1131",     "CP1131" },
194     { "CP1251",     "CP1251" },
195     { "CP866",      "CP866" },
196     { "CP949",      "CP949" },
197     { "GB18030",    "GB18030" },
198     { "GB2312",     "GB2312" },
199     { "GBK",        "GBK" },
200   /*{ "ISCII-DEV",  "?" },*/
201     { "ISO8859-1",  "ISO-8859-1" },
202     { "ISO8859-13", "ISO-8859-13" },
203     { "ISO8859-15", "ISO-8859-15" },
204     { "ISO8859-2",  "ISO-8859-2" },
205     { "ISO8859-4",  "ISO-8859-4" },
206     { "ISO8859-5",  "ISO-8859-5" },
207     { "ISO8859-7",  "ISO-8859-7" },
208     { "ISO8859-9",  "ISO-8859-9" },
209     { "KOI8-R",     "KOI8-R" },
210     { "KOI8-U",     "KOI8-U" },
211     { "PT154",      "PT154" },
212     { "SJIS",       "SHIFT_JIS" },
213     { "eucCN",      "GB2312" },
214     { "eucJP",      "EUC-JP" },
215     { "eucKR",      "EUC-KR" }
216 #   define alias_table_defined
217 #  endif
218 #  if defined _AIX                                          /* AIX */
219   /*{ "GBK",        "GBK" },*/
220     { "IBM-1046",   "CP1046" },
221     { "IBM-1124",   "CP1124" },
222     { "IBM-1129",   "CP1129" },
223     { "IBM-1252",   "CP1252" },
224     { "IBM-850",    "CP850" },
225     { "IBM-856",    "CP856" },
226     { "IBM-921",    "ISO-8859-13" },
227     { "IBM-922",    "CP922" },
228     { "IBM-932",    "CP932" },
229     { "IBM-943",    "CP943" },
230     { "IBM-eucCN",  "GB2312" },
231     { "IBM-eucJP",  "EUC-JP" },
232     { "IBM-eucKR",  "EUC-KR" },
233     { "IBM-eucTW",  "EUC-TW" },
234     { "ISO8859-1",  "ISO-8859-1" },
235     { "ISO8859-15", "ISO-8859-15" },
236     { "ISO8859-2",  "ISO-8859-2" },
237     { "ISO8859-5",  "ISO-8859-5" },
238     { "ISO8859-6",  "ISO-8859-6" },
239     { "ISO8859-7",  "ISO-8859-7" },
240     { "ISO8859-8",  "ISO-8859-8" },
241     { "ISO8859-9",  "ISO-8859-9" },
242     { "TIS-620",    "TIS-620" },
243   /*{ "UTF-8",      "UTF-8" },*/
244     { "big5",       "BIG5" }
245 #   define alias_table_defined
246 #  endif
247 #  if defined __hpux                                        /* HP-UX */
248     { "SJIS",      "SHIFT_JIS" },
249     { "arabic8",   "HP-ARABIC8" },
250     { "big5",      "BIG5" },
251     { "cp1251",    "CP1251" },
252     { "eucJP",     "EUC-JP" },
253     { "eucKR",     "EUC-KR" },
254     { "eucTW",     "EUC-TW" },
255     { "gb18030",   "GB18030" },
256     { "greek8",    "HP-GREEK8" },
257     { "hebrew8",   "HP-HEBREW8" },
258     { "hkbig5",    "BIG5-HKSCS" },
259     { "hp15CN",    "GB2312" },
260     { "iso88591",  "ISO-8859-1" },
261     { "iso885913", "ISO-8859-13" },
262     { "iso885915", "ISO-8859-15" },
263     { "iso88592",  "ISO-8859-2" },
264     { "iso88594",  "ISO-8859-4" },
265     { "iso88595",  "ISO-8859-5" },
266     { "iso88596",  "ISO-8859-6" },
267     { "iso88597",  "ISO-8859-7" },
268     { "iso88598",  "ISO-8859-8" },
269     { "iso88599",  "ISO-8859-9" },
270     { "kana8",     "HP-KANA8" },
271     { "koi8r",     "KOI8-R" },
272     { "roman8",    "HP-ROMAN8" },
273     { "tis620",    "TIS-620" },
274     { "turkish8",  "HP-TURKISH8" },
275     { "utf8",      "UTF-8" }
276 #   define alias_table_defined
277 #  endif
278 #  if defined __sgi                                         /* IRIX */
279     { "ISO8859-1",  "ISO-8859-1" },
280     { "ISO8859-15", "ISO-8859-15" },
281     { "ISO8859-2",  "ISO-8859-2" },
282     { "ISO8859-5",  "ISO-8859-5" },
283     { "ISO8859-7",  "ISO-8859-7" },
284     { "ISO8859-9",  "ISO-8859-9" },
285     { "eucCN",      "GB2312" },
286     { "eucJP",      "EUC-JP" },
287     { "eucKR",      "EUC-KR" },
288     { "eucTW",      "EUC-TW" }
289 #   define alias_table_defined
290 #  endif
291 #  if defined __osf__                                       /* OSF/1 */
292   /*{ "GBK",        "GBK" },*/
293     { "ISO8859-1",  "ISO-8859-1" },
294     { "ISO8859-15", "ISO-8859-15" },
295     { "ISO8859-2",  "ISO-8859-2" },
296     { "ISO8859-4",  "ISO-8859-4" },
297     { "ISO8859-5",  "ISO-8859-5" },
298     { "ISO8859-7",  "ISO-8859-7" },
299     { "ISO8859-8",  "ISO-8859-8" },
300     { "ISO8859-9",  "ISO-8859-9" },
301     { "KSC5601",    "CP949" },
302     { "SJIS",       "SHIFT_JIS" },
303     { "TACTIS",     "TIS-620" },
304   /*{ "UTF-8",      "UTF-8" },*/
305     { "big5",       "BIG5" },
306     { "cp850",      "CP850" },
307     { "dechanyu",   "DEC-HANYU" },
308     { "dechanzi",   "GB2312" },
309     { "deckanji",   "DEC-KANJI" },
310     { "deckorean",  "EUC-KR" },
311     { "eucJP",      "EUC-JP" },
312     { "eucKR",      "EUC-KR" },
313     { "eucTW",      "EUC-TW" },
314     { "sdeckanji",  "EUC-JP" }
315 #   define alias_table_defined
316 #  endif
317 #  if defined __sun                                         /* Solaris */
318     { "5601",        "EUC-KR" },
319     { "646",         "ASCII" },
320   /*{ "BIG5",        "BIG5" },*/
321     { "Big5-HKSCS",  "BIG5-HKSCS" },
322     { "GB18030",     "GB18030" },
323   /*{ "GBK",         "GBK" },*/
324     { "ISO8859-1",   "ISO-8859-1" },
325     { "ISO8859-11",  "TIS-620" },
326     { "ISO8859-13",  "ISO-8859-13" },
327     { "ISO8859-15",  "ISO-8859-15" },
328     { "ISO8859-2",   "ISO-8859-2" },
329     { "ISO8859-3",   "ISO-8859-3" },
330     { "ISO8859-4",   "ISO-8859-4" },
331     { "ISO8859-5",   "ISO-8859-5" },
332     { "ISO8859-6",   "ISO-8859-6" },
333     { "ISO8859-7",   "ISO-8859-7" },
334     { "ISO8859-8",   "ISO-8859-8" },
335     { "ISO8859-9",   "ISO-8859-9" },
336     { "PCK",         "SHIFT_JIS" },
337     { "TIS620.2533", "TIS-620" },
338   /*{ "UTF-8",       "UTF-8" },*/
339     { "ansi-1251",   "CP1251" },
340     { "cns11643",    "EUC-TW" },
341     { "eucJP",       "EUC-JP" },
342     { "gb2312",      "GB2312" },
343     { "koi8-r",      "KOI8-R" }
344 #   define alias_table_defined
345 #  endif
346 #  if defined __minix                                       /* Minix */
347     { "646", "ASCII" }
348 #   define alias_table_defined
349 #  endif
350 #  if defined WINDOWS_NATIVE || defined __CYGWIN__          /* Windows */
351     { "CP1361",  "JOHAB" },
352     { "CP20127", "ASCII" },
353     { "CP20866", "KOI8-R" },
354     { "CP20936", "GB2312" },
355     { "CP21866", "KOI8-RU" },
356     { "CP28591", "ISO-8859-1" },
357     { "CP28592", "ISO-8859-2" },
358     { "CP28593", "ISO-8859-3" },
359     { "CP28594", "ISO-8859-4" },
360     { "CP28595", "ISO-8859-5" },
361     { "CP28596", "ISO-8859-6" },
362     { "CP28597", "ISO-8859-7" },
363     { "CP28598", "ISO-8859-8" },
364     { "CP28599", "ISO-8859-9" },
365     { "CP28605", "ISO-8859-15" },
366     { "CP38598", "ISO-8859-8" },
367     { "CP51932", "EUC-JP" },
368     { "CP51936", "GB2312" },
369     { "CP51949", "EUC-KR" },
370     { "CP51950", "EUC-TW" },
371     { "CP54936", "GB18030" },
372     { "CP65001", "UTF-8" },
373     { "CP936",   "GBK" }
374 #   define alias_table_defined
375 #  endif
376 #  if defined OS2                                           /* OS/2 */
377     /* The list of encodings is taken from "List of OS/2 Codepages"
378        by Alex Taylor:
379        <http://altsan.org/os2/toolkits/uls/index.html#codepages>.
380        See also "IBM Globalization - Code page identifiers":
381        <https://www-01.ibm.com/software/globalization/cp/cp_cpgid.html>.  */
382     { "CP1089", "ISO-8859-6" },
383     { "CP1208", "UTF-8" },
384     { "CP1381", "GB2312" },
385     { "CP1386", "GBK" },
386     { "CP3372", "EUC-JP" },
387     { "CP813",  "ISO-8859-7" },
388     { "CP819",  "ISO-8859-1" },
389     { "CP878",  "KOI8-R" },
390     { "CP912",  "ISO-8859-2" },
391     { "CP913",  "ISO-8859-3" },
392     { "CP914",  "ISO-8859-4" },
393     { "CP915",  "ISO-8859-5" },
394     { "CP916",  "ISO-8859-8" },
395     { "CP920",  "ISO-8859-9" },
396     { "CP921",  "ISO-8859-13" },
397     { "CP923",  "ISO-8859-15" },
398     { "CP954",  "EUC-JP" },
399     { "CP964",  "EUC-TW" },
400     { "CP970",  "EUC-KR" }
401 #   define alias_table_defined
402 #  endif
403 #  if defined VMS                                           /* OpenVMS */
404     /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
405        "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
406        section 10.7 "Handling Different Character Sets".  */
407     { "DECHANYU",  "DEC-HANYU" },
408     { "DECHANZI",  "GB2312" },
409     { "DECKANJI",  "DEC-KANJI" },
410     { "DECKOREAN", "EUC-KR" },
411     { "ISO8859-1", "ISO-8859-1" },
412     { "ISO8859-2", "ISO-8859-2" },
413     { "ISO8859-5", "ISO-8859-5" },
414     { "ISO8859-7", "ISO-8859-7" },
415     { "ISO8859-8", "ISO-8859-8" },
416     { "ISO8859-9", "ISO-8859-9" },
417     { "SDECKANJI", "EUC-JP" },
418     { "SJIS",      "SHIFT_JIS" },
419     { "eucJP",     "EUC-JP" },
420     { "eucTW",     "EUC-TW" }
421 #   define alias_table_defined
422 #  endif
423 #  ifndef alias_table_defined
424     /* Just a dummy entry, to avoid a C syntax error.  */
425     { "", "" }
426 #  endif
427   };
428 
429 # endif
430 
431 #else
432 
433 /* On these platforms, we use a mapping from locale name to GNU canonical
434    encoding name.  */
435 
/* One row of locale_table: a locale name paired with the GNU canonical
   name of the encoding used by that locale.  Both members are fixed-size
   arrays: locale names are limited to 17 characters and canonical names
   to 11 characters, each plus the terminating NUL.  */
436 struct table_entry
437 {
438   const char locale[17+1];     /* Locale name, e.g. "de_DE.ISO_8859-1"; the search key.  */
439   const char canonical[11+1];  /* GNU canonical encoding name for that locale.  */
440 };
441 
442 /* Table of platform-dependent mappings, sorted in ascending order.
   This variant is compiled only when none of HAVE_LANGINFO_CODESET,
   WINDOWS_NATIVE and OS2 applies.  It maps a locale name (first column)
   to the GNU canonical name of that locale's encoding (second column).

   Within a section the locale names are kept in ascending byte order
   (in ASCII: upper case letters before lower case, so "zh_TW.BIG5"
   precedes "zh_TW.Big5").
   NOTE(review): the lookup code is not part of this excerpt; presumably
   it is a strcmp()-based binary search that depends on this order --
   confirm in locale_charset().

   The strings are stored in fixed-size arrays (see struct table_entry):
   locale names may be at most 17 characters long ("da_DK.DIS_8859-15"
   is exactly at that limit) and canonical names at most 11.

   The last entry of each section has no trailing comma and each section
   defines locale_table_defined, so the sections are expected to be
   mutually exclusive.  */
443 static const struct table_entry locale_table[] =
444   {
445 # if defined __FreeBSD__                                    /* FreeBSD 4.2 */
446     { "cs_CZ.ISO_8859-2",  "ISO-8859-2" },
447     { "da_DK.DIS_8859-15", "ISO-8859-15" },
448     { "da_DK.ISO_8859-1",  "ISO-8859-1" },
449     { "de_AT.DIS_8859-15", "ISO-8859-15" },
450     { "de_AT.ISO_8859-1",  "ISO-8859-1" },
451     { "de_CH.DIS_8859-15", "ISO-8859-15" },
452     { "de_CH.ISO_8859-1",  "ISO-8859-1" },
453     { "de_DE.DIS_8859-15", "ISO-8859-15" },
454     { "de_DE.ISO_8859-1",  "ISO-8859-1" },
455     { "en_AU.DIS_8859-15", "ISO-8859-15" },
456     { "en_AU.ISO_8859-1",  "ISO-8859-1" },
457     { "en_CA.DIS_8859-15", "ISO-8859-15" },
458     { "en_CA.ISO_8859-1",  "ISO-8859-1" },
459     { "en_GB.DIS_8859-15", "ISO-8859-15" },
460     { "en_GB.ISO_8859-1",  "ISO-8859-1" },
461     { "en_US.DIS_8859-15", "ISO-8859-15" },
462     { "en_US.ISO_8859-1",  "ISO-8859-1" },
463     { "es_ES.DIS_8859-15", "ISO-8859-15" },
464     { "es_ES.ISO_8859-1",  "ISO-8859-1" },
465     { "fi_FI.DIS_8859-15", "ISO-8859-15" },
466     { "fi_FI.ISO_8859-1",  "ISO-8859-1" },
467     { "fr_BE.DIS_8859-15", "ISO-8859-15" },
468     { "fr_BE.ISO_8859-1",  "ISO-8859-1" },
469     { "fr_CA.DIS_8859-15", "ISO-8859-15" },
470     { "fr_CA.ISO_8859-1",  "ISO-8859-1" },
471     { "fr_CH.DIS_8859-15", "ISO-8859-15" },
472     { "fr_CH.ISO_8859-1",  "ISO-8859-1" },
473     { "fr_FR.DIS_8859-15", "ISO-8859-15" },
474     { "fr_FR.ISO_8859-1",  "ISO-8859-1" },
475     { "hr_HR.ISO_8859-2",  "ISO-8859-2" },
476     { "hu_HU.ISO_8859-2",  "ISO-8859-2" },
477     { "is_IS.DIS_8859-15", "ISO-8859-15" },
478     { "is_IS.ISO_8859-1",  "ISO-8859-1" },
479     { "it_CH.DIS_8859-15", "ISO-8859-15" },
480     { "it_CH.ISO_8859-1",  "ISO-8859-1" },
481     { "it_IT.DIS_8859-15", "ISO-8859-15" },
482     { "it_IT.ISO_8859-1",  "ISO-8859-1" },
483     { "ja_JP.EUC",         "EUC-JP" },
484     { "ja_JP.SJIS",        "SHIFT_JIS" },
485     { "ja_JP.Shift_JIS",   "SHIFT_JIS" },
486     { "ko_KR.EUC",         "EUC-KR" },
487     { "la_LN.ASCII",       "ASCII" },
488     { "la_LN.DIS_8859-15", "ISO-8859-15" },
489     { "la_LN.ISO_8859-1",  "ISO-8859-1" },
490     { "la_LN.ISO_8859-2",  "ISO-8859-2" },
491     { "la_LN.ISO_8859-4",  "ISO-8859-4" },
492     { "lt_LN.ASCII",       "ASCII" },
493     { "lt_LN.DIS_8859-15", "ISO-8859-15" },
494     { "lt_LN.ISO_8859-1",  "ISO-8859-1" },
495     { "lt_LN.ISO_8859-2",  "ISO-8859-2" },
496     { "lt_LT.ISO_8859-4",  "ISO-8859-4" },
497     { "nl_BE.DIS_8859-15", "ISO-8859-15" },
498     { "nl_BE.ISO_8859-1",  "ISO-8859-1" },
499     { "nl_NL.DIS_8859-15", "ISO-8859-15" },
500     { "nl_NL.ISO_8859-1",  "ISO-8859-1" },
501     { "no_NO.DIS_8859-15", "ISO-8859-15" },
502     { "no_NO.ISO_8859-1",  "ISO-8859-1" },
503     { "pl_PL.ISO_8859-2",  "ISO-8859-2" },
504     { "pt_PT.DIS_8859-15", "ISO-8859-15" },
505     { "pt_PT.ISO_8859-1",  "ISO-8859-1" },
506     { "ru_RU.CP866",       "CP866" },
507     { "ru_RU.ISO_8859-5",  "ISO-8859-5" },
508     { "ru_RU.KOI8-R",      "KOI8-R" },
509     { "ru_SU.CP866",       "CP866" },
510     { "ru_SU.ISO_8859-5",  "ISO-8859-5" },
511     { "ru_SU.KOI8-R",      "KOI8-R" },
512     { "sl_SI.ISO_8859-2",  "ISO-8859-2" },
513     { "sv_SE.DIS_8859-15", "ISO-8859-15" },
514     { "sv_SE.ISO_8859-1",  "ISO-8859-1" },
515     { "uk_UA.KOI8-U",      "KOI8-U" },
516     { "zh_CN.EUC",         "GB2312" },
517     { "zh_TW.BIG5",        "BIG5" },
518     { "zh_TW.Big5",        "BIG5" }
519 #  define locale_table_defined
520 # endif
521 # if defined __DJGPP__                                      /* DOS / DJGPP 2.03 */
522     /* The encodings given here may not all be correct.
523        If you find that the encoding given for your language and
524        country is not the one your DOS machine actually uses, just
525        correct it in this file, and send a mail to
526        Juan Manuel Guerrero <juan.guerrero@gmx.de>
527        and <bug-gnulib@gnu.org>.  */
528     { "C",     "ASCII" },
529     { "ar",    "CP864" },
530     { "ar_AE", "CP864" },
531     { "ar_DZ", "CP864" },
532     { "ar_EG", "CP864" },
533     { "ar_IQ", "CP864" },
534     { "ar_IR", "CP864" },
535     { "ar_JO", "CP864" },
536     { "ar_KW", "CP864" },
537     { "ar_MA", "CP864" },
538     { "ar_OM", "CP864" },
539     { "ar_QA", "CP864" },
540     { "ar_SA", "CP864" },
541     { "ar_SY", "CP864" },
542     { "be",    "CP866" },
543     { "be_BE", "CP866" },
544     { "bg",    "CP866" }, /* not CP855 ?? */
545     { "bg_BG", "CP866" }, /* not CP855 ?? */
546     { "ca",    "CP850" },
547     { "ca_ES", "CP850" },
548     { "cs",    "CP852" },
549     { "cs_CZ", "CP852" },
550     { "da",    "CP865" }, /* not CP850 ?? */
551     { "da_DK", "CP865" }, /* not CP850 ?? */
552     { "de",    "CP850" },
553     { "de_AT", "CP850" },
554     { "de_CH", "CP850" },
555     { "de_DE", "CP850" },
556     { "el",    "CP869" },
557     { "el_GR", "CP869" },
558     { "en",    "CP850" },
559     { "en_AU", "CP850" }, /* not CP437 ?? */
560     { "en_CA", "CP850" },
561     { "en_GB", "CP850" },
562     { "en_NZ", "CP437" },
563     { "en_US", "CP437" },
564     { "en_ZA", "CP850" }, /* not CP437 ?? */
565     { "eo",    "CP850" },
566     { "eo_EO", "CP850" },
567     { "es",    "CP850" },
568     { "es_AR", "CP850" },
569     { "es_BO", "CP850" },
570     { "es_CL", "CP850" },
571     { "es_CO", "CP850" },
572     { "es_CR", "CP850" },
573     { "es_CU", "CP850" },
574     { "es_DO", "CP850" },
575     { "es_EC", "CP850" },
576     { "es_ES", "CP850" },
577     { "es_GT", "CP850" },
578     { "es_HN", "CP850" },
579     { "es_MX", "CP850" },
580     { "es_NI", "CP850" },
581     { "es_PA", "CP850" },
582     { "es_PE", "CP850" },
583     { "es_PY", "CP850" },
584     { "es_SV", "CP850" },
585     { "es_UY", "CP850" },
586     { "es_VE", "CP850" },
587     { "et",    "CP850" },
588     { "et_EE", "CP850" },
589     { "eu",    "CP850" },
590     { "eu_ES", "CP850" },
591     { "fi",    "CP850" },
592     { "fi_FI", "CP850" },
593     { "fr",    "CP850" },
594     { "fr_BE", "CP850" },
595     { "fr_CA", "CP850" },
596     { "fr_CH", "CP850" },
597     { "fr_FR", "CP850" },
598     { "ga",    "CP850" },
599     { "ga_IE", "CP850" },
600     { "gd",    "CP850" },
601     { "gd_GB", "CP850" },
602     { "gl",    "CP850" },
603     { "gl_ES", "CP850" },
604     { "he",    "CP862" },
605     { "he_IL", "CP862" },
606     { "hr",    "CP852" },
607     { "hr_HR", "CP852" },
608     { "hu",    "CP852" },
609     { "hu_HU", "CP852" },
610     { "id",    "CP850" }, /* not CP437 ?? */
611     { "id_ID", "CP850" }, /* not CP437 ?? */
612     { "is",    "CP861" }, /* not CP850 ?? */
613     { "is_IS", "CP861" }, /* not CP850 ?? */
614     { "it",    "CP850" },
615     { "it_CH", "CP850" },
616     { "it_IT", "CP850" },
617     { "ja",    "CP932" },
618     { "ja_JP", "CP932" },
619     { "kr",    "CP949" }, /* not CP934 ?? */
620     { "kr_KR", "CP949" }, /* not CP934 ?? */
621     { "lt",    "CP775" },
622     { "lt_LT", "CP775" },
623     { "lv",    "CP775" },
624     { "lv_LV", "CP775" },
625     { "mk",    "CP866" }, /* not CP855 ?? */
626     { "mk_MK", "CP866" }, /* not CP855 ?? */
627     { "mt",    "CP850" },
628     { "mt_MT", "CP850" },
629     { "nb",    "CP865" }, /* not CP850 ?? */
630     { "nb_NO", "CP865" }, /* not CP850 ?? */
631     { "nl",    "CP850" },
632     { "nl_BE", "CP850" },
633     { "nl_NL", "CP850" },
634     { "nn",    "CP865" }, /* not CP850 ?? */
635     { "nn_NO", "CP865" }, /* not CP850 ?? */
636     { "no",    "CP865" }, /* not CP850 ?? */
637     { "no_NO", "CP865" }, /* not CP850 ?? */
638     { "pl",    "CP852" },
639     { "pl_PL", "CP852" },
640     { "pt",    "CP850" },
641     { "pt_BR", "CP850" },
642     { "pt_PT", "CP850" },
643     { "ro",    "CP852" },
644     { "ro_RO", "CP852" },
645     { "ru",    "CP866" },
646     { "ru_RU", "CP866" },
647     { "sk",    "CP852" },
648     { "sk_SK", "CP852" },
649     { "sl",    "CP852" },
650     { "sl_SI", "CP852" },
651     { "sq",    "CP852" },
652     { "sq_AL", "CP852" },
653     { "sr",    "CP852" }, /* CP852 or CP866 or CP855 ?? */
654     { "sr_CS", "CP852" }, /* CP852 or CP866 or CP855 ?? */
655     { "sr_YU", "CP852" }, /* CP852 or CP866 or CP855 ?? */
656     { "sv",    "CP850" },
657     { "sv_SE", "CP850" },
658     { "th",    "CP874" },
659     { "th_TH", "CP874" },
660     { "tr",    "CP857" },
661     { "tr_TR", "CP857" },
662     { "uk",    "CP1125" },
663     { "uk_UA", "CP1125" },
664     { "zh_CN", "GBK" },
665     { "zh_TW", "CP950" } /* not CP938 ?? */
666 #  define locale_table_defined
667 # endif
668 # ifndef locale_table_defined
669     /* Just a dummy entry, to avoid a C syntax error.  */
670     { "", "" }
671 # endif
672   };
673 
674 #endif
675 
676 
677 /* Determine the current locale's character encoding, and canonicalize it
678    into one of the canonical names listed in localcharset.h.
679    The result must not be freed; it is statically allocated.
680    If the canonical name cannot be determined, the result is a non-canonical
681    name.  */
682 
683 #ifdef STATIC
684 STATIC
685 #endif
686 const char *
locale_charset(void)687 locale_charset (void)
688 {
689   const char *codeset;
690 
691 #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2
692 
693 # if HAVE_LANGINFO_CODESET
694 
695   /* Most systems support nl_langinfo (CODESET) nowadays.  */
696   codeset = nl_langinfo (CODESET);
697 
698 #  ifdef __CYGWIN__
699   /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
700      returns "US-ASCII".  Return the suffix of the locale name from the
701      environment variables (if present) or the codepage as a number.  */
702   if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
703     {
704       const char *locale;
705       static char buf[2 + 10 + 1];
706 
707       locale = getenv ("LC_ALL");
708       if (locale == NULL || locale[0] == '\0')
709         {
710           locale = getenv ("LC_CTYPE");
711           if (locale == NULL || locale[0] == '\0')
712             locale = getenv ("LANG");
713         }
714       if (locale != NULL && locale[0] != '\0')
715         {
716           /* If the locale name contains an encoding after the dot, return
717              it.  */
718           const char *dot = strchr (locale, '.');
719 
720           if (dot != NULL)
721             {
722               const char *modifier;
723 
724               dot++;
725               /* Look for the possible @... trailer and remove it, if any.  */
726               modifier = strchr (dot, '@');
727               if (modifier == NULL)
728                 return dot;
729               if (modifier - dot < sizeof (buf))
730                 {
731                   memcpy (buf, dot, modifier - dot);
732                   buf [modifier - dot] = '\0';
733                   return buf;
734                 }
735             }
736         }
737 
738       /* The Windows API has a function returning the locale's codepage as a
739          number: GetACP().  This encoding is used by Cygwin, unless the user
740          has set the environment variable CYGWIN=codepage:oem (which very few
741          people do).
742          Output directed to console windows needs to be converted (to
743          GetOEMCP() if the console is using a raster font, or to
744          GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
745          this conversion transparently (see winsup/cygwin/fhandler_console.cc),
746          converting to GetConsoleOutputCP().  This leads to correct results,
747          except when SetConsoleOutputCP has been called and a raster font is
748          in use.  */
749       sprintf (buf, "CP%u", GetACP ());
750       codeset = buf;
751     }
752 #  endif
753 
754   if (codeset == NULL)
755     /* The canonical name cannot be determined.  */
756     codeset = "";
757 
758 # elif defined WINDOWS_NATIVE
759 
760   static char buf[2 + 10 + 1];
761 
762   /* The Windows API has a function returning the locale's codepage as
763      a number, but the value doesn't change according to what the
764      'setlocale' call specified.  So we use it as a last resort, in
765      case the string returned by 'setlocale' doesn't specify the
766      codepage.  */
767   char *current_locale = setlocale (LC_ALL, NULL);
768   char *pdot;
769 
770   /* If they set different locales for different categories,
771      'setlocale' will return a semi-colon separated list of locale
772      values.  To make sure we use the correct one, we choose LC_CTYPE.  */
773   if (strchr (current_locale, ';'))
774     current_locale = setlocale (LC_CTYPE, NULL);
775 
776   pdot = strrchr (current_locale, '.');
777   if (pdot && 2 + strlen (pdot + 1) + 1 <= sizeof (buf))
778     sprintf (buf, "CP%s", pdot + 1);
779   else
780     {
781       /* The Windows API has a function returning the locale's codepage as a
782         number: GetACP().
783         When the output goes to a console window, it needs to be provided in
784         GetOEMCP() encoding if the console is using a raster font, or in
785         GetConsoleOutputCP() encoding if it is using a TrueType font.
786         But in GUI programs and for output sent to files and pipes, GetACP()
787         encoding is the best bet.  */
788       sprintf (buf, "CP%u", GetACP ());
789     }
790   codeset = buf;
791 
792 # elif defined OS2
793 
794   const char *locale;
795   static char buf[2 + 10 + 1];
796   ULONG cp[3];
797   ULONG cplen;
798 
799   codeset = NULL;
800 
801   /* Allow user to override the codeset, as set in the operating system,
802      with standard language environment variables.  */
803   locale = getenv ("LC_ALL");
804   if (locale == NULL || locale[0] == '\0')
805     {
806       locale = getenv ("LC_CTYPE");
807       if (locale == NULL || locale[0] == '\0')
808         locale = getenv ("LANG");
809     }
810   if (locale != NULL && locale[0] != '\0')
811     {
812       /* If the locale name contains an encoding after the dot, return it.  */
813       const char *dot = strchr (locale, '.');
814 
815       if (dot != NULL)
816         {
817           const char *modifier;
818 
819           dot++;
820           /* Look for the possible @... trailer and remove it, if any.  */
821           modifier = strchr (dot, '@');
822           if (modifier == NULL)
823             return dot;
824           if (modifier - dot < sizeof (buf))
825             {
826               memcpy (buf, dot, modifier - dot);
827               buf [modifier - dot] = '\0';
828               return buf;
829             }
830         }
831 
832       /* For the POSIX locale, don't use the system's codepage.  */
833       if (strcmp (locale, "C") == 0 || strcmp (locale, "POSIX") == 0)
834         codeset = "";
835     }
836 
837   if (codeset == NULL)
838     {
839       /* OS/2 has a function returning the locale's codepage as a number.  */
840       if (DosQueryCp (sizeof (cp), cp, &cplen))
841         codeset = "";
842       else
843         {
844           sprintf (buf, "CP%u", cp[0]);
845           codeset = buf;
846         }
847     }
848 
849 # else
850 
851 #  error "Add code for other platforms here."
852 
853 # endif
854 
855   /* Resolve alias.  */
856   {
857 # ifdef alias_table_defined
858     /* On some platforms, UTF-8 locales are the most frequently used ones.
859        Speed up the common case and slow down the less common cases by
860        testing for this case first.  */
861 #  if defined __OpenBSD__ || (defined __APPLE__ && defined __MACH__) || defined __sun || defined __CYGWIN__
862     if (strcmp (codeset, "UTF-8") == 0)
863       goto done_table_lookup;
864     else
865 #  endif
866       {
867         const struct table_entry * const table = alias_table;
868         size_t const table_size =
869           sizeof (alias_table) / sizeof (struct table_entry);
870         /* The table is sorted.  Perform a binary search.  */
871         size_t hi = table_size;
872         size_t lo = 0;
873         while (lo < hi)
874           {
875             /* Invariant:
876                for i < lo, strcmp (table[i].alias, codeset) < 0,
877                for i >= hi, strcmp (table[i].alias, codeset) > 0.  */
878             size_t mid = (hi + lo) >> 1; /* >= lo, < hi */
879             int cmp = strcmp (table[mid].alias, codeset);
880             if (cmp < 0)
881               lo = mid + 1;
882             else if (cmp > 0)
883               hi = mid;
884             else
885               {
886                 /* Found an i with
887                      strcmp (table[i].alias, codeset) == 0.  */
888                 codeset = table[mid].canonical;
889                 goto done_table_lookup;
890               }
891           }
892       }
893     if (0)
894       done_table_lookup: ;
895     else
896 # endif
897       {
898         /* Did not find it in the table.  */
899         /* On Mac OS X, all modern locales use the UTF-8 encoding.
900            BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
901 # if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
902         codeset = "UTF-8";
903 # else
904         /* Don't return an empty string.  GNU libc and GNU libiconv interpret
905            the empty string as denoting "the locale's character encoding",
906            thus GNU libiconv would call this function a second time.  */
907         if (codeset[0] == '\0')
908           codeset = "ASCII";
909 # endif
910       }
911   }
912 
913 #else
914 
915   /* On old systems which lack it, use setlocale or getenv.  */
916   const char *locale = NULL;
917 
918   /* But most old systems don't have a complete set of locales.  Some
919      (like DJGPP) have only the C locale.  Therefore we don't use setlocale
920      here; it would return "C" when it doesn't support the locale name the
921      user has set.  */
922 # if 0
923   locale = setlocale (LC_CTYPE, NULL);
924 # endif
925   if (locale == NULL || locale[0] == '\0')
926     {
927       locale = getenv ("LC_ALL");
928       if (locale == NULL || locale[0] == '\0')
929         {
930           locale = getenv ("LC_CTYPE");
931           if (locale == NULL || locale[0] == '\0')
932             locale = getenv ("LANG");
933             if (locale == NULL)
934               locale = "";
935         }
936     }
937 
938   /* Map locale name to canonical encoding name.  */
939   {
940 # ifdef locale_table_defined
941     const struct table_entry * const table = locale_table;
942     size_t const table_size =
943       sizeof (locale_table) / sizeof (struct table_entry);
944     /* The table is sorted.  Perform a binary search.  */
945     size_t hi = table_size;
946     size_t lo = 0;
947     while (lo < hi)
948       {
949         /* Invariant:
950            for i < lo, strcmp (table[i].locale, locale) < 0,
951            for i >= hi, strcmp (table[i].locale, locale) > 0.  */
952         size_t mid = (hi + lo) >> 1; /* >= lo, < hi */
953         int cmp = strcmp (table[mid].locale, locale);
954         if (cmp < 0)
955           lo = mid + 1;
956         else if (cmp > 0)
957           hi = mid;
958         else
959           {
960             /* Found an i with
961                  strcmp (table[i].locale, locale) == 0.  */
962             codeset = table[mid].canonical;
963             goto done_table_lookup;
964           }
965       }
966     if (0)
967       done_table_lookup: ;
968     else
969 # endif
970       {
971         /* Did not find it in the table.  */
972         /* On Mac OS X, all modern locales use the UTF-8 encoding.
973            BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
974 # if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
975         codeset = "UTF-8";
976 # else
977         /* The canonical name cannot be determined.  */
978         /* Don't return an empty string.  GNU libc and GNU libiconv interpret
979            the empty string as denoting "the locale's character encoding",
980            thus GNU libiconv would call this function a second time.  */
981         codeset = "ASCII";
982 # endif
983       }
984   }
985 
986 #endif
987 
988 #ifdef DARWIN7
989   /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
990      (the default codeset) does not work when MB_CUR_MAX is 1.  */
991   if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1)
992     codeset = "ASCII";
993 #endif
994 
995   return codeset;
996 }
997