1 /* Determine a canonical name for the current locale's character encoding.
2 
3    Copyright (C) 2000-2006, 2008-2021 Free Software Foundation, Inc.
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU Lesser General Public License as published by
7    the Free Software Foundation; either version 2, or (at your option)
8    any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU Lesser General Public License for more details.
14 
15    You should have received a copy of the GNU Lesser General Public License along
16    with this program; if not, see <https://www.gnu.org/licenses/>.  */
17 
18 /* Written by Bruno Haible <bruno@clisp.org>.  */
19 
20 #include <config.h>
21 
22 /* Specification.  */
23 #include "localcharset.h"
24 
25 #include <stddef.h>
26 #include <stdio.h>
27 #include <string.h>
28 #include <stdlib.h>
29 
30 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
31 # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
32 #endif
33 
34 #if defined _WIN32 && !defined __CYGWIN__
35 # define WINDOWS_NATIVE
36 # include <locale.h>
37 #endif
38 
39 #if defined __EMX__
40 /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
41 # ifndef OS2
42 #  define OS2
43 # endif
44 #endif
45 
46 #if !defined WINDOWS_NATIVE
47 # if HAVE_LANGINFO_CODESET
48 #  include <langinfo.h>
49 # else
50 #  if 0 /* see comment regarding use of setlocale(), below */
51 #   include <locale.h>
52 #  endif
53 # endif
54 # ifdef __CYGWIN__
55 #  define WIN32_LEAN_AND_MEAN
56 #  include <windows.h>
57 # endif
58 #elif defined WINDOWS_NATIVE
59 # define WIN32_LEAN_AND_MEAN
60 # include <windows.h>
61   /* For the use of setlocale() below, the Gnulib override in setlocale.c is
62      not needed; see the platform lists in setlocale_null.m4.  */
63 # undef setlocale
64 #endif
65 #if defined OS2
66 # define INCL_DOS
67 # include <os2.h>
68 #endif
69 
70 /* For MB_CUR_MAX_L */
71 #if defined DARWIN7
72 # include <xlocale.h>
73 #endif
74 
75 
76 #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2
77 
78 /* On these platforms, we use a mapping from non-canonical encoding name
79    to GNU canonical encoding name.  */
80 
81 /* With glibc-2.1 or newer, we don't need any canonicalization,
82    because glibc has iconv and both glibc and libiconv support all
83    GNU canonical names directly.  */
84 # if !((defined __GNU_LIBRARY__ && __GLIBC__ >= 2) || defined __UCLIBC__)
85 
86 struct table_entry  /* One row of alias_table: non-canonical encoding name -> GNU canonical name.  */
87 {
88   const char alias[11+1];      /* Non-canonical (platform-specific) encoding name; at most 11 chars + NUL.  */
89   const char canonical[11+1];  /* GNU canonical encoding name; at most 11 chars + NUL.  */
90 };  /* The #else branch further down defines a different struct with the same tag.  */
91 
92 /* Table of platform-dependent mappings (non-canonical alias -> GNU canonical name), sorted in ascending byte order of the alias.  */
93 static const struct table_entry alias_table[] =
94   {  /* The platform sections below carry no separating commas, so the initializer is only well-formed when at most one of them is active.  */
95 #  if defined __FreeBSD__                                   /* FreeBSD */
96   /*{ "ARMSCII-8",  "ARMSCII-8" },*/
97     { "Big5",       "BIG5" },
98     { "C",          "ASCII" },
99   /*{ "CP1131",     "CP1131" },*/
100   /*{ "CP1251",     "CP1251" },*/
101   /*{ "CP866",      "CP866" },*/
102   /*{ "GB18030",    "GB18030" },*/
103   /*{ "GB2312",     "GB2312" },*/
104   /*{ "GBK",        "GBK" },*/
105   /*{ "ISCII-DEV",  "?" },*/
106     { "ISO8859-1",  "ISO-8859-1" },
107     { "ISO8859-13", "ISO-8859-13" },
108     { "ISO8859-15", "ISO-8859-15" },
109     { "ISO8859-2",  "ISO-8859-2" },
110     { "ISO8859-5",  "ISO-8859-5" },
111     { "ISO8859-7",  "ISO-8859-7" },
112     { "ISO8859-9",  "ISO-8859-9" },
113   /*{ "KOI8-R",     "KOI8-R" },*/
114   /*{ "KOI8-U",     "KOI8-U" },*/
115     { "SJIS",       "SHIFT_JIS" },
116     { "US-ASCII",   "ASCII" },
117     { "eucCN",      "GB2312" },
118     { "eucJP",      "EUC-JP" },
119     { "eucKR",      "EUC-KR" }
120 #   define alias_table_defined  /* Records that a section supplied entries, so the dummy entry at the end is skipped.  */
121 #  endif
122 #  if defined __NetBSD__                                    /* NetBSD */
123     { "646",        "ASCII" },
124   /*{ "ARMSCII-8",  "ARMSCII-8" },*/
125   /*{ "BIG5",       "BIG5" },*/
126     { "Big5-HKSCS", "BIG5-HKSCS" },
127   /*{ "CP1251",     "CP1251" },*/
128   /*{ "CP866",      "CP866" },*/
129   /*{ "GB18030",    "GB18030" },*/
130   /*{ "GB2312",     "GB2312" },*/
131     { "ISO8859-1",  "ISO-8859-1" },
132     { "ISO8859-13", "ISO-8859-13" },
133     { "ISO8859-15", "ISO-8859-15" },
134     { "ISO8859-2",  "ISO-8859-2" },
135     { "ISO8859-4",  "ISO-8859-4" },
136     { "ISO8859-5",  "ISO-8859-5" },
137     { "ISO8859-7",  "ISO-8859-7" },
138   /*{ "KOI8-R",     "KOI8-R" },*/
139   /*{ "KOI8-U",     "KOI8-U" },*/
140   /*{ "PT154",      "PT154" },*/
141     { "SJIS",       "SHIFT_JIS" },
142     { "eucCN",      "GB2312" },
143     { "eucJP",      "EUC-JP" },
144     { "eucKR",      "EUC-KR" },
145     { "eucTW",      "EUC-TW" }
146 #   define alias_table_defined
147 #  endif
148 #  if defined __OpenBSD__                                   /* OpenBSD */
149     { "646",        "ASCII" },
150     { "ISO8859-1",  "ISO-8859-1" },
151     { "ISO8859-13", "ISO-8859-13" },
152     { "ISO8859-15", "ISO-8859-15" },
153     { "ISO8859-2",  "ISO-8859-2" },
154     { "ISO8859-4",  "ISO-8859-4" },
155     { "ISO8859-5",  "ISO-8859-5" },
156     { "ISO8859-7",  "ISO-8859-7" },
157     { "US-ASCII",   "ASCII" }
158 #   define alias_table_defined
159 #  endif
160 #  if defined __APPLE__ && defined __MACH__                 /* Mac OS X */
161     /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is
162        useless:
163        - It returns the empty string when LANG is set to a locale of the
164          form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8
165          LC_CTYPE file.
166        - The environment variables LANG, LC_CTYPE, LC_ALL are not set by
167          the system; nl_langinfo(CODESET) returns "US-ASCII" in this case.
168        - The documentation says:
169            "... all code that calls BSD system routines should ensure
170             that the const *char parameters of these routines are in UTF-8
171             encoding. All BSD system functions expect their string
172             parameters to be in UTF-8 encoding and nothing else."
173          It also says
174            "An additional caveat is that string parameters for files,
175             paths, and other file-system entities must be in canonical
176             UTF-8. In a canonical UTF-8 Unicode string, all decomposable
177             characters are decomposed ..."
178          but this is not true: You can pass non-decomposed UTF-8 strings
179          to file system functions, and it is the OS which will convert
180          them to decomposed UTF-8 before accessing the file system.
181        - The Apple Terminal application displays UTF-8 by default.
182        - However, other applications are free to use different encodings:
183          - xterm uses ISO-8859-1 by default.
184          - TextEdit uses MacRoman by default.
185        We prefer UTF-8 over decomposed UTF-8-MAC because one should
186        minimize the use of decomposed Unicode. Unfortunately, through the
187        Darwin file system, decomposed UTF-8 strings are leaked into user
188        space nevertheless.
189        Then there are also the locales with encodings other than US-ASCII
190        and UTF-8. These locales can be occasionally useful to users (e.g.
191        when grepping through ISO-8859-1 encoded text files), when all their
192        file names are in US-ASCII.
193      */
194     { "ARMSCII-8",  "ARMSCII-8" },
195     { "Big5",       "BIG5" },
196     { "Big5HKSCS",  "BIG5-HKSCS" },
197     { "CP1131",     "CP1131" },
198     { "CP1251",     "CP1251" },
199     { "CP866",      "CP866" },
200     { "CP949",      "CP949" },
201     { "GB18030",    "GB18030" },
202     { "GB2312",     "GB2312" },
203     { "GBK",        "GBK" },
204   /*{ "ISCII-DEV",  "?" },*/
205     { "ISO8859-1",  "ISO-8859-1" },
206     { "ISO8859-13", "ISO-8859-13" },
207     { "ISO8859-15", "ISO-8859-15" },
208     { "ISO8859-2",  "ISO-8859-2" },
209     { "ISO8859-4",  "ISO-8859-4" },
210     { "ISO8859-5",  "ISO-8859-5" },
211     { "ISO8859-7",  "ISO-8859-7" },
212     { "ISO8859-9",  "ISO-8859-9" },
213     { "KOI8-R",     "KOI8-R" },
214     { "KOI8-U",     "KOI8-U" },
215     { "PT154",      "PT154" },
216     { "SJIS",       "SHIFT_JIS" },
217     { "eucCN",      "GB2312" },
218     { "eucJP",      "EUC-JP" },
219     { "eucKR",      "EUC-KR" }
220 #   define alias_table_defined
221 #  endif
222 #  if defined _AIX                                          /* AIX */
223   /*{ "GBK",        "GBK" },*/
224     { "IBM-1046",   "CP1046" },
225     { "IBM-1124",   "CP1124" },
226     { "IBM-1129",   "CP1129" },
227     { "IBM-1252",   "CP1252" },
228     { "IBM-850",    "CP850" },
229     { "IBM-856",    "CP856" },
230     { "IBM-921",    "ISO-8859-13" },
231     { "IBM-922",    "CP922" },
232     { "IBM-932",    "CP932" },
233     { "IBM-943",    "CP943" },
234     { "IBM-eucCN",  "GB2312" },
235     { "IBM-eucJP",  "EUC-JP" },
236     { "IBM-eucKR",  "EUC-KR" },
237     { "IBM-eucTW",  "EUC-TW" },
238     { "ISO8859-1",  "ISO-8859-1" },
239     { "ISO8859-15", "ISO-8859-15" },
240     { "ISO8859-2",  "ISO-8859-2" },
241     { "ISO8859-5",  "ISO-8859-5" },
242     { "ISO8859-6",  "ISO-8859-6" },
243     { "ISO8859-7",  "ISO-8859-7" },
244     { "ISO8859-8",  "ISO-8859-8" },
245     { "ISO8859-9",  "ISO-8859-9" },
246     { "TIS-620",    "TIS-620" },
247   /*{ "UTF-8",      "UTF-8" },*/
248     { "big5",       "BIG5" }
249 #   define alias_table_defined
250 #  endif
251 #  if defined __hpux                                        /* HP-UX */
252     { "SJIS",      "SHIFT_JIS" },
253     { "arabic8",   "HP-ARABIC8" },
254     { "big5",      "BIG5" },
255     { "cp1251",    "CP1251" },
256     { "eucJP",     "EUC-JP" },
257     { "eucKR",     "EUC-KR" },
258     { "eucTW",     "EUC-TW" },
259     { "gb18030",   "GB18030" },
260     { "greek8",    "HP-GREEK8" },
261     { "hebrew8",   "HP-HEBREW8" },
262     { "hkbig5",    "BIG5-HKSCS" },
263     { "hp15CN",    "GB2312" },
264     { "iso88591",  "ISO-8859-1" },
265     { "iso885913", "ISO-8859-13" },
266     { "iso885915", "ISO-8859-15" },
267     { "iso88592",  "ISO-8859-2" },
268     { "iso88594",  "ISO-8859-4" },
269     { "iso88595",  "ISO-8859-5" },
270     { "iso88596",  "ISO-8859-6" },
271     { "iso88597",  "ISO-8859-7" },
272     { "iso88598",  "ISO-8859-8" },
273     { "iso88599",  "ISO-8859-9" },
274     { "kana8",     "HP-KANA8" },
275     { "koi8r",     "KOI8-R" },
276     { "roman8",    "HP-ROMAN8" },
277     { "tis620",    "TIS-620" },
278     { "turkish8",  "HP-TURKISH8" },
279     { "utf8",      "UTF-8" }
280 #   define alias_table_defined
281 #  endif
282 #  if defined __sgi                                         /* IRIX */
283     { "ISO8859-1",  "ISO-8859-1" },
284     { "ISO8859-15", "ISO-8859-15" },
285     { "ISO8859-2",  "ISO-8859-2" },
286     { "ISO8859-5",  "ISO-8859-5" },
287     { "ISO8859-7",  "ISO-8859-7" },
288     { "ISO8859-9",  "ISO-8859-9" },
289     { "eucCN",      "GB2312" },
290     { "eucJP",      "EUC-JP" },
291     { "eucKR",      "EUC-KR" },
292     { "eucTW",      "EUC-TW" }
293 #   define alias_table_defined
294 #  endif
295 #  if defined __osf__                                       /* OSF/1 */
296   /*{ "GBK",        "GBK" },*/
297     { "ISO8859-1",  "ISO-8859-1" },
298     { "ISO8859-15", "ISO-8859-15" },
299     { "ISO8859-2",  "ISO-8859-2" },
300     { "ISO8859-4",  "ISO-8859-4" },
301     { "ISO8859-5",  "ISO-8859-5" },
302     { "ISO8859-7",  "ISO-8859-7" },
303     { "ISO8859-8",  "ISO-8859-8" },
304     { "ISO8859-9",  "ISO-8859-9" },
305     { "KSC5601",    "CP949" },
306     { "SJIS",       "SHIFT_JIS" },
307     { "TACTIS",     "TIS-620" },
308   /*{ "UTF-8",      "UTF-8" },*/
309     { "big5",       "BIG5" },
310     { "cp850",      "CP850" },
311     { "dechanyu",   "DEC-HANYU" },
312     { "dechanzi",   "GB2312" },
313     { "deckanji",   "DEC-KANJI" },
314     { "deckorean",  "EUC-KR" },
315     { "eucJP",      "EUC-JP" },
316     { "eucKR",      "EUC-KR" },
317     { "eucTW",      "EUC-TW" },
318     { "sdeckanji",  "EUC-JP" }
319 #   define alias_table_defined
320 #  endif
321 #  if defined __sun                                         /* Solaris */
322     { "5601",        "EUC-KR" },
323     { "646",         "ASCII" },
324   /*{ "BIG5",        "BIG5" },*/
325     { "Big5-HKSCS",  "BIG5-HKSCS" },
326     { "GB18030",     "GB18030" },
327   /*{ "GBK",         "GBK" },*/
328     { "ISO8859-1",   "ISO-8859-1" },
329     { "ISO8859-11",  "TIS-620" },
330     { "ISO8859-13",  "ISO-8859-13" },
331     { "ISO8859-15",  "ISO-8859-15" },
332     { "ISO8859-2",   "ISO-8859-2" },
333     { "ISO8859-3",   "ISO-8859-3" },
334     { "ISO8859-4",   "ISO-8859-4" },
335     { "ISO8859-5",   "ISO-8859-5" },
336     { "ISO8859-6",   "ISO-8859-6" },
337     { "ISO8859-7",   "ISO-8859-7" },
338     { "ISO8859-8",   "ISO-8859-8" },
339     { "ISO8859-9",   "ISO-8859-9" },
340     { "PCK",         "SHIFT_JIS" },
341     { "TIS620.2533", "TIS-620" },
342   /*{ "UTF-8",       "UTF-8" },*/
343     { "ansi-1251",   "CP1251" },
344     { "cns11643",    "EUC-TW" },
345     { "eucJP",       "EUC-JP" },
346     { "gb2312",      "GB2312" },
347     { "koi8-r",      "KOI8-R" }
348 #   define alias_table_defined
349 #  endif
350 #  if defined __minix                                       /* Minix */
351     { "646", "ASCII" }
352 #   define alias_table_defined
353 #  endif
354 #  if defined WINDOWS_NATIVE || defined __CYGWIN__          /* Windows */
355     { "CP1361",  "JOHAB" },
356     { "CP20127", "ASCII" },
357     { "CP20866", "KOI8-R" },
358     { "CP20936", "GB2312" },
359     { "CP21866", "KOI8-RU" },
360     { "CP28591", "ISO-8859-1" },
361     { "CP28592", "ISO-8859-2" },
362     { "CP28593", "ISO-8859-3" },
363     { "CP28594", "ISO-8859-4" },
364     { "CP28595", "ISO-8859-5" },
365     { "CP28596", "ISO-8859-6" },
366     { "CP28597", "ISO-8859-7" },
367     { "CP28598", "ISO-8859-8" },
368     { "CP28599", "ISO-8859-9" },
369     { "CP28605", "ISO-8859-15" },
370     { "CP38598", "ISO-8859-8" },
371     { "CP51932", "EUC-JP" },
372     { "CP51936", "GB2312" },
373     { "CP51949", "EUC-KR" },
374     { "CP51950", "EUC-TW" },
375     { "CP54936", "GB18030" },
376     { "CP65001", "UTF-8" },
377     { "CP936",   "GBK" }
378 #   define alias_table_defined
379 #  endif
380 #  if defined OS2                                           /* OS/2 */
381     /* The list of encodings is taken from "List of OS/2 Codepages"
382        by Alex Taylor:
383        <http://altsan.org/os2/toolkits/uls/index.html#codepages>.
384        See also "__convcp() of kLIBC":
385        <https://github.com/bitwiseworks/libc/blob/master/src/emx/src/lib/locale/__convcp.c>.  */
386     { "CP1004",        "CP1252" },
387   /*{ "CP1041",        "CP943" },*/
388   /*{ "CP1088",        "CP949" },*/
389     { "CP1089",        "ISO-8859-6" },
390   /*{ "CP1114",        "CP950" },*/
391   /*{ "CP1115",        "GB2312" },*/
392     { "CP1208",        "UTF-8" },
393   /*{ "CP1380",        "GB2312" },*/
394     { "CP1381",        "GB2312" },
395     { "CP1383",        "GB2312" },
396     { "CP1386",        "GBK" },
397   /*{ "CP301",         "CP943" },*/
398     { "CP3372",        "EUC-JP" },
399     { "CP4946",        "CP850" },
400   /*{ "CP5048",        "JIS_X0208-1990" },*/
401   /*{ "CP5049",        "JIS_X0212-1990" },*/
402   /*{ "CP5067",        "KS_C_5601-1987" },*/
403     { "CP813",         "ISO-8859-7" },
404     { "CP819",         "ISO-8859-1" },
405     { "CP878",         "KOI8-R" },
406   /*{ "CP897",         "CP943" },*/
407     { "CP912",         "ISO-8859-2" },
408     { "CP913",         "ISO-8859-3" },
409     { "CP914",         "ISO-8859-4" },
410     { "CP915",         "ISO-8859-5" },
411     { "CP916",         "ISO-8859-8" },
412     { "CP920",         "ISO-8859-9" },
413     { "CP921",         "ISO-8859-13" },
414     { "CP923",         "ISO-8859-15" },
415   /*{ "CP941",         "CP943" },*/
416   /*{ "CP947",         "CP950" },*/
417   /*{ "CP951",         "CP949" },*/
418   /*{ "CP952",         "JIS_X0208-1990" },*/
419   /*{ "CP953",         "JIS_X0212-1990" },*/
420     { "CP954",         "EUC-JP" },
421     { "CP964",         "EUC-TW" },
422     { "CP970",         "EUC-KR" },
423   /*{ "CP971",         "KS_C_5601-1987" },*/
424     { "IBM-1004",      "CP1252" },
425   /*{ "IBM-1006",      "?" },*/
426   /*{ "IBM-1008",      "?" },*/
427   /*{ "IBM-1041",      "CP943" },*/
428   /*{ "IBM-1051",      "?" },*/
429   /*{ "IBM-1088",      "CP949" },*/
430     { "IBM-1089",      "ISO-8859-6" },
431   /*{ "IBM-1098",      "?" },*/
432   /*{ "IBM-1114",      "CP950" },*/
433   /*{ "IBM-1115",      "GB2312" },*/
434   /*{ "IBM-1116",      "?" },*/
435   /*{ "IBM-1117",      "?" },*/
436   /*{ "IBM-1118",      "?" },*/
437   /*{ "IBM-1119",      "?" },*/
438     { "IBM-1124",      "CP1124" },
439     { "IBM-1125",      "CP1125" },
440     { "IBM-1131",      "CP1131" },
441     { "IBM-1208",      "UTF-8" },
442     { "IBM-1250",      "CP1250" },
443     { "IBM-1251",      "CP1251" },
444     { "IBM-1252",      "CP1252" },
445     { "IBM-1253",      "CP1253" },
446     { "IBM-1254",      "CP1254" },
447     { "IBM-1255",      "CP1255" },
448     { "IBM-1256",      "CP1256" },
449     { "IBM-1257",      "CP1257" },
450   /*{ "IBM-1275",      "?" },*/
451   /*{ "IBM-1276",      "?" },*/
452   /*{ "IBM-1277",      "?" },*/
453   /*{ "IBM-1280",      "?" },*/
454   /*{ "IBM-1281",      "?" },*/
455   /*{ "IBM-1282",      "?" },*/
456   /*{ "IBM-1283",      "?" },*/
457   /*{ "IBM-1380",      "GB2312" },*/
458     { "IBM-1381",      "GB2312" },
459     { "IBM-1383",      "GB2312" },
460     { "IBM-1386",      "GBK" },
461   /*{ "IBM-301",       "CP943" },*/
462     { "IBM-3372",      "EUC-JP" },
463     { "IBM-367",       "ASCII" },
464     { "IBM-437",       "CP437" },
465     { "IBM-4946",      "CP850" },
466   /*{ "IBM-5048",      "JIS_X0208-1990" },*/
467   /*{ "IBM-5049",      "JIS_X0212-1990" },*/
468   /*{ "IBM-5067",      "KS_C_5601-1987" },*/
469     { "IBM-813",       "ISO-8859-7" },
470     { "IBM-819",       "ISO-8859-1" },
471     { "IBM-850",       "CP850" },
472   /*{ "IBM-851",       "?" },*/
473     { "IBM-852",       "CP852" },
474     { "IBM-855",       "CP855" },
475     { "IBM-856",       "CP856" },
476     { "IBM-857",       "CP857" },
477   /*{ "IBM-859",       "?" },*/
478     { "IBM-860",       "CP860" },
479     { "IBM-861",       "CP861" },
480     { "IBM-862",       "CP862" },
481     { "IBM-863",       "CP863" },
482     { "IBM-864",       "CP864" },
483     { "IBM-865",       "CP865" },
484     { "IBM-866",       "CP866" },
485   /*{ "IBM-868",       "?" },*/
486     { "IBM-869",       "CP869" },
487     { "IBM-874",       "CP874" },
488     { "IBM-878",       "KOI8-R" },
489   /*{ "IBM-895",       "?" },*/
490   /*{ "IBM-897",       "CP943" },*/
491   /*{ "IBM-907",       "?" },*/
492   /*{ "IBM-909",       "?" },*/
493     { "IBM-912",       "ISO-8859-2" },
494     { "IBM-913",       "ISO-8859-3" },
495     { "IBM-914",       "ISO-8859-4" },
496     { "IBM-915",       "ISO-8859-5" },
497     { "IBM-916",       "ISO-8859-8" },
498     { "IBM-920",       "ISO-8859-9" },
499     { "IBM-921",       "ISO-8859-13" },
500     { "IBM-922",       "CP922" },
501     { "IBM-923",       "ISO-8859-15" },
502     { "IBM-932",       "CP932" },
503   /*{ "IBM-941",       "CP943" },*/
504   /*{ "IBM-942",       "?" },*/
505     { "IBM-943",       "CP943" },
506   /*{ "IBM-947",       "CP950" },*/
507     { "IBM-949",       "CP949" },
508     { "IBM-950",       "CP950" },
509   /*{ "IBM-951",       "CP949" },*/
510   /*{ "IBM-952",       "JIS_X0208-1990" },*/
511   /*{ "IBM-953",       "JIS_X0212-1990" },*/
512     { "IBM-954",       "EUC-JP" },
513   /*{ "IBM-955",       "?" },*/
514     { "IBM-964",       "EUC-TW" },
515     { "IBM-970",       "EUC-KR" },
516   /*{ "IBM-971",       "KS_C_5601-1987" },*/
517     { "IBM-eucCN",     "GB2312" },
518     { "IBM-eucJP",     "EUC-JP" },
519     { "IBM-eucKR",     "EUC-KR" },
520     { "IBM-eucTW",     "EUC-TW" },
521     { "IBM33722",      "EUC-JP" },
522     { "ISO8859-1",     "ISO-8859-1" },
523     { "ISO8859-2",     "ISO-8859-2" },
524     { "ISO8859-3",     "ISO-8859-3" },
525     { "ISO8859-4",     "ISO-8859-4" },
526     { "ISO8859-5",     "ISO-8859-5" },
527     { "ISO8859-6",     "ISO-8859-6" },
528     { "ISO8859-7",     "ISO-8859-7" },
529     { "ISO8859-8",     "ISO-8859-8" },
530     { "ISO8859-9",     "ISO-8859-9" },
531   /*{ "JISX0201-1976", "JISX0201-1976" },*/
532   /*{ "JISX0208-1978", "?" },*/
533   /*{ "JISX0208-1983", "JIS_X0208-1983" },*/
534   /*{ "JISX0208-1990", "JIS_X0208-1990" },*/
535   /*{ "JISX0212-1990", "JIS_X0212-1990" },*/
536   /*{ "KSC5601-1987",  "KS_C_5601-1987" },*/
537     { "SJIS-1",        "CP943" },
538     { "SJIS-2",        "CP943" },
539     { "eucJP",         "EUC-JP" },
540     { "eucKR",         "EUC-KR" },
541     { "eucTW-1993",    "EUC-TW" }
542 #   define alias_table_defined
543 #  endif
544 #  if defined VMS                                           /* OpenVMS */
545     /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
546        "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
547        section 10.7 "Handling Different Character Sets".  */
548     { "DECHANYU",  "DEC-HANYU" },
549     { "DECHANZI",  "GB2312" },
550     { "DECKANJI",  "DEC-KANJI" },
551     { "DECKOREAN", "EUC-KR" },
552     { "ISO8859-1", "ISO-8859-1" },
553     { "ISO8859-2", "ISO-8859-2" },
554     { "ISO8859-5", "ISO-8859-5" },
555     { "ISO8859-7", "ISO-8859-7" },
556     { "ISO8859-8", "ISO-8859-8" },
557     { "ISO8859-9", "ISO-8859-9" },
558     { "SDECKANJI", "EUC-JP" },
559     { "SJIS",      "SHIFT_JIS" },
560     { "eucJP",     "EUC-JP" },
561     { "eucTW",     "EUC-TW" }
562 #   define alias_table_defined
563 #  endif
564 #  ifndef alias_table_defined  /* No platform section above was selected.  */
565     /* Just a dummy entry, to avoid a C syntax error.  */
566     { "", "" }
567 #  endif
568   };
569 
570 # endif
571 
572 #else
573 
574 /* On these platforms, we use a mapping from locale name to GNU canonical
575    encoding name.  */
576 
577 struct table_entry  /* One row of locale_table: locale name -> GNU canonical encoding name.  */
578 {
579   const char locale[17+1];     /* Locale name, e.g. "de_DE.ISO_8859-1"; at most 17 chars + NUL.  */
580   const char canonical[11+1];  /* GNU canonical encoding name; at most 11 chars + NUL.  */
581 };  /* The #if branch further up defines a different struct with the same tag.  */
582 
583 /* Table of platform-dependent mappings (locale name -> GNU canonical encoding name), sorted in ascending byte order of the locale name.  */
584 static const struct table_entry locale_table[] =
585   {  /* The platform sections below carry no separating commas, so the initializer is only well-formed when at most one of them is active.  */
586 # if defined __FreeBSD__                                    /* FreeBSD 4.2 */
587     { "cs_CZ.ISO_8859-2",  "ISO-8859-2" },
588     { "da_DK.DIS_8859-15", "ISO-8859-15" },
589     { "da_DK.ISO_8859-1",  "ISO-8859-1" },
590     { "de_AT.DIS_8859-15", "ISO-8859-15" },
591     { "de_AT.ISO_8859-1",  "ISO-8859-1" },
592     { "de_CH.DIS_8859-15", "ISO-8859-15" },
593     { "de_CH.ISO_8859-1",  "ISO-8859-1" },
594     { "de_DE.DIS_8859-15", "ISO-8859-15" },
595     { "de_DE.ISO_8859-1",  "ISO-8859-1" },
596     { "en_AU.DIS_8859-15", "ISO-8859-15" },
597     { "en_AU.ISO_8859-1",  "ISO-8859-1" },
598     { "en_CA.DIS_8859-15", "ISO-8859-15" },
599     { "en_CA.ISO_8859-1",  "ISO-8859-1" },
600     { "en_GB.DIS_8859-15", "ISO-8859-15" },
601     { "en_GB.ISO_8859-1",  "ISO-8859-1" },
602     { "en_US.DIS_8859-15", "ISO-8859-15" },
603     { "en_US.ISO_8859-1",  "ISO-8859-1" },
604     { "es_ES.DIS_8859-15", "ISO-8859-15" },
605     { "es_ES.ISO_8859-1",  "ISO-8859-1" },
606     { "fi_FI.DIS_8859-15", "ISO-8859-15" },
607     { "fi_FI.ISO_8859-1",  "ISO-8859-1" },
608     { "fr_BE.DIS_8859-15", "ISO-8859-15" },
609     { "fr_BE.ISO_8859-1",  "ISO-8859-1" },
610     { "fr_CA.DIS_8859-15", "ISO-8859-15" },
611     { "fr_CA.ISO_8859-1",  "ISO-8859-1" },
612     { "fr_CH.DIS_8859-15", "ISO-8859-15" },
613     { "fr_CH.ISO_8859-1",  "ISO-8859-1" },
614     { "fr_FR.DIS_8859-15", "ISO-8859-15" },
615     { "fr_FR.ISO_8859-1",  "ISO-8859-1" },
616     { "hr_HR.ISO_8859-2",  "ISO-8859-2" },
617     { "hu_HU.ISO_8859-2",  "ISO-8859-2" },
618     { "is_IS.DIS_8859-15", "ISO-8859-15" },
619     { "is_IS.ISO_8859-1",  "ISO-8859-1" },
620     { "it_CH.DIS_8859-15", "ISO-8859-15" },
621     { "it_CH.ISO_8859-1",  "ISO-8859-1" },
622     { "it_IT.DIS_8859-15", "ISO-8859-15" },
623     { "it_IT.ISO_8859-1",  "ISO-8859-1" },
624     { "ja_JP.EUC",         "EUC-JP" },
625     { "ja_JP.SJIS",        "SHIFT_JIS" },
626     { "ja_JP.Shift_JIS",   "SHIFT_JIS" },
627     { "ko_KR.EUC",         "EUC-KR" },
628     { "la_LN.ASCII",       "ASCII" },
629     { "la_LN.DIS_8859-15", "ISO-8859-15" },
630     { "la_LN.ISO_8859-1",  "ISO-8859-1" },
631     { "la_LN.ISO_8859-2",  "ISO-8859-2" },
632     { "la_LN.ISO_8859-4",  "ISO-8859-4" },
633     { "lt_LN.ASCII",       "ASCII" },
634     { "lt_LN.DIS_8859-15", "ISO-8859-15" },
635     { "lt_LN.ISO_8859-1",  "ISO-8859-1" },
636     { "lt_LN.ISO_8859-2",  "ISO-8859-2" },
637     { "lt_LT.ISO_8859-4",  "ISO-8859-4" },
638     { "nl_BE.DIS_8859-15", "ISO-8859-15" },
639     { "nl_BE.ISO_8859-1",  "ISO-8859-1" },
640     { "nl_NL.DIS_8859-15", "ISO-8859-15" },
641     { "nl_NL.ISO_8859-1",  "ISO-8859-1" },
642     { "no_NO.DIS_8859-15", "ISO-8859-15" },
643     { "no_NO.ISO_8859-1",  "ISO-8859-1" },
644     { "pl_PL.ISO_8859-2",  "ISO-8859-2" },
645     { "pt_PT.DIS_8859-15", "ISO-8859-15" },
646     { "pt_PT.ISO_8859-1",  "ISO-8859-1" },
647     { "ru_RU.CP866",       "CP866" },
648     { "ru_RU.ISO_8859-5",  "ISO-8859-5" },
649     { "ru_RU.KOI8-R",      "KOI8-R" },
650     { "ru_SU.CP866",       "CP866" },
651     { "ru_SU.ISO_8859-5",  "ISO-8859-5" },
652     { "ru_SU.KOI8-R",      "KOI8-R" },
653     { "sl_SI.ISO_8859-2",  "ISO-8859-2" },
654     { "sv_SE.DIS_8859-15", "ISO-8859-15" },
655     { "sv_SE.ISO_8859-1",  "ISO-8859-1" },
656     { "uk_UA.KOI8-U",      "KOI8-U" },
657     { "zh_CN.EUC",         "GB2312" },
658     { "zh_TW.BIG5",        "BIG5" },
659     { "zh_TW.Big5",        "BIG5" }
660 #  define locale_table_defined  /* Records that a section supplied entries, so the dummy entry at the end is skipped.  */
661 # endif
662 # if defined __DJGPP__                                      /* DOS / DJGPP 2.03 */
663     /* The encodings given here may not all be correct.
664        If you find that the encoding given for your language and
665        country is not the one your DOS machine actually uses, just
666        correct it in this file, and send a mail to
667        Juan Manuel Guerrero <juan.guerrero@gmx.de>
668        and <bug-gnulib@gnu.org>.  */
669     { "C",     "ASCII" },
670     { "ar",    "CP864" },
671     { "ar_AE", "CP864" },
672     { "ar_DZ", "CP864" },
673     { "ar_EG", "CP864" },
674     { "ar_IQ", "CP864" },
675     { "ar_IR", "CP864" },
676     { "ar_JO", "CP864" },
677     { "ar_KW", "CP864" },
678     { "ar_MA", "CP864" },
679     { "ar_OM", "CP864" },
680     { "ar_QA", "CP864" },
681     { "ar_SA", "CP864" },
682     { "ar_SY", "CP864" },
683     { "be",    "CP866" },
684     { "be_BE", "CP866" },
685     { "bg",    "CP866" }, /* not CP855 ?? */
686     { "bg_BG", "CP866" }, /* not CP855 ?? */
687     { "ca",    "CP850" },
688     { "ca_ES", "CP850" },
689     { "cs",    "CP852" },
690     { "cs_CZ", "CP852" },
691     { "da",    "CP865" }, /* not CP850 ?? */
692     { "da_DK", "CP865" }, /* not CP850 ?? */
693     { "de",    "CP850" },
694     { "de_AT", "CP850" },
695     { "de_CH", "CP850" },
696     { "de_DE", "CP850" },
697     { "el",    "CP869" },
698     { "el_GR", "CP869" },
699     { "en",    "CP850" },
700     { "en_AU", "CP850" }, /* not CP437 ?? */
701     { "en_CA", "CP850" },
702     { "en_GB", "CP850" },
703     { "en_NZ", "CP437" },
704     { "en_US", "CP437" },
705     { "en_ZA", "CP850" }, /* not CP437 ?? */
706     { "eo",    "CP850" },
707     { "eo_EO", "CP850" },
708     { "es",    "CP850" },
709     { "es_AR", "CP850" },
710     { "es_BO", "CP850" },
711     { "es_CL", "CP850" },
712     { "es_CO", "CP850" },
713     { "es_CR", "CP850" },
714     { "es_CU", "CP850" },
715     { "es_DO", "CP850" },
716     { "es_EC", "CP850" },
717     { "es_ES", "CP850" },
718     { "es_GT", "CP850" },
719     { "es_HN", "CP850" },
720     { "es_MX", "CP850" },
721     { "es_NI", "CP850" },
722     { "es_PA", "CP850" },
723     { "es_PE", "CP850" },
724     { "es_PY", "CP850" },
725     { "es_SV", "CP850" },
726     { "es_UY", "CP850" },
727     { "es_VE", "CP850" },
728     { "et",    "CP850" },
729     { "et_EE", "CP850" },
730     { "eu",    "CP850" },
731     { "eu_ES", "CP850" },
732     { "fi",    "CP850" },
733     { "fi_FI", "CP850" },
734     { "fr",    "CP850" },
735     { "fr_BE", "CP850" },
736     { "fr_CA", "CP850" },
737     { "fr_CH", "CP850" },
738     { "fr_FR", "CP850" },
739     { "ga",    "CP850" },
740     { "ga_IE", "CP850" },
741     { "gd",    "CP850" },
742     { "gd_GB", "CP850" },
743     { "gl",    "CP850" },
744     { "gl_ES", "CP850" },
745     { "he",    "CP862" },
746     { "he_IL", "CP862" },
747     { "hr",    "CP852" },
748     { "hr_HR", "CP852" },
749     { "hu",    "CP852" },
750     { "hu_HU", "CP852" },
751     { "id",    "CP850" }, /* not CP437 ?? */
752     { "id_ID", "CP850" }, /* not CP437 ?? */
753     { "is",    "CP861" }, /* not CP850 ?? */
754     { "is_IS", "CP861" }, /* not CP850 ?? */
755     { "it",    "CP850" },
756     { "it_CH", "CP850" },
757     { "it_IT", "CP850" },
758     { "ja",    "CP932" },
759     { "ja_JP", "CP932" },
760     { "kr",    "CP949" }, /* not CP934 ?? */
761     { "kr_KR", "CP949" }, /* not CP934 ?? */
762     { "lt",    "CP775" },
763     { "lt_LT", "CP775" },
764     { "lv",    "CP775" },
765     { "lv_LV", "CP775" },
766     { "mk",    "CP866" }, /* not CP855 ?? */
767     { "mk_MK", "CP866" }, /* not CP855 ?? */
768     { "mt",    "CP850" },
769     { "mt_MT", "CP850" },
770     { "nb",    "CP865" }, /* not CP850 ?? */
771     { "nb_NO", "CP865" }, /* not CP850 ?? */
772     { "nl",    "CP850" },
773     { "nl_BE", "CP850" },
774     { "nl_NL", "CP850" },
775     { "nn",    "CP865" }, /* not CP850 ?? */
776     { "nn_NO", "CP865" }, /* not CP850 ?? */
777     { "no",    "CP865" }, /* not CP850 ?? */
778     { "no_NO", "CP865" }, /* not CP850 ?? */
779     { "pl",    "CP852" },
780     { "pl_PL", "CP852" },
781     { "pt",    "CP850" },
782     { "pt_BR", "CP850" },
783     { "pt_PT", "CP850" },
784     { "ro",    "CP852" },
785     { "ro_RO", "CP852" },
786     { "ru",    "CP866" },
787     { "ru_RU", "CP866" },
788     { "sk",    "CP852" },
789     { "sk_SK", "CP852" },
790     { "sl",    "CP852" },
791     { "sl_SI", "CP852" },
792     { "sq",    "CP852" },
793     { "sq_AL", "CP852" },
794     { "sr",    "CP852" }, /* CP852 or CP866 or CP855 ?? */
795     { "sr_CS", "CP852" }, /* CP852 or CP866 or CP855 ?? */
796     { "sr_YU", "CP852" }, /* CP852 or CP866 or CP855 ?? */
797     { "sv",    "CP850" },
798     { "sv_SE", "CP850" },
799     { "th",    "CP874" },
800     { "th_TH", "CP874" },
801     { "tr",    "CP857" },
802     { "tr_TR", "CP857" },
803     { "uk",    "CP1125" },
804     { "uk_UA", "CP1125" },
805     { "zh_CN", "GBK" },
806     { "zh_TW", "CP950" } /* not CP938 ?? */
807 #  define locale_table_defined
808 # endif
809 # ifndef locale_table_defined  /* No platform section above was selected.  */
810     /* Just a dummy entry, to avoid a C syntax error.  */
811     { "", "" }
812 # endif
813   };
814 
815 #endif
816 
817 
/* Determine the current locale's character encoding, and canonicalize it
   where a table entry for it is available.

   How the encoding is obtained depends on the platform:
     - with nl_langinfo (CODESET) when HAVE_LANGINFO_CODESET is set,
     - from setlocale()/GetACP() on native Windows,
     - from the environment or DosQueryCp() on OS/2,
     - otherwise from the environment variables LC_ALL, LC_CTYPE, LANG
       (in this order of precedence).

   The result must not be freed; it is statically allocated (or points into
   the environment).  The result becomes invalid when setlocale() is used to
   change the global locale, or when the value of one of the environment
   variables LC_ALL, LC_CTYPE, LANG is changed; threads in multithreaded
   programs should not do this.
   If the canonical name cannot be determined, the result is a non-canonical
   name.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;

  /* This function must be multithread-safe.  To achieve this without using
     thread-local storage, we use a simple strcpy or memcpy to fill this static
     buffer.  Filling it through, for example, strcpy + strcat would not be
     guaranteed to leave the buffer's contents intact if another thread is
     currently accessing it.  If necessary, the contents is first assembled in
     a stack-allocated buffer.  */

#if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  Return the suffix of the locale name from the
     environment variables (if present) or the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      static char resultbuf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                return dot;
              if (modifier - dot < sizeof (resultbuf))
                {
                  /* This way of filling resultbuf is multithread-safe.  */
                  memcpy (resultbuf, dot, modifier - dot);
                  resultbuf [modifier - dot] = '\0';
                  return resultbuf;
                }
              /* Otherwise the encoding name does not fit into resultbuf;
                 fall through to the codepage-based answer below.  */
            }
        }

      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().  This encoding is used by Cygwin, unless the user
         has set the environment variable CYGWIN=codepage:oem (which very few
         people do).
         Output directed to console windows needs to be converted (to
         GetOEMCP() if the console is using a raster font, or to
         GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
         this conversion transparently (see winsup/cygwin/fhandler_console.cc),
         converting to GetConsoleOutputCP().  This leads to correct results,
         except when SetConsoleOutputCP has been called and a raster font is
         in use.  */
      {
        char buf[2 + 10 + 1];

        /* Assemble on the stack first, then publish with a single strcpy.  */
        sprintf (buf, "CP%u", GetACP ());
        strcpy (resultbuf, buf);
        codeset = resultbuf;
      }
    }
#  endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

# elif defined WINDOWS_NATIVE

  char buf[2 + 10 + 1];
  static char resultbuf[2 + 10 + 1];

  /* The Windows API has a function returning the locale's codepage as
     a number, but the value doesn't change according to what the
     'setlocale' call specified.  So we use it as a last resort, in
     case the string returned by 'setlocale' doesn't specify the
     codepage.  */
  char *current_locale = setlocale (LC_CTYPE, NULL);
  /* setlocale is documented to return NULL on failure; strrchr must not be
     handed a null pointer, so treat that case like "no codepage suffix".  */
  char *pdot = (current_locale != NULL ? strrchr (current_locale, '.') : NULL);

  if (pdot && 2 + strlen (pdot + 1) + 1 <= sizeof (buf))
    sprintf (buf, "CP%s", pdot + 1);
  else
    {
      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().
         When the output goes to a console window, it needs to be provided in
         GetOEMCP() encoding if the console is using a raster font, or in
         GetConsoleOutputCP() encoding if it is using a TrueType font.
         But in GUI programs and for output sent to files and pipes, GetACP()
         encoding is the best bet.  */
      sprintf (buf, "CP%u", GetACP ());
    }
  /* For a locale name such as "French_France.65001", in Windows 10,
     setlocale now returns "French_France.utf8" instead.  */
  if (strcmp (buf + 2, "65001") == 0 || strcmp (buf + 2, "utf8") == 0)
    codeset = "UTF-8";
  else
    {
      strcpy (resultbuf, buf);
      codeset = resultbuf;
    }

# elif defined OS2

  const char *locale;
  static char resultbuf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* NULL means "not determined yet"; see the DosQueryCp fallback below.  */
  codeset = NULL;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          if (modifier - dot < sizeof (resultbuf))
            {
              /* This way of filling resultbuf is multithread-safe.  */
              memcpy (resultbuf, dot, modifier - dot);
              resultbuf [modifier - dot] = '\0';
              return resultbuf;
            }
        }

      /* For the POSIX locale, don't use the system's codepage.  */
      if (strcmp (locale, "C") == 0 || strcmp (locale, "POSIX") == 0)
        codeset = "";
    }

  if (codeset == NULL)
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          char buf[2 + 10 + 1];

          /* NOTE(review): cp[] is declared as ULONG but printed with %u;
             confirm that ULONG has the width of 'unsigned int' on the
             supported OS/2 toolchains.  */
          sprintf (buf, "CP%u", cp[0]);
          strcpy (resultbuf, buf);
          codeset = resultbuf;
        }
    }

# else

#  error "Add code for other platforms here."

# endif

  /* Resolve alias.  */
  {
# ifdef alias_table_defined
    /* On some platforms, UTF-8 locales are the most frequently used ones.
       Speed up the common case and slow down the less common cases by
       testing for this case first.  */
#  if defined __OpenBSD__ || (defined __APPLE__ && defined __MACH__) || defined __sun || defined __CYGWIN__
    if (strcmp (codeset, "UTF-8") == 0)
      goto done_table_lookup;
    else
#  endif
      {
        const struct table_entry * const table = alias_table;
        size_t const table_size =
          sizeof (alias_table) / sizeof (struct table_entry);
        /* The table is sorted.  Perform a binary search.  */
        size_t hi = table_size;
        size_t lo = 0;
        while (lo < hi)
          {
            /* Invariant:
               for i < lo, strcmp (table[i].alias, codeset) < 0,
               for i >= hi, strcmp (table[i].alias, codeset) > 0.  */
            size_t mid = (hi + lo) >> 1; /* >= lo, < hi */
            int cmp = strcmp (table[mid].alias, codeset);
            if (cmp < 0)
              lo = mid + 1;
            else if (cmp > 0)
              hi = mid;
            else
              {
                /* Found an i with
                     strcmp (table[i].alias, codeset) == 0.  */
                codeset = table[mid].canonical;
                goto done_table_lookup;
              }
          }
      }
    /* The label sits in the never-taken 'if (0)' arm, so that jumping to it
       skips the "not found" block in the 'else' arm below.  */
    if (0)
      done_table_lookup: ;
    else
# endif
      {
        /* Did not find it in the table.  */
        /* On Mac OS X, all modern locales use the UTF-8 encoding.
           BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
# if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
        codeset = "UTF-8";
# else
        /* Don't return an empty string.  GNU libc and GNU libiconv interpret
           the empty string as denoting "the locale's character encoding",
           thus GNU libiconv would call this function a second time.  */
        if (codeset[0] == '\0')
          codeset = "ASCII";
# endif
      }
  }

#else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like DJGPP) have only the C locale.  Therefore we don't use setlocale
     here; it would return "C" when it doesn't support the locale name the
     user has set.  */
# if 0
  locale = setlocale (LC_CTYPE, NULL);
# endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      /* The table lookup below passes LOCALE to strcmp, so it must not be
         a null pointer.  */
      if (locale == NULL)
        locale = "";
    }

  /* Map locale name to canonical encoding name.  */
  {
# ifdef locale_table_defined
    const struct table_entry * const table = locale_table;
    size_t const table_size =
      sizeof (locale_table) / sizeof (struct table_entry);
    /* The table is sorted.  Perform a binary search.  */
    size_t hi = table_size;
    size_t lo = 0;
    while (lo < hi)
      {
        /* Invariant:
           for i < lo, strcmp (table[i].locale, locale) < 0,
           for i >= hi, strcmp (table[i].locale, locale) > 0.  */
        size_t mid = (hi + lo) >> 1; /* >= lo, < hi */
        int cmp = strcmp (table[mid].locale, locale);
        if (cmp < 0)
          lo = mid + 1;
        else if (cmp > 0)
          hi = mid;
        else
          {
            /* Found an i with
                 strcmp (table[i].locale, locale) == 0.  */
            codeset = table[mid].canonical;
            goto done_table_lookup;
          }
      }
    /* The label sits in the never-taken 'if (0)' arm, so that jumping to it
       skips the "not found" block in the 'else' arm below.  */
    if (0)
      done_table_lookup: ;
    else
# endif
      {
        /* Did not find it in the table.  */
        /* On Mac OS X, all modern locales use the UTF-8 encoding.
           BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
# if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
        codeset = "UTF-8";
# else
        /* The canonical name cannot be determined.  */
        /* Don't return an empty string.  GNU libc and GNU libiconv interpret
           the empty string as denoting "the locale's character encoding",
           thus GNU libiconv would call this function a second time.  */
        codeset = "ASCII";
# endif
      }
  }

#endif

#ifdef DARWIN7
  /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
     (the default codeset) does not work when MB_CUR_MAX is 1.  */
  if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1)
    codeset = "ASCII";
#endif

  return codeset;
}
1160 
/* A variant of the above, without calls to `setlocale', `nl_langinfo',
   etc.  Only the environment variables LC_ALL, LC_CTYPE and LANG are
   consulted, in this order of precedence.

   If the selected locale name has a non-empty ".CODESET" part, that part
   is returned as is, without any "@modifier" trailer and without alias
   resolution.  The locale name "C" yields "ASCII".  In all other cases
   the encoding is treated as unknown and a default name is returned.

   The result is never NULL and never an empty string.  It must not be
   freed: it points into the environment, to a string literal, or to a
   static buffer that a later call may overwrite.  */
const char *
environ_locale_charset (void)
{
  static char buf[2 + 10 + 1];
  /* Start out as "unknown".  Every path that does not return early must
     leave CODESET pointing to a valid string, because it is dereferenced
     below.  */
  const char *codeset = "";
  const char *locale;

  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }

  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;
          size_t len;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          len = (modifier != NULL ? (size_t) (modifier - dot) : strlen (dot));

          /* An empty encoding part ("xx_YY." or "xx_YY.@mod") is treated
             like a missing one, so that no empty string is returned.  */
          if (len > 0)
            {
              if (modifier == NULL)
                return dot;
              if (len < sizeof (buf))
                {
                  memcpy (buf, dot, len);
                  buf[len] = '\0';
                  return buf;
                }
              /* The encoding name does not fit into BUF: treat it as
                 unknown and fall through to the defaults below.  */
            }
        }
      else if (strcmp (locale, "C") == 0)
        return "ASCII";
    }

  /* Resolve alias. */
  {
# ifdef alias_table_defined
    /* On some platforms, UTF-8 locales are the most frequently used ones.
       Speed up the common case and slow down the less common cases by
       testing for this case first.  */
#  if defined __OpenBSD__ || (defined __APPLE__ && defined __MACH__) || defined __sun || defined __CYGWIN__
    if (strcmp (codeset, "UTF-8") == 0)
      goto done_table_lookup;
    else
#  endif
      {
        const struct table_entry * const table = alias_table;
        size_t const table_size =
          sizeof (alias_table) / sizeof (struct table_entry);
        /* The table is sorted.  Perform a binary search.  */
        size_t hi = table_size;
        size_t lo = 0;
        while (lo < hi)
          {
            /* Invariant:
               for i < lo, strcmp (table[i].alias, codeset) < 0,
               for i >= hi, strcmp (table[i].alias, codeset) > 0.  */
            size_t mid = (hi + lo) >> 1; /* >= lo, < hi */
            int cmp = strcmp (table[mid].alias, codeset);
            if (cmp < 0)
              lo = mid + 1;
            else if (cmp > 0)
              hi = mid;
            else
              {
                /* Found an i with
                     strcmp (table[i].alias, codeset) == 0.  */
                codeset = table[mid].canonical;
                goto done_table_lookup;
              }
          }
      }
    /* The label sits in the never-taken 'if (0)' arm, so that jumping to it
       skips the "not found" block in the 'else' arm below.  */
    if (0)
      done_table_lookup: ;
    else
# endif
      {
        /* Did not find it in the table.  */
        /* On Mac OS X, all modern locales use the UTF-8 encoding.
           BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
# if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
        codeset = "UTF-8";
# else
        /* Don't return an empty string.  GNU libc and GNU libiconv interpret
           the empty string as denoting "the locale's character encoding",
           thus GNU libiconv would call this function a second time.  */
        if (codeset[0] == '\0')
          codeset = "ASCII";
# endif
      }
  }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.
     NOTE(review): the block above already replaces an empty name, so this
     fallback looks reachable only if an alias table entry has an empty
     canonical name -- confirm whether Latin-1 or ASCII is the intended
     default.  */
  if (codeset[0] == '\0')
    /* Default to Latin-1, for backward compatibility with Guile 1.8.  */
    codeset = "ISO-8859-1";

  return codeset;
}
1277