/* -*- buffer-read-only: t -*- vi: set ro: */
/* DO NOT EDIT! GENERATED AUTOMATICALLY! */
/* Determine a canonical name for the current locale's character encoding.

   Copyright (C) 2000-2006, 2008-2014 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License along
   with this program; if not, see <http://www.gnu.org/licenses/>.  */

/* Written by Bruno Haible <bruno@clisp.org>.  */

22 #include <config.h>
23 
24 /* Specification.  */
25 #include "localcharset.h"
26 
27 #include <fcntl.h>
28 #include <stddef.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <stdlib.h>
32 
33 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
34 # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
35 #endif
36 
37 #if defined _WIN32 || defined __WIN32__
38 # define WINDOWS_NATIVE
39 #endif
40 
41 #if defined __EMX__
42 /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
43 # ifndef OS2
44 #  define OS2
45 # endif
46 #endif
47 
48 #if !defined WINDOWS_NATIVE
49 # include <unistd.h>
50 # if HAVE_LANGINFO_CODESET
51 #  include <langinfo.h>
52 # else
53 #  if 0 /* see comment below */
54 #   include <locale.h>
55 #  endif
56 # endif
57 # ifdef __CYGWIN__
58 #  define WIN32_LEAN_AND_MEAN
59 #  include <windows.h>
60 # endif
61 #elif defined WINDOWS_NATIVE
62 # define WIN32_LEAN_AND_MEAN
63 # include <windows.h>
64 #endif
65 #if defined OS2
66 # define INCL_DOS
67 # include <os2.h>
68 #endif
69 
70 /* For MB_CUR_MAX_L */
71 #if defined DARWIN7
72 # include <xlocale.h>
73 #endif
74 
75 #if ENABLE_RELOCATABLE
76 # include "relocatable.h"
77 #else
78 # define relocate(pathname) (pathname)
79 #endif
80 
81 /* Get LIBDIR.  */
82 #ifndef LIBDIR
83 # include "configmake.h"
84 #endif
85 
86 /* Define O_NOFOLLOW to 0 on platforms where it does not exist.  */
87 #ifndef O_NOFOLLOW
88 # define O_NOFOLLOW 0
89 #endif
90 
91 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
92   /* Native Windows, Cygwin, OS/2, DOS */
93 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
94 #endif
95 
96 #ifndef DIRECTORY_SEPARATOR
97 # define DIRECTORY_SEPARATOR '/'
98 #endif
99 
100 #ifndef ISSLASH
101 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
102 #endif
103 
104 #if HAVE_DECL_GETC_UNLOCKED
105 # undef getc
106 # define getc getc_unlocked
107 #endif
108 
/* The following static variable is declared 'volatile' to avoid a
   possible multithread problem in the function get_charset_aliases.  If we
   are running in a threaded environment, and if two threads initialize
   'charset_aliases' simultaneously, both will produce the same value,
   and everything will be ok if the two assignments to 'charset_aliases'
   are atomic.  But I don't know what will happen if the two assignments
   mix.  */
#if __STDC__ != 1
/* Compilers that do not claim ISO C conformance get the qualifier
   removed.  */
# define volatile /* empty */
#endif
/* Pointer to the contents of the charset.alias file, if it has already been
   read, else NULL.  Its format is:
   ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
static const char * volatile charset_aliases;

123 /* Return a pointer to the contents of the charset.alias file.  */
124 static const char *
get_charset_aliases(void)125 get_charset_aliases (void)
126 {
127   const char *cp;
128 
129   cp = charset_aliases;
130   if (cp == NULL)
131     {
132 #if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__)
133       const char *dir;
134       const char *base = "charset.alias";
135       char *file_name;
136 
137       /* Make it possible to override the charset.alias location.  This is
138          necessary for running the testsuite before "make install".  */
139       dir = getenv ("CHARSETALIASDIR");
140       if (dir == NULL || dir[0] == '\0')
141         dir = relocate (LIBDIR);
142 
143       /* Concatenate dir and base into freshly allocated file_name.  */
144       {
145         size_t dir_len = strlen (dir);
146         size_t base_len = strlen (base);
147         int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
148         file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
149         if (file_name != NULL)
150           {
151             memcpy (file_name, dir, dir_len);
152             if (add_slash)
153               file_name[dir_len] = DIRECTORY_SEPARATOR;
154             memcpy (file_name + dir_len + add_slash, base, base_len + 1);
155           }
156       }
157 
158       if (file_name == NULL)
159         /* Out of memory.  Treat the file as empty.  */
160         cp = "";
161       else
162         {
163           int fd;
164 
165           /* Open the file.  Reject symbolic links on platforms that support
166              O_NOFOLLOW.  This is a security feature.  Without it, an attacker
167              could retrieve parts of the contents (namely, the tail of the
168              first line that starts with "* ") of an arbitrary file by placing
169              a symbolic link to that file under the name "charset.alias" in
170              some writable directory and defining the environment variable
171              CHARSETALIASDIR to point to that directory.  */
172           fd = open (file_name,
173                      O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
174           if (fd < 0)
175             /* File not found.  Treat it as empty.  */
176             cp = "";
177           else
178             {
179               FILE *fp;
180 
181               fp = fdopen (fd, "r");
182               if (fp == NULL)
183                 {
184                   /* Out of memory.  Treat the file as empty.  */
185                   close (fd);
186                   cp = "";
187                 }
188               else
189                 {
190                   /* Parse the file's contents.  */
191                   char *res_ptr = NULL;
192                   size_t res_size = 0;
193 
194                   for (;;)
195                     {
196                       int c;
197                       char buf1[50+1];
198                       char buf2[50+1];
199                       size_t l1, l2;
200                       char *old_res_ptr;
201 
202                       c = getc (fp);
203                       if (c == EOF)
204                         break;
205                       if (c == '\n' || c == ' ' || c == '\t')
206                         continue;
207                       if (c == '#')
208                         {
209                           /* Skip comment, to end of line.  */
210                           do
211                             c = getc (fp);
212                           while (!(c == EOF || c == '\n'));
213                           if (c == EOF)
214                             break;
215                           continue;
216                         }
217                       ungetc (c, fp);
218                       if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
219                         break;
220                       l1 = strlen (buf1);
221                       l2 = strlen (buf2);
222                       old_res_ptr = res_ptr;
223                       if (res_size == 0)
224                         {
225                           res_size = l1 + 1 + l2 + 1;
226                           res_ptr = (char *) malloc (res_size + 1);
227                         }
228                       else
229                         {
230                           res_size += l1 + 1 + l2 + 1;
231                           res_ptr = (char *) realloc (res_ptr, res_size + 1);
232                         }
233                       if (res_ptr == NULL)
234                         {
235                           /* Out of memory. */
236                           res_size = 0;
237                           free (old_res_ptr);
238                           break;
239                         }
240                       strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
241                       strcpy (res_ptr + res_size - (l2 + 1), buf2);
242                     }
243                   fclose (fp);
244                   if (res_size == 0)
245                     cp = "";
246                   else
247                     {
248                       *(res_ptr + res_size) = '\0';
249                       cp = res_ptr;
250                     }
251                 }
252             }
253 
254           free (file_name);
255         }
256 
257 #else
258 
259 # if defined DARWIN7
260       /* To avoid the trouble of installing a file that is shared by many
261          GNU packages -- many packaging systems have problems with this --,
262          simply inline the aliases here.  */
263       cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
264            "ISO8859-2" "\0" "ISO-8859-2" "\0"
265            "ISO8859-4" "\0" "ISO-8859-4" "\0"
266            "ISO8859-5" "\0" "ISO-8859-5" "\0"
267            "ISO8859-7" "\0" "ISO-8859-7" "\0"
268            "ISO8859-9" "\0" "ISO-8859-9" "\0"
269            "ISO8859-13" "\0" "ISO-8859-13" "\0"
270            "ISO8859-15" "\0" "ISO-8859-15" "\0"
271            "KOI8-R" "\0" "KOI8-R" "\0"
272            "KOI8-U" "\0" "KOI8-U" "\0"
273            "CP866" "\0" "CP866" "\0"
274            "CP949" "\0" "CP949" "\0"
275            "CP1131" "\0" "CP1131" "\0"
276            "CP1251" "\0" "CP1251" "\0"
277            "eucCN" "\0" "GB2312" "\0"
278            "GB2312" "\0" "GB2312" "\0"
279            "eucJP" "\0" "EUC-JP" "\0"
280            "eucKR" "\0" "EUC-KR" "\0"
281            "Big5" "\0" "BIG5" "\0"
282            "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
283            "GBK" "\0" "GBK" "\0"
284            "GB18030" "\0" "GB18030" "\0"
285            "SJIS" "\0" "SHIFT_JIS" "\0"
286            "ARMSCII-8" "\0" "ARMSCII-8" "\0"
287            "PT154" "\0" "PT154" "\0"
288          /*"ISCII-DEV" "\0" "?" "\0"*/
289            "*" "\0" "UTF-8" "\0";
290 # endif
291 
292 # if defined VMS
293       /* To avoid the troubles of an extra file charset.alias_vms in the
294          sources of many GNU packages, simply inline the aliases here.  */
295       /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
296          "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
297          section 10.7 "Handling Different Character Sets".  */
298       cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
299            "ISO8859-2" "\0" "ISO-8859-2" "\0"
300            "ISO8859-5" "\0" "ISO-8859-5" "\0"
301            "ISO8859-7" "\0" "ISO-8859-7" "\0"
302            "ISO8859-8" "\0" "ISO-8859-8" "\0"
303            "ISO8859-9" "\0" "ISO-8859-9" "\0"
304            /* Japanese */
305            "eucJP" "\0" "EUC-JP" "\0"
306            "SJIS" "\0" "SHIFT_JIS" "\0"
307            "DECKANJI" "\0" "DEC-KANJI" "\0"
308            "SDECKANJI" "\0" "EUC-JP" "\0"
309            /* Chinese */
310            "eucTW" "\0" "EUC-TW" "\0"
311            "DECHANYU" "\0" "DEC-HANYU" "\0"
312            "DECHANZI" "\0" "GB2312" "\0"
313            /* Korean */
314            "DECKOREAN" "\0" "EUC-KR" "\0";
315 # endif
316 
317 # if defined WINDOWS_NATIVE || defined __CYGWIN__
318       /* To avoid the troubles of installing a separate file in the same
319          directory as the DLL and of retrieving the DLL's directory at
320          runtime, simply inline the aliases here.  */
321 
322       cp = "CP936" "\0" "GBK" "\0"
323            "CP1361" "\0" "JOHAB" "\0"
324            "CP20127" "\0" "ASCII" "\0"
325            "CP20866" "\0" "KOI8-R" "\0"
326            "CP20936" "\0" "GB2312" "\0"
327            "CP21866" "\0" "KOI8-RU" "\0"
328            "CP28591" "\0" "ISO-8859-1" "\0"
329            "CP28592" "\0" "ISO-8859-2" "\0"
330            "CP28593" "\0" "ISO-8859-3" "\0"
331            "CP28594" "\0" "ISO-8859-4" "\0"
332            "CP28595" "\0" "ISO-8859-5" "\0"
333            "CP28596" "\0" "ISO-8859-6" "\0"
334            "CP28597" "\0" "ISO-8859-7" "\0"
335            "CP28598" "\0" "ISO-8859-8" "\0"
336            "CP28599" "\0" "ISO-8859-9" "\0"
337            "CP28605" "\0" "ISO-8859-15" "\0"
338            "CP38598" "\0" "ISO-8859-8" "\0"
339            "CP51932" "\0" "EUC-JP" "\0"
340            "CP51936" "\0" "GB2312" "\0"
341            "CP51949" "\0" "EUC-KR" "\0"
342            "CP51950" "\0" "EUC-TW" "\0"
343            "CP54936" "\0" "GB18030" "\0"
344            "CP65001" "\0" "UTF-8" "\0";
345 # endif
346 #endif
347 
348       charset_aliases = cp;
349     }
350 
351   return cp;
352 }
353 
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.
   The result must not be freed; it is statically allocated.
   If the canonical name cannot be determined, the result is a non-canonical
   name.

   The raw codeset name is obtained in a platform-dependent way:
     - with nl_langinfo (CODESET) where HAVE_LANGINFO_CODESET is set;
     - otherwise from the environment variables LC_ALL, LC_CTYPE, LANG
       (first one that is set and non-empty);
     - on native Windows from GetACP (), formatted as "CP<number>";
     - on OS/2 from the environment variables or, if none of them is set
       and non-empty, from DosQueryCp (), formatted as "CP<number>".
   It is then looked up in the table returned by get_charset_aliases ();
   an alias entry named "*" matches any codeset.

   On Cygwin and OS/2, an encoding found after the '.' of the locale
   name in the environment is returned directly, without alias lookup.

   The result is never NULL and never the empty string; "ASCII" is
   returned in place of an empty name.  Some code paths return a pointer
   into a static buffer that a later call may overwrite.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;
  const char *aliases;

#if !(defined WINDOWS_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  Return the suffix of the locale name from the
     environment variables (if present) or the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      /* Room for "CP", up to 10 decimal digits, and the NUL.  */
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                return dot;
              /* modifier >= dot here, so the difference is non-negative
                 and the conversion to size_t is value-preserving.  */
              if ((size_t) (modifier - dot) < sizeof (buf))
                {
                  memcpy (buf, dot, modifier - dot);
                  buf [modifier - dot] = '\0';
                  return buf;
                }
            }
        }

      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().  This encoding is used by Cygwin, unless the user
         has set the environment variable CYGWIN=codepage:oem (which very few
         people do).
         Output directed to console windows needs to be converted (to
         GetOEMCP() if the console is using a raster font, or to
         GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
         this conversion transparently (see winsup/cygwin/fhandler_console.cc),
         converting to GetConsoleOutputCP().  This leads to correct results,
         except when SetConsoleOutputCP has been called and a raster font is
         in use.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1". On others,
     you set it to "language_COUNTRY.charset". In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WINDOWS_NATIVE

  /* Room for "CP", up to 10 decimal digits, and the NUL.  */
  static char buf[2 + 10 + 1];

  /* The Windows API has a function returning the locale's codepage as a
     number: GetACP().
     When the output goes to a console window, it needs to be provided in
     GetOEMCP() encoding if the console is using a raster font, or in
     GetConsoleOutputCP() encoding if it is using a TrueType font.
     But in GUI programs and for output sent to files and pipes, GetACP()
     encoding is the best bet.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  /* Room for "CP", up to 10 decimal digits, and the NUL.  */
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          /* modifier >= dot here, so the difference is non-negative and
             the conversion to size_t is value-preserving.  */
          if ((size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* Convert explicitly so that the argument type matches the
             conversion specifier whatever ULONG is defined as.  */
          sprintf (buf, "CP%lu", (unsigned long) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table consists of ALIAS '\0' CANONICAL '\0' pairs
     and ends at the first empty ALIAS; each iteration step skips over
     one pair.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        /* The canonical name follows the alias in the table.  */
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

#ifdef DARWIN7
  /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
     (the default codeset) does not work when MB_CUR_MAX is 1.  */
  if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1)
    codeset = "ASCII";
#endif

  return codeset;
}