xref: /openbsd/gnu/gcc/intl/localcharset.c (revision 404b540a)
1*404b540aSrobert /* Determine a canonical name for the current locale's character encoding.
2*404b540aSrobert 
3*404b540aSrobert    Copyright (C) 2000-2003 Free Software Foundation, Inc.
4*404b540aSrobert 
5*404b540aSrobert    This program is free software; you can redistribute it and/or modify it
6*404b540aSrobert    under the terms of the GNU Library General Public License as published
7*404b540aSrobert    by the Free Software Foundation; either version 2, or (at your option)
8*404b540aSrobert    any later version.
9*404b540aSrobert 
10*404b540aSrobert    This program is distributed in the hope that it will be useful,
11*404b540aSrobert    but WITHOUT ANY WARRANTY; without even the implied warranty of
12*404b540aSrobert    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13*404b540aSrobert    Library General Public License for more details.
14*404b540aSrobert 
15*404b540aSrobert    You should have received a copy of the GNU Library General Public
16*404b540aSrobert    License along with this program; if not, write to the Free Software
17*404b540aSrobert    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301,
18*404b540aSrobert    USA.  */
19*404b540aSrobert 
20*404b540aSrobert /* Written by Bruno Haible <bruno@clisp.org>.  */
21*404b540aSrobert 
22*404b540aSrobert #ifdef HAVE_CONFIG_H
23*404b540aSrobert # include <config.h>
24*404b540aSrobert #endif
25*404b540aSrobert 
26*404b540aSrobert /* Specification.  */
27*404b540aSrobert #include "localcharset.h"
28*404b540aSrobert 
29*404b540aSrobert #if HAVE_STDDEF_H
30*404b540aSrobert # include <stddef.h>
31*404b540aSrobert #endif
32*404b540aSrobert 
33*404b540aSrobert #include <stdio.h>
34*404b540aSrobert #if HAVE_STRING_H
35*404b540aSrobert # include <string.h>
36*404b540aSrobert #else
37*404b540aSrobert # include <strings.h>
38*404b540aSrobert #endif
39*404b540aSrobert #if HAVE_STDLIB_H
40*404b540aSrobert # include <stdlib.h>
41*404b540aSrobert #endif
42*404b540aSrobert 
43*404b540aSrobert #if defined _WIN32 || defined __WIN32__
44*404b540aSrobert # undef WIN32   /* avoid warning on mingw32 */
45*404b540aSrobert # define WIN32
46*404b540aSrobert #endif
47*404b540aSrobert 
48*404b540aSrobert #if defined __EMX__
49*404b540aSrobert /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
50*404b540aSrobert # define OS2
51*404b540aSrobert #endif
52*404b540aSrobert 
53*404b540aSrobert #if !defined WIN32
54*404b540aSrobert # if HAVE_LANGINFO_CODESET
55*404b540aSrobert #  include <langinfo.h>
56*404b540aSrobert # else
57*404b540aSrobert #  if HAVE_SETLOCALE
58*404b540aSrobert #   include <locale.h>
59*404b540aSrobert #  endif
60*404b540aSrobert # endif
61*404b540aSrobert #elif defined WIN32
62*404b540aSrobert # define WIN32_LEAN_AND_MEAN
63*404b540aSrobert # include <windows.h>
64*404b540aSrobert #endif
65*404b540aSrobert #if defined OS2
66*404b540aSrobert # define INCL_DOS
67*404b540aSrobert # include <os2.h>
68*404b540aSrobert #endif
69*404b540aSrobert 
70*404b540aSrobert #if ENABLE_RELOCATABLE
71*404b540aSrobert # include "relocatable.h"
72*404b540aSrobert #else
73*404b540aSrobert # define relocate(pathname) (pathname)
74*404b540aSrobert #endif
75*404b540aSrobert 
76*404b540aSrobert #if defined _WIN32 || defined __WIN32__ || defined __EMX__ || defined __DJGPP__
77*404b540aSrobert   /* Win32, OS/2, DOS */
78*404b540aSrobert # define ISSLASH(C) ((C) == '/' || (C) == '\\')
79*404b540aSrobert #endif
80*404b540aSrobert 
81*404b540aSrobert #ifndef DIRECTORY_SEPARATOR
82*404b540aSrobert # define DIRECTORY_SEPARATOR '/'
83*404b540aSrobert #endif
84*404b540aSrobert 
85*404b540aSrobert #ifndef ISSLASH
86*404b540aSrobert # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
87*404b540aSrobert #endif
88*404b540aSrobert 
89*404b540aSrobert #ifdef HAVE_GETC_UNLOCKED
90*404b540aSrobert # undef getc
91*404b540aSrobert # define getc getc_unlocked
92*404b540aSrobert #endif
93*404b540aSrobert 
94*404b540aSrobert /* The following static variable is declared 'volatile' to avoid a
95*404b540aSrobert    possible multithread problem in the function get_charset_aliases. If we
96*404b540aSrobert    are running in a threaded environment, and if two threads initialize
97*404b540aSrobert    'charset_aliases' simultaneously, both will produce the same value,
98*404b540aSrobert    and everything will be ok if the two assignments to 'charset_aliases'
99*404b540aSrobert    are atomic. But I don't know what will happen if the two assignments mix.  */
100*404b540aSrobert #if __STDC__ != 1
101*404b540aSrobert # define volatile /* empty */
102*404b540aSrobert #endif
103*404b540aSrobert /* Pointer to the contents of the charset.alias file, if it has already been
104*404b540aSrobert    read, else NULL.  Its format is:
105*404b540aSrobert    ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
106*404b540aSrobert static const char * volatile charset_aliases;
107*404b540aSrobert 
108*404b540aSrobert /* Return a pointer to the contents of the charset.alias file.  */
109*404b540aSrobert static const char *
get_charset_aliases()110*404b540aSrobert get_charset_aliases ()
111*404b540aSrobert {
112*404b540aSrobert   const char *cp;
113*404b540aSrobert 
114*404b540aSrobert   cp = charset_aliases;
115*404b540aSrobert   if (cp == NULL)
116*404b540aSrobert     {
117*404b540aSrobert #if !(defined VMS || defined WIN32)
118*404b540aSrobert       FILE *fp;
119*404b540aSrobert       const char *dir = relocate (LIBDIR);
120*404b540aSrobert       const char *base = "charset.alias";
121*404b540aSrobert       char *file_name;
122*404b540aSrobert 
123*404b540aSrobert       /* Concatenate dir and base into freshly allocated file_name.  */
124*404b540aSrobert       {
125*404b540aSrobert 	size_t dir_len = strlen (dir);
126*404b540aSrobert 	size_t base_len = strlen (base);
127*404b540aSrobert 	int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
128*404b540aSrobert 	file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
129*404b540aSrobert 	if (file_name != NULL)
130*404b540aSrobert 	  {
131*404b540aSrobert 	    memcpy (file_name, dir, dir_len);
132*404b540aSrobert 	    if (add_slash)
133*404b540aSrobert 	      file_name[dir_len] = DIRECTORY_SEPARATOR;
134*404b540aSrobert 	    memcpy (file_name + dir_len + add_slash, base, base_len + 1);
135*404b540aSrobert 	  }
136*404b540aSrobert       }
137*404b540aSrobert 
138*404b540aSrobert       if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
139*404b540aSrobert 	/* Out of memory or file not found, treat it as empty.  */
140*404b540aSrobert 	cp = "";
141*404b540aSrobert       else
142*404b540aSrobert 	{
143*404b540aSrobert 	  /* Parse the file's contents.  */
144*404b540aSrobert 	  int c;
145*404b540aSrobert 	  char buf1[50+1];
146*404b540aSrobert 	  char buf2[50+1];
147*404b540aSrobert 	  char *res_ptr = NULL;
148*404b540aSrobert 	  size_t res_size = 0;
149*404b540aSrobert 	  size_t l1, l2;
150*404b540aSrobert 
151*404b540aSrobert 	  for (;;)
152*404b540aSrobert 	    {
153*404b540aSrobert 	      c = getc (fp);
154*404b540aSrobert 	      if (c == EOF)
155*404b540aSrobert 		break;
156*404b540aSrobert 	      if (c == '\n' || c == ' ' || c == '\t')
157*404b540aSrobert 		continue;
158*404b540aSrobert 	      if (c == '#')
159*404b540aSrobert 		{
160*404b540aSrobert 		  /* Skip comment, to end of line.  */
161*404b540aSrobert 		  do
162*404b540aSrobert 		    c = getc (fp);
163*404b540aSrobert 		  while (!(c == EOF || c == '\n'));
164*404b540aSrobert 		  if (c == EOF)
165*404b540aSrobert 		    break;
166*404b540aSrobert 		  continue;
167*404b540aSrobert 		}
168*404b540aSrobert 	      ungetc (c, fp);
169*404b540aSrobert 	      if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
170*404b540aSrobert 		break;
171*404b540aSrobert 	      l1 = strlen (buf1);
172*404b540aSrobert 	      l2 = strlen (buf2);
173*404b540aSrobert 	      if (res_size == 0)
174*404b540aSrobert 		{
175*404b540aSrobert 		  res_size = l1 + 1 + l2 + 1;
176*404b540aSrobert 		  res_ptr = (char *) malloc (res_size + 1);
177*404b540aSrobert 		}
178*404b540aSrobert 	      else
179*404b540aSrobert 		{
180*404b540aSrobert 		  res_size += l1 + 1 + l2 + 1;
181*404b540aSrobert 		  res_ptr = (char *) realloc (res_ptr, res_size + 1);
182*404b540aSrobert 		}
183*404b540aSrobert 	      if (res_ptr == NULL)
184*404b540aSrobert 		{
185*404b540aSrobert 		  /* Out of memory. */
186*404b540aSrobert 		  res_size = 0;
187*404b540aSrobert 		  break;
188*404b540aSrobert 		}
189*404b540aSrobert 	      strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
190*404b540aSrobert 	      strcpy (res_ptr + res_size - (l2 + 1), buf2);
191*404b540aSrobert 	    }
192*404b540aSrobert 	  fclose (fp);
193*404b540aSrobert 	  if (res_size == 0)
194*404b540aSrobert 	    cp = "";
195*404b540aSrobert 	  else
196*404b540aSrobert 	    {
197*404b540aSrobert 	      *(res_ptr + res_size) = '\0';
198*404b540aSrobert 	      cp = res_ptr;
199*404b540aSrobert 	    }
200*404b540aSrobert 	}
201*404b540aSrobert 
202*404b540aSrobert       if (file_name != NULL)
203*404b540aSrobert 	free (file_name);
204*404b540aSrobert 
205*404b540aSrobert #else
206*404b540aSrobert 
207*404b540aSrobert # if defined VMS
208*404b540aSrobert       /* To avoid the troubles of an extra file charset.alias_vms in the
209*404b540aSrobert 	 sources of many GNU packages, simply inline the aliases here.  */
210*404b540aSrobert       /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
211*404b540aSrobert 	 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
212*404b540aSrobert 	 section 10.7 "Handling Different Character Sets".  */
213*404b540aSrobert       cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
214*404b540aSrobert 	   "ISO8859-2" "\0" "ISO-8859-2" "\0"
215*404b540aSrobert 	   "ISO8859-5" "\0" "ISO-8859-5" "\0"
216*404b540aSrobert 	   "ISO8859-7" "\0" "ISO-8859-7" "\0"
217*404b540aSrobert 	   "ISO8859-8" "\0" "ISO-8859-8" "\0"
218*404b540aSrobert 	   "ISO8859-9" "\0" "ISO-8859-9" "\0"
219*404b540aSrobert 	   /* Japanese */
220*404b540aSrobert 	   "eucJP" "\0" "EUC-JP" "\0"
221*404b540aSrobert 	   "SJIS" "\0" "SHIFT_JIS" "\0"
222*404b540aSrobert 	   "DECKANJI" "\0" "DEC-KANJI" "\0"
223*404b540aSrobert 	   "SDECKANJI" "\0" "EUC-JP" "\0"
224*404b540aSrobert 	   /* Chinese */
225*404b540aSrobert 	   "eucTW" "\0" "EUC-TW" "\0"
226*404b540aSrobert 	   "DECHANYU" "\0" "DEC-HANYU" "\0"
227*404b540aSrobert 	   "DECHANZI" "\0" "GB2312" "\0"
228*404b540aSrobert 	   /* Korean */
229*404b540aSrobert 	   "DECKOREAN" "\0" "EUC-KR" "\0";
230*404b540aSrobert # endif
231*404b540aSrobert 
232*404b540aSrobert # if defined WIN32
233*404b540aSrobert       /* To avoid the troubles of installing a separate file in the same
234*404b540aSrobert 	 directory as the DLL and of retrieving the DLL's directory at
235*404b540aSrobert 	 runtime, simply inline the aliases here.  */
236*404b540aSrobert 
237*404b540aSrobert       cp = "CP936" "\0" "GBK" "\0"
238*404b540aSrobert 	   "CP1361" "\0" "JOHAB" "\0"
239*404b540aSrobert 	   "CP20127" "\0" "ASCII" "\0"
240*404b540aSrobert 	   "CP20866" "\0" "KOI8-R" "\0"
241*404b540aSrobert 	   "CP21866" "\0" "KOI8-RU" "\0"
242*404b540aSrobert 	   "CP28591" "\0" "ISO-8859-1" "\0"
243*404b540aSrobert 	   "CP28592" "\0" "ISO-8859-2" "\0"
244*404b540aSrobert 	   "CP28593" "\0" "ISO-8859-3" "\0"
245*404b540aSrobert 	   "CP28594" "\0" "ISO-8859-4" "\0"
246*404b540aSrobert 	   "CP28595" "\0" "ISO-8859-5" "\0"
247*404b540aSrobert 	   "CP28596" "\0" "ISO-8859-6" "\0"
248*404b540aSrobert 	   "CP28597" "\0" "ISO-8859-7" "\0"
249*404b540aSrobert 	   "CP28598" "\0" "ISO-8859-8" "\0"
250*404b540aSrobert 	   "CP28599" "\0" "ISO-8859-9" "\0"
251*404b540aSrobert 	   "CP28605" "\0" "ISO-8859-15" "\0";
252*404b540aSrobert # endif
253*404b540aSrobert #endif
254*404b540aSrobert 
255*404b540aSrobert       charset_aliases = cp;
256*404b540aSrobert     }
257*404b540aSrobert 
258*404b540aSrobert   return cp;
259*404b540aSrobert }
260*404b540aSrobert 
/* Return a name for the character encoding of the current locale.

   A platform-specific raw codeset name is obtained first.  It is then
   looked up in the table returned by get_charset_aliases (); when an alias
   matches, or a "*" wildcard entry is reached, the associated canonical
   name is used instead.  If nothing matches, the raw, non-canonical name
   is returned.
   The result is never NULL and never the empty string.  It must not be
   freed by the caller.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset ()
{
  const char *codeset;
  const char *entry;

#if !(defined WIN32 || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* nl_langinfo (CODESET) is available: ask the C library directly.  */
  codeset = nl_langinfo (CODESET);

# else

  /* No nl_langinfo: derive the name from the locale environment
     variables, in the order LC_ALL, LC_CTYPE, LANG.  */
  const char *locale = NULL;

  /* setlocale is deliberately disabled here.  Many old systems (such as
     SunOS 4 or DJGPP) have only the C locale, so setlocale would return
     "C" whenever it doesn't support the locale name the user has set.  */
#  if HAVE_SETLOCALE && 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    locale = getenv ("LC_CTYPE");
  if (locale == NULL || locale[0] == '\0')
    locale = getenv ("LANG");

  /* The locale name may be something like "iso8859_1" or
     "language_COUNTRY.charset".  Either way it is resolved through the
     alias table below.  */
  codeset = locale;

# endif

#elif defined WIN32

  static char codepage_name[2 + 10 + 1];

  /* Woe32 reports the locale's codepage as a number; format it as "CPnnn".  */
  sprintf (codepage_name, "CP%u", GetACP ());
  codeset = codepage_name;

#elif defined OS2

  const char *locale;
  static char result_buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* The standard language environment variables take precedence over the
     codepage that is set in the operating system.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    locale = getenv ("LC_CTYPE");
  if (locale == NULL || locale[0] == '\0')
    locale = getenv ("LANG");

  if (locale == NULL || locale[0] == '\0')
    {
      /* Nothing in the environment: ask OS/2 for the codepage number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen) == 0)
	{
	  sprintf (result_buf, "CP%u", cp[0]);
	  codeset = result_buf;
	}
      else
	codeset = "";
    }
  else
    {
      /* An encoding that follows a dot in the locale name is returned
	 directly, without going through the alias table.  */
      const char *encoding = strchr (locale, '.');

      if (encoding != NULL)
	{
	  const char *modifier;

	  encoding++;
	  /* Cut off a trailing "@modifier", if there is one.  */
	  modifier = strchr (encoding, '@');
	  if (modifier == NULL)
	    return encoding;
	  if (modifier - encoding < sizeof (result_buf))
	    {
	      memcpy (result_buf, encoding, modifier - encoding);
	      result_buf [modifier - encoding] = '\0';
	      return result_buf;
	    }
	  /* The encoding does not fit in result_buf: fall through and
	     resolve the whole locale name instead.  */
	}

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }

#endif

  /* The canonical name cannot be determined.  */
  if (codeset == NULL)
    codeset = "";

  /* Resolve alias.  The table is a list of (alias, canonical) string pairs
     that ends with an empty string.  */
  entry = get_charset_aliases ();
  while (*entry != '\0')
    {
      const char *canonical = entry + strlen (entry) + 1;

      if (strcmp (codeset, entry) == 0
	  || (entry[0] == '*' && entry[1] == '\0'))
	{
	  codeset = canonical;
	  break;
	}
      /* Step over the canonical name to the next alias.  */
      entry = canonical + strlen (canonical) + 1;
    }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  return (codeset[0] != '\0' ? codeset : "ASCII");
}
399