/* Determine a canonical name for the current locale's character encoding.

   Copyright (C) 2000-2003 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify it
   under the terms of the GNU Library General Public License as published
   by the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301,
   USA.  */
19*404b540aSrobert
/* Written by Bruno Haible <bruno@clisp.org>.  */
21*404b540aSrobert
22*404b540aSrobert #ifdef HAVE_CONFIG_H
23*404b540aSrobert # include <config.h>
24*404b540aSrobert #endif
25*404b540aSrobert
26*404b540aSrobert /* Specification. */
27*404b540aSrobert #include "localcharset.h"
28*404b540aSrobert
29*404b540aSrobert #if HAVE_STDDEF_H
30*404b540aSrobert # include <stddef.h>
31*404b540aSrobert #endif
32*404b540aSrobert
33*404b540aSrobert #include <stdio.h>
34*404b540aSrobert #if HAVE_STRING_H
35*404b540aSrobert # include <string.h>
36*404b540aSrobert #else
37*404b540aSrobert # include <strings.h>
38*404b540aSrobert #endif
39*404b540aSrobert #if HAVE_STDLIB_H
40*404b540aSrobert # include <stdlib.h>
41*404b540aSrobert #endif
42*404b540aSrobert
43*404b540aSrobert #if defined _WIN32 || defined __WIN32__
44*404b540aSrobert # undef WIN32 /* avoid warning on mingw32 */
45*404b540aSrobert # define WIN32
46*404b540aSrobert #endif
47*404b540aSrobert
48*404b540aSrobert #if defined __EMX__
49*404b540aSrobert /* Assume EMX program runs on OS/2, even if compiled under DOS. */
50*404b540aSrobert # define OS2
51*404b540aSrobert #endif
52*404b540aSrobert
53*404b540aSrobert #if !defined WIN32
54*404b540aSrobert # if HAVE_LANGINFO_CODESET
55*404b540aSrobert # include <langinfo.h>
56*404b540aSrobert # else
57*404b540aSrobert # if HAVE_SETLOCALE
58*404b540aSrobert # include <locale.h>
59*404b540aSrobert # endif
60*404b540aSrobert # endif
61*404b540aSrobert #elif defined WIN32
62*404b540aSrobert # define WIN32_LEAN_AND_MEAN
63*404b540aSrobert # include <windows.h>
64*404b540aSrobert #endif
65*404b540aSrobert #if defined OS2
66*404b540aSrobert # define INCL_DOS
67*404b540aSrobert # include <os2.h>
68*404b540aSrobert #endif
69*404b540aSrobert
70*404b540aSrobert #if ENABLE_RELOCATABLE
71*404b540aSrobert # include "relocatable.h"
72*404b540aSrobert #else
73*404b540aSrobert # define relocate(pathname) (pathname)
74*404b540aSrobert #endif
75*404b540aSrobert
76*404b540aSrobert #if defined _WIN32 || defined __WIN32__ || defined __EMX__ || defined __DJGPP__
77*404b540aSrobert /* Win32, OS/2, DOS */
78*404b540aSrobert # define ISSLASH(C) ((C) == '/' || (C) == '\\')
79*404b540aSrobert #endif
80*404b540aSrobert
81*404b540aSrobert #ifndef DIRECTORY_SEPARATOR
82*404b540aSrobert # define DIRECTORY_SEPARATOR '/'
83*404b540aSrobert #endif
84*404b540aSrobert
85*404b540aSrobert #ifndef ISSLASH
86*404b540aSrobert # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
87*404b540aSrobert #endif
88*404b540aSrobert
89*404b540aSrobert #ifdef HAVE_GETC_UNLOCKED
90*404b540aSrobert # undef getc
91*404b540aSrobert # define getc getc_unlocked
92*404b540aSrobert #endif
93*404b540aSrobert
94*404b540aSrobert /* The following static variable is declared 'volatile' to avoid a
95*404b540aSrobert possible multithread problem in the function get_charset_aliases. If we
96*404b540aSrobert are running in a threaded environment, and if two threads initialize
97*404b540aSrobert 'charset_aliases' simultaneously, both will produce the same value,
98*404b540aSrobert and everything will be ok if the two assignments to 'charset_aliases'
99*404b540aSrobert are atomic. But I don't know what will happen if the two assignments mix. */
100*404b540aSrobert #if __STDC__ != 1
101*404b540aSrobert # define volatile /* empty */
102*404b540aSrobert #endif
103*404b540aSrobert /* Pointer to the contents of the charset.alias file, if it has already been
104*404b540aSrobert read, else NULL. Its format is:
105*404b540aSrobert ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
106*404b540aSrobert static const char * volatile charset_aliases;
107*404b540aSrobert
108*404b540aSrobert /* Return a pointer to the contents of the charset.alias file. */
109*404b540aSrobert static const char *
get_charset_aliases()110*404b540aSrobert get_charset_aliases ()
111*404b540aSrobert {
112*404b540aSrobert const char *cp;
113*404b540aSrobert
114*404b540aSrobert cp = charset_aliases;
115*404b540aSrobert if (cp == NULL)
116*404b540aSrobert {
117*404b540aSrobert #if !(defined VMS || defined WIN32)
118*404b540aSrobert FILE *fp;
119*404b540aSrobert const char *dir = relocate (LIBDIR);
120*404b540aSrobert const char *base = "charset.alias";
121*404b540aSrobert char *file_name;
122*404b540aSrobert
123*404b540aSrobert /* Concatenate dir and base into freshly allocated file_name. */
124*404b540aSrobert {
125*404b540aSrobert size_t dir_len = strlen (dir);
126*404b540aSrobert size_t base_len = strlen (base);
127*404b540aSrobert int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
128*404b540aSrobert file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
129*404b540aSrobert if (file_name != NULL)
130*404b540aSrobert {
131*404b540aSrobert memcpy (file_name, dir, dir_len);
132*404b540aSrobert if (add_slash)
133*404b540aSrobert file_name[dir_len] = DIRECTORY_SEPARATOR;
134*404b540aSrobert memcpy (file_name + dir_len + add_slash, base, base_len + 1);
135*404b540aSrobert }
136*404b540aSrobert }
137*404b540aSrobert
138*404b540aSrobert if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
139*404b540aSrobert /* Out of memory or file not found, treat it as empty. */
140*404b540aSrobert cp = "";
141*404b540aSrobert else
142*404b540aSrobert {
143*404b540aSrobert /* Parse the file's contents. */
144*404b540aSrobert int c;
145*404b540aSrobert char buf1[50+1];
146*404b540aSrobert char buf2[50+1];
147*404b540aSrobert char *res_ptr = NULL;
148*404b540aSrobert size_t res_size = 0;
149*404b540aSrobert size_t l1, l2;
150*404b540aSrobert
151*404b540aSrobert for (;;)
152*404b540aSrobert {
153*404b540aSrobert c = getc (fp);
154*404b540aSrobert if (c == EOF)
155*404b540aSrobert break;
156*404b540aSrobert if (c == '\n' || c == ' ' || c == '\t')
157*404b540aSrobert continue;
158*404b540aSrobert if (c == '#')
159*404b540aSrobert {
160*404b540aSrobert /* Skip comment, to end of line. */
161*404b540aSrobert do
162*404b540aSrobert c = getc (fp);
163*404b540aSrobert while (!(c == EOF || c == '\n'));
164*404b540aSrobert if (c == EOF)
165*404b540aSrobert break;
166*404b540aSrobert continue;
167*404b540aSrobert }
168*404b540aSrobert ungetc (c, fp);
169*404b540aSrobert if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
170*404b540aSrobert break;
171*404b540aSrobert l1 = strlen (buf1);
172*404b540aSrobert l2 = strlen (buf2);
173*404b540aSrobert if (res_size == 0)
174*404b540aSrobert {
175*404b540aSrobert res_size = l1 + 1 + l2 + 1;
176*404b540aSrobert res_ptr = (char *) malloc (res_size + 1);
177*404b540aSrobert }
178*404b540aSrobert else
179*404b540aSrobert {
180*404b540aSrobert res_size += l1 + 1 + l2 + 1;
181*404b540aSrobert res_ptr = (char *) realloc (res_ptr, res_size + 1);
182*404b540aSrobert }
183*404b540aSrobert if (res_ptr == NULL)
184*404b540aSrobert {
185*404b540aSrobert /* Out of memory. */
186*404b540aSrobert res_size = 0;
187*404b540aSrobert break;
188*404b540aSrobert }
189*404b540aSrobert strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
190*404b540aSrobert strcpy (res_ptr + res_size - (l2 + 1), buf2);
191*404b540aSrobert }
192*404b540aSrobert fclose (fp);
193*404b540aSrobert if (res_size == 0)
194*404b540aSrobert cp = "";
195*404b540aSrobert else
196*404b540aSrobert {
197*404b540aSrobert *(res_ptr + res_size) = '\0';
198*404b540aSrobert cp = res_ptr;
199*404b540aSrobert }
200*404b540aSrobert }
201*404b540aSrobert
202*404b540aSrobert if (file_name != NULL)
203*404b540aSrobert free (file_name);
204*404b540aSrobert
205*404b540aSrobert #else
206*404b540aSrobert
207*404b540aSrobert # if defined VMS
208*404b540aSrobert /* To avoid the troubles of an extra file charset.alias_vms in the
209*404b540aSrobert sources of many GNU packages, simply inline the aliases here. */
210*404b540aSrobert /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
211*404b540aSrobert "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
212*404b540aSrobert section 10.7 "Handling Different Character Sets". */
213*404b540aSrobert cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
214*404b540aSrobert "ISO8859-2" "\0" "ISO-8859-2" "\0"
215*404b540aSrobert "ISO8859-5" "\0" "ISO-8859-5" "\0"
216*404b540aSrobert "ISO8859-7" "\0" "ISO-8859-7" "\0"
217*404b540aSrobert "ISO8859-8" "\0" "ISO-8859-8" "\0"
218*404b540aSrobert "ISO8859-9" "\0" "ISO-8859-9" "\0"
219*404b540aSrobert /* Japanese */
220*404b540aSrobert "eucJP" "\0" "EUC-JP" "\0"
221*404b540aSrobert "SJIS" "\0" "SHIFT_JIS" "\0"
222*404b540aSrobert "DECKANJI" "\0" "DEC-KANJI" "\0"
223*404b540aSrobert "SDECKANJI" "\0" "EUC-JP" "\0"
224*404b540aSrobert /* Chinese */
225*404b540aSrobert "eucTW" "\0" "EUC-TW" "\0"
226*404b540aSrobert "DECHANYU" "\0" "DEC-HANYU" "\0"
227*404b540aSrobert "DECHANZI" "\0" "GB2312" "\0"
228*404b540aSrobert /* Korean */
229*404b540aSrobert "DECKOREAN" "\0" "EUC-KR" "\0";
230*404b540aSrobert # endif
231*404b540aSrobert
232*404b540aSrobert # if defined WIN32
233*404b540aSrobert /* To avoid the troubles of installing a separate file in the same
234*404b540aSrobert directory as the DLL and of retrieving the DLL's directory at
235*404b540aSrobert runtime, simply inline the aliases here. */
236*404b540aSrobert
237*404b540aSrobert cp = "CP936" "\0" "GBK" "\0"
238*404b540aSrobert "CP1361" "\0" "JOHAB" "\0"
239*404b540aSrobert "CP20127" "\0" "ASCII" "\0"
240*404b540aSrobert "CP20866" "\0" "KOI8-R" "\0"
241*404b540aSrobert "CP21866" "\0" "KOI8-RU" "\0"
242*404b540aSrobert "CP28591" "\0" "ISO-8859-1" "\0"
243*404b540aSrobert "CP28592" "\0" "ISO-8859-2" "\0"
244*404b540aSrobert "CP28593" "\0" "ISO-8859-3" "\0"
245*404b540aSrobert "CP28594" "\0" "ISO-8859-4" "\0"
246*404b540aSrobert "CP28595" "\0" "ISO-8859-5" "\0"
247*404b540aSrobert "CP28596" "\0" "ISO-8859-6" "\0"
248*404b540aSrobert "CP28597" "\0" "ISO-8859-7" "\0"
249*404b540aSrobert "CP28598" "\0" "ISO-8859-8" "\0"
250*404b540aSrobert "CP28599" "\0" "ISO-8859-9" "\0"
251*404b540aSrobert "CP28605" "\0" "ISO-8859-15" "\0";
252*404b540aSrobert # endif
253*404b540aSrobert #endif
254*404b540aSrobert
255*404b540aSrobert charset_aliases = cp;
256*404b540aSrobert }
257*404b540aSrobert
258*404b540aSrobert return cp;
259*404b540aSrobert }
260*404b540aSrobert
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.
   The result must not be freed; it is statically allocated.
   If the canonical name cannot be determined, the result is a non-canonical
   name.

   The raw codeset name is obtained in a platform-specific way:
     - with nl_langinfo (CODESET) where available;
     - otherwise, on non-Win32, non-OS/2 systems, from the first non-empty
       one of the environment variables LC_ALL, LC_CTYPE, LANG;
     - on Win32, as "CP<n>" from GetACP ();
     - on OS/2, from the same environment variables if one is set (the
       encoding after a '.' in the locale name is returned directly,
       without alias lookup), else as "CP<n>" from DosQueryCp ().
   The raw name is then looked up in the table from get_charset_aliases;
   an alias entry "*" matches any name.  Apart from the OS/2 early
   returns, an empty result is replaced by "ASCII".  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset ()
{
  const char *codeset;
  const char *aliases;

#if !(defined WIN32 || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if HAVE_SETLOCALE && 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1".  On others,
     you set it to "language_COUNTRY.charset".  In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WIN32

  /* Room for "CP", up to 10 decimal digits, and the terminating NUL.  */
  static char buf[2 + 10 + 1];

  /* Woe32 has a function returning the locale's codepage as a number.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  /* Room for "CP", up to 10 decimal digits, and the terminating NUL.  */
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          /* modifier was found at or after dot, so the difference is
             non-negative; copy only if it fits in buf with its NUL.  */
          if ((size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* ULONG is an unsigned long type; print it with a matching
             conversion rather than "%u".  */
          sprintf (buf, "CP%lu", (unsigned long) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a sequence of (alias, canonical) string
     pairs ended by an empty string, so each step skips two strings.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        /* The canonical name directly follows the matching alias.  */
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

  return codeset;
}
399