/* Determine a canonical name for the current locale's character encoding.

   Copyright (C) 2000-2002 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify it
   under the terms of the GNU Library General Public License as published
   by the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
   USA.  */

/* Written by Bruno Haible <haible@clisp.cons.org>.  */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#if HAVE_STDDEF_H
# include <stddef.h>
#endif

#include <stdio.h>
#if HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif
#if HAVE_STDLIB_H
# include <stdlib.h>
#endif

#if defined _WIN32 || defined __WIN32__
# undef WIN32   /* avoid warning on mingw32 */
# define WIN32
#endif

#if defined __EMX__
/* Assume EMX program runs on OS/2, even if compiled under DOS.  */
# define OS2
#endif

#if !defined WIN32
# if HAVE_LANGINFO_CODESET
#  include <langinfo.h>
# else
#  if HAVE_SETLOCALE
#   include <locale.h>
#  endif
# endif
#elif defined WIN32
# define WIN32_LEAN_AND_MEAN
# include <windows.h>
#endif
#if defined OS2
# define INCL_DOS
# include <os2.h>
#endif

#if defined _WIN32 || defined __WIN32__ || defined __EMX__ || defined __DJGPP__
  /* Win32, OS/2, DOS */
# define ISSLASH(C) ((C) == '/' || (C) == '\\')
#endif

#ifndef DIRECTORY_SEPARATOR
# define DIRECTORY_SEPARATOR '/'
#endif

#ifndef ISSLASH
# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
#endif

#ifdef HAVE_GETC_UNLOCKED
# undef getc
# define getc getc_unlocked
#endif

/* The following static variable is declared 'volatile' to avoid a
   possible multithread problem in the function get_charset_aliases. If we
   are running in a threaded environment, and if two threads initialize
   'charset_aliases' simultaneously, both will produce the same value,
   and everything will be ok if the two assignments to 'charset_aliases'
   are atomic. But I don't know what will happen if the two assignments mix.  */
#if __STDC__ != 1
# define volatile /* empty */
#endif
/* Pointer to the contents of the charset.alias file, if it has already been
   read, else NULL.  Its format is:
   ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
static const char * volatile charset_aliases;

/* Return a pointer to the contents of the charset.alias file, in the
   format described above for 'charset_aliases'.

   The table is built on the first call and cached in 'charset_aliases';
   later calls return the cached pointer.  On Win32 the table is a string
   literal compiled into this file.  Elsewhere it is parsed from
   LIBDIR/charset.alias, where LIBDIR must be defined by the build.

   If the file name cannot be allocated, the file cannot be opened, the
   file contains no complete "alias canonical" pair, or memory runs out
   while parsing, the result is the empty table "" (and that is what gets
   cached).  The result must not be freed by the caller.  */
static const char *
get_charset_aliases ()
{
  const char *cp;

  cp = charset_aliases;
  if (cp == NULL)
    {
#if !defined WIN32
      FILE *fp;
      const char *dir = LIBDIR;
      const char *base = "charset.alias";
      char *file_name;

      /* Concatenate dir and base into freshly allocated file_name.  */
      {
        size_t dir_len = strlen (dir);
        size_t base_len = strlen (base);
        /* A separator is only needed if dir is non-empty and does not
           already end in one.  */
        int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
        file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
        if (file_name != NULL)
          {
            memcpy (file_name, dir, dir_len);
            if (add_slash)
              file_name[dir_len] = DIRECTORY_SEPARATOR;
            /* base_len + 1 also copies the terminating NUL.  */
            memcpy (file_name + dir_len + add_slash, base, base_len + 1);
          }
      }

      if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
        /* Out of memory or file not found, treat it as empty.  */
        cp = "";
      else
        {
          /* Parse the file's contents.  */
          int c;
          char buf1[50+1];
          char buf2[50+1];
          char *res_ptr = NULL;   /* table built so far, or NULL */
          size_t res_size = 0;    /* bytes used in res_ptr, without the
                                     final extra NUL */
          size_t l1, l2;

          for (;;)
            {
              char *new_ptr;
              size_t entry_size;

              c = getc (fp);
              if (c == EOF)
                break;
              if (c == '\n' || c == ' ' || c == '\t')
                continue;
              if (c == '#')
                {
                  /* Skip comment, to end of line.  */
                  do
                    c = getc (fp);
                  while (!(c == EOF || c == '\n'));
                  if (c == EOF)
                    break;
                  continue;
                }
              ungetc (c, fp);
              /* The field width 50 matches the size of buf1 and buf2
                 (50 characters plus the terminating NUL).  */
              if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
                break;
              l1 = strlen (buf1);
              l2 = strlen (buf2);
              entry_size = l1 + 1 + l2 + 1;

              /* Grow the table by one entry, always leaving room for the
                 extra NUL that terminates the whole table.  The result of
                 realloc goes into a temporary so that the old block is not
                 lost if the call fails.  */
              if (res_size == 0)
                new_ptr = (char *) malloc (entry_size + 1);
              else
                new_ptr = (char *) realloc (res_ptr, res_size + entry_size + 1);
              if (new_ptr == NULL)
                {
                  /* Out of memory.  Release what was collected so far and
                     fall back to the empty table.  */
                  if (res_ptr != NULL)
                    free (res_ptr);
                  res_ptr = NULL;
                  res_size = 0;
                  break;
                }
              res_ptr = new_ptr;

              /* Append ALIAS '\0' CANONICAL '\0'.  */
              memcpy (res_ptr + res_size, buf1, l1 + 1);
              memcpy (res_ptr + res_size + l1 + 1, buf2, l2 + 1);
              res_size += entry_size;
            }
          fclose (fp);
          if (res_size == 0)
            cp = "";
          else
            {
              /* Terminate the table with an empty alias.  */
              *(res_ptr + res_size) = '\0';
              cp = res_ptr;
            }
        }

      /* The NULL check is kept for C libraries that predate ISO C, where
         free (NULL) is not guaranteed to be harmless.  */
      if (file_name != NULL)
        free (file_name);

#else

      /* To avoid the troubles of installing a separate file in the same
         directory as the DLL and of retrieving the DLL's directory at
         runtime, simply inline the aliases here.  */

# if defined WIN32
      cp = "CP936" "\0" "GBK" "\0"
           "CP1361" "\0" "JOHAB" "\0";
# endif
#endif

      charset_aliases = cp;
    }

  return cp;
}

/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.
   The result must not be freed; it is statically allocated.
   If the canonical name cannot be determined, the result is a non-canonical
   name.

   The raw name comes from nl_langinfo (CODESET) where available, from the
   LC_ALL / LC_CTYPE / LANG environment variables otherwise, from GetACP ()
   on Win32, and from the environment or DosQueryCp () on OS/2.  It is then
   looked up in the table returned by get_charset_aliases (); an alias
   entry consisting of the single character '*' matches any name.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset ()
{
  const char *codeset;
  const char *aliases;

#if !(defined WIN32 || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if HAVE_SETLOCALE && 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1". On others,
     you set it to "language_COUNTRY.charset". In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WIN32

  static char buf[2 + 10 + 1];

  /* Win32 has a function returning the locale's codepage as a number.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          /* modifier was found by searching forward from dot, so the
             difference is non-negative and the conversion is safe.  If the
             encoding does not fit in buf, fall through and resolve the
             whole locale name instead.  */
          if ((size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* Convert explicitly so that the argument type matches the
             conversion specifier whatever ULONG is defined as.  */
          sprintf (buf, "CP%lu", (unsigned long) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  Each table entry is a pair of NUL-terminated strings,
     so the loop advances over two strings per iteration.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        /* The canonical name directly follows the alias.  */
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  return codeset;
}