1 /*
2 * Main part of code, written by:
3 *
4 * Copyright (C) 1999-2001 Håvard Kvålen <havardk@xmms.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 * 02110-1301, USA.
20 *
21 */
22
23 #include "config.h"
24
25 #include "charset.h"
26
27 #include <stdlib.h>
28 #include <glib/gi18n.h>
29
30 #ifdef HAVE_LANGINFO_CODESET
31 #include <langinfo.h>
32 #endif
33
34 #include "setting.h"
35 #include "log.h"
36 #include "win32/win32dep.h"
37
/* One entry of the character-set table: a user-visible label paired with the
 * encoding name that the label stands for. */
typedef struct
{
    const gchar *charset_title; /* label, marked with N_() in the table and translated with _() when displayed */
    const gchar *charset_name;  /* encoding name, e.g. "ISO-8859-1", as returned by et_charset_get_name_from_index() */
} CharsetInfo;
43
44 #define CHARSET_TRANS_ARRAY_LEN ( sizeof(charset_trans_array) / sizeof((charset_trans_array)[0]) )
45 static const CharsetInfo charset_trans_array[] = {
46 {N_("Arabic (IBM-864)"), "IBM864" },
47 {N_("Arabic (ISO-8859-6)"), "ISO-8859-6" },
48 {N_("Arabic (Windows-1256)"), "windows-1256" },
49 {N_("Baltic (ISO-8859-13)"), "ISO-8859-13" },
50 {N_("Baltic (ISO-8859-4)"), "ISO-8859-4" },
51 {N_("Baltic (Windows-1257)"), "windows-1257" },
52 {N_("Celtic (ISO-8859-14)"), "ISO-8859-14" },
53 {N_("Central European (IBM-852)"), "IBM852" },
54 {N_("Central European (ISO-8859-2)"), "ISO-8859-2" },
55 {N_("Central European (Windows-1250)"), "windows-1250" },
56 {N_("Chinese Simplified (GB18030)"), "gb18030" },
57 {N_("Chinese Simplified (GB2312)"), "GB2312" },
58 {N_("Chinese Traditional (Big5)"), "Big5" },
59 {N_("Chinese Traditional (Big5-HKSCS)"), "Big5-HKSCS" },
60 {N_("Cyrillic (IBM-855)"), "IBM855" },
61 {N_("Cyrillic (ISO-8859-5)"), "ISO-8859-5" },
62 {N_("Cyrillic (ISO-IR-111)"), "ISO-IR-111" },
63 {N_("Cyrillic (KOI8-R)"), "KOI8-R" },
64 {N_("Cyrillic (Windows-1251)"), "windows-1251" },
65 {N_("Cyrillic/Russian (CP-866)"), "IBM866" },
66 {N_("Cyrillic/Ukrainian (KOI8-U)"), "KOI8-U" },
67 {N_("English (US-ASCII)"), "us-ascii" },
68 {N_("Greek (ISO-8859-7)"), "ISO-8859-7" },
69 {N_("Greek (Windows-1253)"), "windows-1253" },
70 {N_("Hebrew (IBM-862)"), "IBM862" },
71 {N_("Hebrew (Windows-1255)"), "windows-1255" },
72 {N_("Japanese (EUC-JP)"), "EUC-JP" },
73 {N_("Japanese (ISO-2022-JP)"), "ISO-2022-JP" },
74 {N_("Japanese (Shift_JIS)"), "Shift_JIS" },
75 {N_("Korean (EUC-KR)"), "EUC-KR" },
76 {N_("Nordic (ISO-8859-10)"), "ISO-8859-10" },
77 {N_("South European (ISO-8859-3)"), "ISO-8859-3" },
78 {N_("Thai (TIS-620)"), "TIS-620" },
79 {N_("Turkish (IBM-857)"), "IBM857" },
80 {N_("Turkish (ISO-8859-9)"), "ISO-8859-9" },
81 {N_("Turkish (Windows-1254)"), "windows-1254" },
82 //{N_("Unicode (UTF-7)"), "UTF-7" },
83 {N_("Unicode (UTF-8)"), "UTF-8" },
84
85 //{N_("Unicode (UTF-16BE)"), "UTF-16BE" },
86 //{N_("Unicode (UTF-16LE)"), "UTF-16LE" },
87 //{N_("Unicode (UTF-32BE)"), "UTF-32BE" },
88 //{N_("Unicode (UTF-32LE)"), "UTF-32LE" },
89
90 {N_("Vietnamese (VISCII)"), "VISCII" },
91 {N_("Vietnamese (Windows-1258)"), "windows-1258" },
92 {N_("Visual Hebrew (ISO-8859-8)"), "ISO-8859-8" },
93 {N_("Western (IBM-850)"), "IBM850" },
94 {N_("Western (ISO-8859-1)"), "ISO-8859-1" },
95 {N_("Western (ISO-8859-15)"), "ISO-8859-15" },
96 {N_("Western (Windows-1252)"), "windows-1252" }
97
98 /*
99 * From this point, character sets aren't supported by iconv
100 */
101 /* {N_("Arabic (IBM-864-I)"), "IBM864i" },
102 {N_("Arabic (ISO-8859-6-E)"), "ISO-8859-6-E" },
103 {N_("Arabic (ISO-8859-6-I)"), "ISO-8859-6-I" },
104 {N_("Arabic (MacArabic)"), "x-mac-arabic" },
105 {N_("Armenian (ARMSCII-8)"), "armscii-8" },
106 {N_("Central European (MacCE)"), "x-mac-ce" },
107 {N_("Chinese Simplified (GBK)"), "x-gbk" },
108 {N_("Chinese Simplified (HZ)"), "HZ-GB-2312" },
109 {N_("Chinese Traditional (EUC-TW)"), "x-euc-tw" },
110 {N_("Croatian (MacCroatian)"), "x-mac-croatian" },
111 {N_("Cyrillic (MacCyrillic)"), "x-mac-cyrillic" },
112 {N_("Cyrillic/Ukrainian (MacUkrainian)"), "x-mac-ukrainian" },
113 {N_("Farsi (MacFarsi)"), "x-mac-farsi"},
114 {N_("Greek (MacGreek)"), "x-mac-greek" },
115 {N_("Gujarati (MacGujarati)"), "x-mac-gujarati" },
116 {N_("Gurmukhi (MacGurmukhi)"), "x-mac-gurmukhi" },
117 {N_("Hebrew (ISO-8859-8-E)"), "ISO-8859-8-E" },
118 {N_("Hebrew (ISO-8859-8-I)"), "ISO-8859-8-I" },
119 {N_("Hebrew (MacHebrew)"), "x-mac-hebrew" },
120 {N_("Hindi (MacDevanagari)"), "x-mac-devanagari" },
121 {N_("Icelandic (MacIcelandic)"), "x-mac-icelandic" },
122 {N_("Korean (JOHAB)"), "x-johab" },
123 {N_("Korean (UHC)"), "x-windows-949" },
124 {N_("Romanian (MacRomanian)"), "x-mac-romanian" },
125 {N_("Turkish (MacTurkish)"), "x-mac-turkish" },
126 {N_("User Defined"), "x-user-defined" },
127 {N_("Vietnamese (TCVN)"), "x-viet-tcvn5712" },
128 {N_("Vietnamese (VPS)"), "x-viet-vps" },
129 {N_("Western (MacRoman)"), "x-mac-roman" },
130 // charsets whithout possibly translatable names
131 {"T61.8bit", "T61.8bit" },
132 {"x-imap4-modified-utf7", "x-imap4-modified-utf7"},
133 {"x-u-escaped", "x-u-escaped" },
134 {"windows-936", "windows-936" }
135 */
136 };
137
/* Lookup table from locale name to legacy encoding name. It is created and
 * filled by Charset_Insert_Locales_Init(), queried by
 * get_encoding_from_locale() and released by
 * Charset_Insert_Locales_Destroy(). */
static GHashTable *encodings;
139
140
141 /* stolen from gnome-desktop-item.c */
142 static gboolean
check_locale(const char * locale)143 check_locale (const char *locale)
144 {
145 GIConv cd = g_iconv_open ("UTF-8", locale);
146 if ((GIConv)-1 == cd)
147 return FALSE;
148 g_iconv_close (cd);
149 return TRUE;
150 }
151
152 /* stolen from gnome-desktop-item.c */
153 G_GNUC_NULL_TERMINATED static void
insert_locales(GHashTable * encs,const gchar * enc,...)154 insert_locales (GHashTable *encs, const gchar *enc, ...)
155 {
156 va_list args;
157 char *s;
158
159 va_start (args, enc);
160 for (;;)
161 {
162 s = va_arg (args, char *);
163 if (s == NULL)
164 break;
165 /* A GDestroyNotify is not passed, so casting away the const is
166 * safe, as the key is never freed. */
167 g_hash_table_insert (encs, s, (gpointer)enc);
168 }
169 va_end (args);
170 }
171
172 /* stolen from gnome-desktop-item.c */
173 /* make a standard conversion table from the desktop standard spec */
174 void
Charset_Insert_Locales_Init(void)175 Charset_Insert_Locales_Init (void)
176 {
177 /* FIXME: Use g_hash_table_new_full. */
178 encodings = g_hash_table_new (g_str_hash, g_str_equal);
179
180 /* "C" is plain ascii */
181 insert_locales (encodings, "ASCII", "C", NULL);
182 #ifdef G_OS_WIN32
183 insert_locales (encodings, "windows-1256", "ar", NULL); // 2006.12.31 - For testing with Arabic
184 #else /* !G_OS_WIN32 */
185 insert_locales (encodings, "ISO-8859-6", "ar", NULL);
186 #endif /* !G_OS_WIN32 */
187 insert_locales (encodings, "ARMSCII-8", "by", NULL);
188 insert_locales (encodings, "BIG5", "zh_TW", NULL);
189 insert_locales (encodings, "CP1251", "be", "bg", NULL);
190 if (check_locale ("EUC-CN")) {
191 insert_locales (encodings, "EUC-CN", "zh_CN", NULL);
192 } else {
193 insert_locales (encodings, "GB2312", "zh_CN", NULL);
194 }
195 insert_locales (encodings, "EUC-JP", "ja", NULL);
196 insert_locales (encodings, "EUC-KR", "ko", NULL);
197 /*insert_locales (encodings, "GEORGIAN-ACADEMY", NULL);*/
198 insert_locales (encodings, "GEORGIAN-PS", "ka", NULL);
199 insert_locales (encodings, "ISO-8859-1", "br", "ca", "da", "de", "en", "es", "eu", "fi", "fr", "gl", "it", "nl", "wa", "nb", "nn", "pt", "pt", "sv", NULL);
200 #ifdef G_OS_WIN32
201 insert_locales (encodings, "windows-1250", "cs", "hr", "hu", "pl", "ro", "sk", "sl", "sq", "sr", NULL);
202 #else /* !G_OS_WIN32 */
203 insert_locales (encodings, "ISO-8859-2", "cs", "hr", "hu", "pl", "ro", "sk", "sl", "sq", "sr", NULL);
204 #endif /* !G_OS_WIN32 */
205 insert_locales (encodings, "ISO-8859-3", "eo", NULL);
206 insert_locales (encodings, "ISO-8859-5", "mk", "sp", NULL);
207 #ifdef G_OS_WIN32
208 insert_locales (encodings, "windows-1253", "el", NULL);
209 #else /* !G_OS_WIN32 */
210 insert_locales (encodings, "ISO-8859-7", "el", NULL);
211 #endif /* !G_OS_WIN32 */
212 #ifdef G_OS_WIN32
213 insert_locales (encodings, "windows-1254", "tr", NULL);
214 #else /* !G_OS_WIN32 */
215 insert_locales (encodings, "ISO-8859-9", "tr", NULL);
216 #endif /* !G_OS_WIN32 */
217 insert_locales (encodings, "ISO-8859-13", "lt", "lv", "mi", NULL);
218 insert_locales (encodings, "ISO-8859-14", "ga", "cy", NULL);
219 insert_locales (encodings, "ISO-8859-15", "et", NULL);
220 #ifdef G_OS_WIN32
221 insert_locales (encodings, "windows-1251", "ru", NULL);
222 #else /* !G_OS_WIN32 */
223 insert_locales (encodings, "KOI8-R", "ru", NULL);
224 #endif /* !G_OS_WIN32 */
225 insert_locales (encodings, "KOI8-U", "uk", NULL);
226 if (check_locale ("TCVN-5712")) {
227 insert_locales (encodings, "TCVN-5712", "vi", NULL);
228 } else {
229 insert_locales (encodings, "TCVN", "vi", NULL);
230 }
231 insert_locales (encodings, "TIS-620", "th", NULL);
232 #ifdef G_OS_WIN32
233 insert_locales (encodings, "windows-1255", "he", NULL);
234 #endif /* G_OS_WIN32 */
235 /*insert_locales (encodings, "VISCII", NULL);*/
236 }
237
238 void
Charset_Insert_Locales_Destroy(void)239 Charset_Insert_Locales_Destroy (void)
240 {
241 g_hash_table_destroy (encodings);
242 }
243
244 /*
245 * get_encoding_from_locale:
246 * @locale: a locale string, of the form
247 * language[_territory][.codeset][@modifer]
248 *
249 * Get the legacy (pre-Unicode) character encoding for the @locale, falling
250 * back to a hardcoded table if is not part of @locale, and as a last resort
251 * falling back to UTF-8.
252 *
253 * Returns: the legacy character encoding of @locale, or "UTF-8" on failure
254 */
255 const char *
get_encoding_from_locale(const char * locale)256 get_encoding_from_locale (const char *locale)
257 {
258 const char *encoding;
259 GStrv variants;
260 gsize i;
261
262 g_return_val_if_fail (locale != NULL, NULL);
263
264 /* Return early if the encoding is part of the locale. */
265 encoding = strchr (locale, '.');
266
267 if (encoding != NULL)
268 {
269 /* Ignore UTF-8 (and utf8). */
270 if ((strncmp (encoding, ".UTF-8", 6) != 0)
271 && (strncmp (encoding, ".utf8", 5) != 0))
272 {
273 const gchar *modifier;
274
275 modifier = strchr (encoding, '@');
276
277 if (modifier != NULL)
278 {
279 g_warning ("%s",
280 "Returning modifier in addition to character set");
281 }
282
283 return encoding;
284 }
285 }
286
287 /* Loop over variants of the locale, returning the first match. */
288 variants = g_get_locale_variants (locale);
289
290 for (i = 0; variants[i]; i++)
291 {
292 encoding = g_hash_table_lookup (encodings, variants[i]);
293
294 if (encoding != NULL)
295 {
296 g_strfreev (variants);
297 return encoding;
298 }
299 }
300
301 g_strfreev (variants);
302
303 return "UTF-8";
304 }
305
306 /*
 * Return the locale taken from the first of the LC_ALL, LC_CTYPE and LANG
 * environment variables that is set and not empty, or NULL if none is.
308 *
309 * http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap08.html#tag_08_02
310 *
311 * LANG
312 * This variable shall determine the locale category for native language,
313 * local customs, and coded character set in the absence of the LC_ALL and
314 * other LC_* ( LC_COLLATE , LC_CTYPE , LC_MESSAGES , LC_MONETARY , LC_NUMERIC ,
315 * LC_TIME ) environment variables. This can be used by applications to
316 * determine the language to use for error messages and instructions, collating
317 * sequences, date formats, and so on.
318 * LC_ALL
319 * This variable shall determine the values for all locale categories. The
320 * value of the LC_ALL environment variable has precedence over any of the
321 * other environment variables starting with LC_ ( LC_COLLATE , LC_CTYPE ,
322 * LC_MESSAGES , LC_MONETARY , LC_NUMERIC , LC_TIME ) and the LANG environment
323 * variable.
324 * LC_COLLATE
325 * This variable shall determine the locale category for character collation.
326 * It determines collation information for regular expressions and sorting,
327 * including equivalence classes and multi-character collating elements, in
328 * various utilities and the strcoll() and strxfrm() functions. Additional
329 * semantics of this variable, if any, are implementation-defined.
330 * LC_CTYPE
331 * This variable shall determine the locale category for character handling
332 * functions, such as tolower(), toupper(), and isalpha(). This environment
333 * variable determines the interpretation of sequences of bytes of text data
334 * as characters (for example, single as opposed to multi-byte characters),
335 * the classification of characters (for example, alpha, digit, graph), and
336 * the behavior of character classes. Additional semantics of this variable,
337 * if any, are implementation-defined.
338 * LC_MESSAGES
339 * This variable shall determine the locale category for processing affirmative
340 * and negative responses and the language and cultural conventions in which
341 * messages should be written. [XSI] [Option Start] It also affects the behavior
342 * of the catopen() function in determining the message catalog. [Option End]
343 * Additional semantics of this variable, if any, are implementation-defined.
344 * The language and cultural conventions of diagnostic and informative messages
345 * whose format is unspecified by IEEE Std 1003.1-2001 should be affected by
346 * the setting of LC_MESSAGES .
347 * LC_MONETARY
348 * This variable shall determine the locale category for monetary-related
349 * numeric formatting information. Additional semantics of this variable, if
350 * any, are implementation-defined.
351 * LC_NUMERIC
352 * This variable shall determine the locale category for numeric formatting
353 * (for example, thousands separator and radix character) information in
354 * various utilities as well as the formatted I/O operations in printf() and
355 * scanf() and the string conversion functions in strtod(). Additional semantics
356 * of this variable, if any, are implementation-defined.
357 * LC_TIME
358 * This variable shall determine the locale category for date and time formatting
359 * information. It affects the behavior of the time functions in strftime().
360 * Additional semantics of this variable, if any, are implementation-defined.
361 *
362 *
363 * The values of locale categories shall be determined by a precedence order; the
364 * first condition met below determines the value:
365 *
366 * 1. If the LC_ALL environment variable is defined and is not null, the value
367 * of LC_ALL shall be used.
368 * 2. If the LC_* environment variable ( LC_COLLATE , LC_CTYPE , LC_MESSAGES ,
369 * LC_MONETARY , LC_NUMERIC , LC_TIME ) is defined and is not null, the value
370 * of the environment variable shall be used to initialize the category that
371 * corresponds to the environment variable.
372 * 3. If the LANG environment variable is defined and is not null, the value of
373 * the LANG environment variable shall be used.
374 * 4. If the LANG environment variable is not set or is set to the empty string,
375 * the implementation-defined default locale shall be used.
376 *
377 */
get_locale(void)378 const gchar *get_locale (void)
379 {
380 const gchar *loc;
381
382 if ((loc = g_getenv("LC_ALL")) && *loc)
383 return loc;
384
385 else if ((loc = g_getenv("LC_CTYPE")) && *loc)
386 return loc;
387
388 else if ((loc = g_getenv("LANG")) && *loc)
389 return loc;
390
391 else
392 return NULL;
393 }
394
395
396
397 /*
398 * convert_string : (don't use with UTF-16 strings)
399 * - display_error : if TRUE, may return an escaped string and display an error
400 * message (if conversion fails).
401 */
convert_string(const gchar * string,const gchar * from_codeset,const gchar * to_codeset,const gboolean display_error)402 gchar *convert_string (const gchar *string, const gchar *from_codeset,
403 const gchar *to_codeset, const gboolean display_error)
404 {
405 return convert_string_1(string, -1, from_codeset, to_codeset, display_error);
406 }
407
408 /* Length must be passed, as the string might be Unicode, in which case we can't
409 * count zeroes (see strlen call below). */
410 gchar *
convert_string_1(const gchar * string,gssize length,const gchar * from_codeset,const gchar * to_codeset,const gboolean display_error)411 convert_string_1 (const gchar *string, gssize length, const gchar *from_codeset,
412 const gchar *to_codeset, const gboolean display_error)
413 {
414 gchar *output;
415 GError *error = NULL;
416 gsize bytes_written;
417
418 g_return_val_if_fail (string != NULL, NULL);
419
420 output = g_convert(string, length, to_codeset, from_codeset, NULL, &bytes_written, &error);
421 //output = g_convert_with_fallback(string, length, to_codeset, from_codeset, "?", NULL, &bytes_written, &error);
422
423 if (output == NULL)
424 {
425 gchar *escaped_str = g_strescape(string, NULL);
426 if (display_error)
427 {
428 Log_Print(LOG_ERROR,"convert_string(): Failed conversion from charset '%s' to '%s'. "
429 "String '%s'. Errcode %d (%s).",
430 from_codeset, to_codeset, escaped_str, error->code, error->message);
431 }
432 g_free(escaped_str);
433 g_error_free(error);
434 // Return the input string without converting it. If the string is
435 // displayed in the UI, it must be in UTF-8!
436 if ( (g_ascii_strcasecmp(to_codeset, "UTF-8"))
437 || (g_utf8_validate(string, -1, NULL)) )
438 {
439 return g_strdup(string);
440 }
441 }else
442 {
443 // Patch from Alexey Illarionov:
444 // g_convert returns null-terminated string only with one \0 at the
445 // end. It can cause some garbage at the end of a string for UTF-16.
446 // The second \0 should be set manually.
447 gchar *new_output;
448 new_output = g_realloc (output, bytes_written + 2);
449 if (new_output != NULL)
450 {
451 output = new_output;
452 output[bytes_written] = output[bytes_written + 1] = 0;
453 }
454 }
455
456 //g_print("from %s => len: %d, string: '%s'\n (%x %x %x %x %x %x %x %x)\n",from_codeset,length,string,string[0],string[1],string[2],string[3],string[4],string[5],string[6],string[7]);
457 //g_print("to %s => len: %d, output: '%s'\n (%x %x %x %x %x %x %x %x)\n\n",to_codeset,bytes_written+2,output,output[0],output[1],output[2],output[3],output[4],output[5],output[6],output[7]);
458
459 return output;
460 }
461
462 /*
463 * filename_from_display:
464 * @string: a UTF-8 string
465 *
466 * Convert a string from UTF-8 to the filesystem encoding.
467 *
468 * Returns: a newly-allocated filename in the GLib filename encoding on
469 * success, or an escaped ASCII string on error
470 */
471 gchar *
filename_from_display(const gchar * string)472 filename_from_display (const gchar *string)
473 {
474 GError *error = NULL;
475 gchar *ret = NULL;
476 const gchar **filename_encodings;
477
478 g_return_val_if_fail (string != NULL, NULL);
479 g_return_val_if_fail (g_utf8_validate (string, -1, NULL), NULL);
480
481 ret = g_filename_from_utf8 (string, -1, NULL, NULL, &error);
482
483 if (!ret)
484 {
485 g_debug ("Error while converting filename from display to GLib encoding: %s",
486 error->message);
487 g_clear_error (&error);
488 }
489 else
490 {
491 return ret;
492 }
493
494 /* If the target encoding is not UTF-8, try the user-chosen alternative. */
495 if (!g_get_filename_charsets (&filename_encodings))
496 {
497 EtRenameEncoding enc_option = g_settings_get_enum (MainSettings,
498 "rename-encoding");
499
500 switch (enc_option)
501 {
502 case ET_RENAME_ENCODING_TRY_ALTERNATIVE:
503 /* Already called g_filename_from_utf8(). */
504 break;
505 case ET_RENAME_ENCODING_TRANSLITERATE:
506 {
507 /* iconv_open (3):
508 * When the string "//TRANSLIT" is appended to tocode,
509 * transliteration is activated. This means that when a
510 * character cannot be represented in the target character set,
511 * it can be approximated through one or several similarly
512 * looking characters.
513 */
514 /* TODO: Use g_str_to_ascii() in GLib 2.40. */
515 gchar *enc = g_strconcat (*filename_encodings, "//TRANSLIT", NULL);
516 ret = g_convert (string, -1, enc, "UTF-8", NULL, NULL, &error);
517
518 if (!ret)
519 {
520 g_debug ("Error while converting filename from display to transliterated encoding '%s': %s",
521 enc, error->message);
522 g_clear_error (&error);
523 }
524
525 g_free (enc);
526 break;
527 }
528 case ET_RENAME_ENCODING_IGNORE:
529 {
530 /* iconv_open (3):
531 * When the string "//IGNORE" is appended to tocode, characters
532 * that cannot be represented in the target character set will
533 * be silently discarded.
534 */
535 gchar *enc = g_strconcat (*filename_encodings, "//IGNORE", NULL);
536 ret = g_convert (string, -1, enc, "UTF-8", NULL, NULL, &error);
537
538 if (!ret)
539 {
540 g_debug ("Error while converting filename from display to encoding with ignored failures '%s': %s",
541 enc, error->message);
542 g_clear_error (&error);
543 }
544
545 g_free (enc);
546 break;
547 }
548 default:
549 g_assert_not_reached ();
550 }
551 }
552
553 /* Try alternative encodings. */
554 if (!ret)
555 {
556 const gchar *legacy_encoding;
557
558 /* Guess the legacy (pre-Unicode) filesystem encoding from the locale.
559 * For example, fr_FR.UTF-8 => fr_FR => ISO-8859-1. */
560 legacy_encoding = get_encoding_from_locale (get_locale ());
561 ret = g_convert (string, -1, legacy_encoding, "UTF-8", NULL, NULL,
562 &error);
563
564 if (!ret)
565 {
566 g_debug ("Error while converting filename from display to legacy encoding '%s': %s",
567 legacy_encoding, error->message);
568 g_clear_error (&error);
569 }
570 }
571
572 if (!ret)
573 {
574 /* Failing that, try ISO-8859-1. */
575 ret = g_convert (string, -1, "ISO-8859-1", "UTF-8", NULL, NULL,
576 &error);
577
578 if (!ret)
579 {
580 g_debug ("Error while converting filename from display to ISO-8859-1: %s",
581 error->message);
582 g_clear_error (&error);
583 }
584 }
585
586 /* If all conversions fail, return an escaped version of the supplied UTF-8
587 * string. */
588 if (!ret)
589 {
590 gchar *escaped_str = g_strescape (string, NULL);
591
592 /* TODO: Improve error string. */
593 Log_Print (LOG_ERROR,
594 _("The UTF-8 string ‘%s’ could not be converted into filename encoding: %s"),
595 string, _("Invalid UTF-8"));
596
597 ret = escaped_str;
598 }
599
600 return ret;
601 }
602
603 /*
604 * Try_To_Validate_Utf8_String:
605 * @string: a string in unknown encoding
606 *
607 * Validate that @string is in UTF-8 encoding, or try to convert it to be so.
608 * Several alternative encodings are attempted, based on the current locale and
609 * some hardcoded fallbacks, before falling back to escaping the string.
610 *
611 * Returns: a newly-allocated UTF-8 encoded string
612 */
613 gchar *
Try_To_Validate_Utf8_String(const gchar * string)614 Try_To_Validate_Utf8_String (const gchar *string)
615 {
616 gchar *ret = NULL;
617 GError *error = NULL;
618
619 g_return_val_if_fail (string != NULL, NULL);
620
621 if (g_utf8_validate (string, -1, NULL))
622 {
623 /* String already in UTF-8. */
624 ret = g_strdup (string);
625 }
626 else
627 {
628 const gchar *legacy_encoding;
629
630 /* Guess the legacy (pre-Unicode) encoding associated with the locale.
631 * For example, fr_FR.UTF-8 => fr_FR => ISO-8859-1. */
632 legacy_encoding = get_encoding_from_locale (get_locale ());
633 ret = g_convert (string, -1, "UTF-8", legacy_encoding, NULL, NULL,
634 &error);
635
636 if (!ret)
637 {
638 /* Failing that, try ISO-8859-1. */
639 g_debug ("Error converting string to legacy encoding '%s': %s",
640 legacy_encoding, error->message);
641 g_clear_error (&error);
642 ret = g_convert (string, -1, "UTF-8", "ISO-8859-1", NULL, NULL,
643 &error);
644 }
645
646 if (!ret)
647 {
648 gchar *escaped_str = g_strescape (string, NULL);
649
650 /* TODO: Improve error string. */
651 Log_Print (LOG_ERROR,
652 _("The string ‘%s’ could not be converted into UTF-8: %s"),
653 escaped_str, error->message);
654 g_clear_error (&error);
655
656 ret = escaped_str;
657 }
658 }
659
660 return ret;
661 }
662
663 void
Charset_Populate_Combobox(GtkComboBox * combo,gint select_charset)664 Charset_Populate_Combobox (GtkComboBox *combo, gint select_charset)
665 {
666 gsize i;
667
668 for (i = 0; i < CHARSET_TRANS_ARRAY_LEN; i++)
669 {
670 gtk_combo_box_text_append_text (GTK_COMBO_BOX_TEXT (combo),
671 _(charset_trans_array[i].charset_title));
672
673 }
674
675 gtk_combo_box_set_active (combo, select_charset);
676 }
677
678 const gchar *
et_charset_get_name_from_index(guint index)679 et_charset_get_name_from_index (guint index)
680 {
681 g_return_val_if_fail (index <= CHARSET_TRANS_ARRAY_LEN, NULL);
682
683 return charset_trans_array[index].charset_name;
684 }
685