1 /*
2  * Main part of code, written by:
3  *
4  * Copyright (C) 1999-2001  Håvard Kvålen <havardk@xmms.org>
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19  * 02110-1301, USA.
20  *
21  */
22 
23 #include "config.h"
24 
25 #include "charset.h"
26 
27 #include <stdlib.h>
28 #include <glib/gi18n.h>
29 
30 #ifdef HAVE_LANGINFO_CODESET
31 #include <langinfo.h>
32 #endif
33 
34 #include "setting.h"
35 #include "log.h"
36 #include "win32/win32dep.h"
37 
/* One selectable character set: a title for display paired with its name. */
typedef struct
{
    /* Title marked for translation with N_() in the table below; it is
     * translated with _() when the combobox is populated. */
    const gchar *charset_title;
    /* Charset name, as returned by et_charset_get_name_from_index(). */
    const gchar *charset_name;
} CharsetInfo;
43 
/* Number of entries in charset_trans_array. The macro is only expanded at its
 * points of use, so it may precede the definition of the array. */
#define CHARSET_TRANS_ARRAY_LEN ( sizeof(charset_trans_array) / sizeof((charset_trans_array)[0]) )
/*
 * Table of the character sets offered to the user.
 *
 * Charset_Populate_Combobox() appends the titles in array order and
 * et_charset_get_name_from_index() indexes this array directly, so the
 * position of an entry is also its combobox index. Do not reorder or remove
 * entries without checking how callers store that index.
 */
static const CharsetInfo charset_trans_array[] = {
    {N_("Arabic (IBM-864)"),                  "IBM864"        },
    {N_("Arabic (ISO-8859-6)"),               "ISO-8859-6"    },
    {N_("Arabic (Windows-1256)"),             "windows-1256"  },
    {N_("Baltic (ISO-8859-13)"),              "ISO-8859-13"   },
    {N_("Baltic (ISO-8859-4)"),               "ISO-8859-4"    },
    {N_("Baltic (Windows-1257)"),             "windows-1257"  },
    {N_("Celtic (ISO-8859-14)"),              "ISO-8859-14"   },
    {N_("Central European (IBM-852)"),        "IBM852"        },
    {N_("Central European (ISO-8859-2)"),     "ISO-8859-2"    },
    {N_("Central European (Windows-1250)"),   "windows-1250"  },
    {N_("Chinese Simplified (GB18030)"),      "gb18030"       },
    {N_("Chinese Simplified (GB2312)"),       "GB2312"        },
    {N_("Chinese Traditional (Big5)"),        "Big5"          },
    {N_("Chinese Traditional (Big5-HKSCS)"),  "Big5-HKSCS"    },
    {N_("Cyrillic (IBM-855)"),                "IBM855"        },
    {N_("Cyrillic (ISO-8859-5)"),             "ISO-8859-5"    },
    {N_("Cyrillic (ISO-IR-111)"),             "ISO-IR-111"    },
    {N_("Cyrillic (KOI8-R)"),                 "KOI8-R"        },
    {N_("Cyrillic (Windows-1251)"),           "windows-1251"  },
    {N_("Cyrillic/Russian (CP-866)"),         "IBM866"        },
    {N_("Cyrillic/Ukrainian (KOI8-U)"),       "KOI8-U"        },
    {N_("English (US-ASCII)"),                "us-ascii"      },
    {N_("Greek (ISO-8859-7)"),                "ISO-8859-7"    },
    {N_("Greek (Windows-1253)"),              "windows-1253"  },
    {N_("Hebrew (IBM-862)"),                  "IBM862"        },
    {N_("Hebrew (Windows-1255)"),             "windows-1255"  },
    {N_("Japanese (EUC-JP)"),                 "EUC-JP"        },
    {N_("Japanese (ISO-2022-JP)"),            "ISO-2022-JP"   },
    {N_("Japanese (Shift_JIS)"),              "Shift_JIS"     },
    {N_("Korean (EUC-KR)"),                   "EUC-KR"        },
    {N_("Nordic (ISO-8859-10)"),              "ISO-8859-10"   },
    {N_("South European (ISO-8859-3)"),       "ISO-8859-3"    },
    {N_("Thai (TIS-620)"),                    "TIS-620"       },
    {N_("Turkish (IBM-857)"),                 "IBM857"        },
    {N_("Turkish (ISO-8859-9)"),              "ISO-8859-9"    },
    {N_("Turkish (Windows-1254)"),            "windows-1254"  },
    //{N_("Unicode (UTF-7)"),                   "UTF-7"         },
    {N_("Unicode (UTF-8)"),                   "UTF-8"         },

    /* The UTF-16 and UTF-32 entries are disabled. */
    //{N_("Unicode (UTF-16BE)"),                "UTF-16BE"      },
    //{N_("Unicode (UTF-16LE)"),                "UTF-16LE"      },
    //{N_("Unicode (UTF-32BE)"),                "UTF-32BE"      },
    //{N_("Unicode (UTF-32LE)"),                "UTF-32LE"      },

    {N_("Vietnamese (VISCII)"),               "VISCII"        },
    {N_("Vietnamese (Windows-1258)"),         "windows-1258"  },
    {N_("Visual Hebrew (ISO-8859-8)"),        "ISO-8859-8"    },
    {N_("Western (IBM-850)"),                 "IBM850"        },
    {N_("Western (ISO-8859-1)"),              "ISO-8859-1"    },
    {N_("Western (ISO-8859-15)"),             "ISO-8859-15"   }, 
    {N_("Western (Windows-1252)"),            "windows-1252"  }

    /*
     * From this point, character sets aren't supported by iconv. The entries
     * are kept, commented out, for reference only.
     */
/*    {N_("Arabic (IBM-864-I)"),                "IBM864i"              },
    {N_("Arabic (ISO-8859-6-E)"),             "ISO-8859-6-E"         },
    {N_("Arabic (ISO-8859-6-I)"),             "ISO-8859-6-I"         },
    {N_("Arabic (MacArabic)"),                "x-mac-arabic"         },
    {N_("Armenian (ARMSCII-8)"),              "armscii-8"            },
    {N_("Central European (MacCE)"),          "x-mac-ce"             },
    {N_("Chinese Simplified (GBK)"),          "x-gbk"                },
    {N_("Chinese Simplified (HZ)"),           "HZ-GB-2312"           },
    {N_("Chinese Traditional (EUC-TW)"),      "x-euc-tw"             },
    {N_("Croatian (MacCroatian)"),            "x-mac-croatian"       },
    {N_("Cyrillic (MacCyrillic)"),            "x-mac-cyrillic"       },
    {N_("Cyrillic/Ukrainian (MacUkrainian)"), "x-mac-ukrainian"      },
    {N_("Farsi (MacFarsi)"),                  "x-mac-farsi"},
    {N_("Greek (MacGreek)"),                  "x-mac-greek"          },
    {N_("Gujarati (MacGujarati)"),            "x-mac-gujarati"       },
    {N_("Gurmukhi (MacGurmukhi)"),            "x-mac-gurmukhi"       },
    {N_("Hebrew (ISO-8859-8-E)"),             "ISO-8859-8-E"         },
    {N_("Hebrew (ISO-8859-8-I)"),             "ISO-8859-8-I"         },
    {N_("Hebrew (MacHebrew)"),                "x-mac-hebrew"         },
    {N_("Hindi (MacDevanagari)"),             "x-mac-devanagari"     },
    {N_("Icelandic (MacIcelandic)"),          "x-mac-icelandic"      },
    {N_("Korean (JOHAB)"),                    "x-johab"              },
    {N_("Korean (UHC)"),                      "x-windows-949"        },
    {N_("Romanian (MacRomanian)"),            "x-mac-romanian"       },
    {N_("Turkish (MacTurkish)"),              "x-mac-turkish"        },
    {N_("User Defined"),                      "x-user-defined"       },
    {N_("Vietnamese (TCVN)"),                 "x-viet-tcvn5712"      },
    {N_("Vietnamese (VPS)"),                  "x-viet-vps"           },
    {N_("Western (MacRoman)"),                "x-mac-roman"          },
    // charsets without possibly translatable names
    {"T61.8bit",                              "T61.8bit"             },
    {"x-imap4-modified-utf7",                 "x-imap4-modified-utf7"},
    {"x-u-escaped",                           "x-u-escaped"          },
    {"windows-936",                           "windows-936"          }
*/
};
137 
/* Maps locale names (keys) to legacy encoding names (values). Created by
 * Charset_Insert_Locales_Init(), released by Charset_Insert_Locales_Destroy()
 * and consulted by get_encoding_from_locale(). */
static GHashTable *encodings;
139 
140 
141 /* stolen from gnome-desktop-item.c */
142 static gboolean
check_locale(const char * locale)143 check_locale (const char *locale)
144 {
145     GIConv cd = g_iconv_open ("UTF-8", locale);
146     if ((GIConv)-1 == cd)
147         return FALSE;
148     g_iconv_close (cd);
149     return TRUE;
150 }
151 
152 /* stolen from gnome-desktop-item.c */
153 G_GNUC_NULL_TERMINATED static void
insert_locales(GHashTable * encs,const gchar * enc,...)154 insert_locales (GHashTable *encs, const gchar *enc, ...)
155 {
156     va_list args;
157     char *s;
158 
159     va_start (args, enc);
160     for (;;)
161     {
162         s = va_arg (args, char *);
163         if (s == NULL)
164             break;
165         /* A GDestroyNotify is not passed, so casting away the const is
166          * safe, as the key is never freed. */
167         g_hash_table_insert (encs, s, (gpointer)enc);
168     }
169     va_end (args);
170 }
171 
172 /* stolen from gnome-desktop-item.c */
173 /* make a standard conversion table from the desktop standard spec */
174 void
Charset_Insert_Locales_Init(void)175 Charset_Insert_Locales_Init (void)
176 {
177     /* FIXME: Use g_hash_table_new_full. */
178     encodings = g_hash_table_new (g_str_hash, g_str_equal);
179 
180     /* "C" is plain ascii */
181     insert_locales (encodings, "ASCII", "C", NULL);
182 #ifdef G_OS_WIN32
183     insert_locales (encodings, "windows-1256", "ar", NULL); // 2006.12.31 - For testing with Arabic
184 #else /* !G_OS_WIN32 */
185     insert_locales (encodings, "ISO-8859-6", "ar", NULL);
186 #endif /* !G_OS_WIN32 */
187     insert_locales (encodings, "ARMSCII-8", "by", NULL);
188     insert_locales (encodings, "BIG5", "zh_TW", NULL);
189     insert_locales (encodings, "CP1251", "be", "bg", NULL);
190     if (check_locale ("EUC-CN")) {
191         insert_locales (encodings, "EUC-CN", "zh_CN", NULL);
192     } else {
193         insert_locales (encodings, "GB2312", "zh_CN", NULL);
194     }
195     insert_locales (encodings, "EUC-JP", "ja", NULL);
196     insert_locales (encodings, "EUC-KR", "ko", NULL);
197     /*insert_locales (encodings, "GEORGIAN-ACADEMY", NULL);*/
198     insert_locales (encodings, "GEORGIAN-PS", "ka", NULL);
199     insert_locales (encodings, "ISO-8859-1", "br", "ca", "da", "de", "en", "es", "eu", "fi", "fr", "gl", "it", "nl", "wa", "nb", "nn", "pt", "pt", "sv", NULL);
200 #ifdef G_OS_WIN32
201     insert_locales (encodings, "windows-1250", "cs", "hr", "hu", "pl", "ro", "sk", "sl", "sq", "sr", NULL);
202 #else /* !G_OS_WIN32 */
203     insert_locales (encodings, "ISO-8859-2", "cs", "hr", "hu", "pl", "ro", "sk", "sl", "sq", "sr", NULL);
204 #endif /* !G_OS_WIN32 */
205     insert_locales (encodings, "ISO-8859-3", "eo", NULL);
206     insert_locales (encodings, "ISO-8859-5", "mk", "sp", NULL);
207 #ifdef G_OS_WIN32
208     insert_locales (encodings, "windows-1253", "el", NULL);
209 #else /* !G_OS_WIN32 */
210     insert_locales (encodings, "ISO-8859-7", "el", NULL);
211 #endif /* !G_OS_WIN32 */
212 #ifdef G_OS_WIN32
213     insert_locales (encodings, "windows-1254", "tr", NULL);
214 #else /* !G_OS_WIN32 */
215     insert_locales (encodings, "ISO-8859-9", "tr", NULL);
216 #endif /* !G_OS_WIN32 */
217     insert_locales (encodings, "ISO-8859-13", "lt", "lv", "mi", NULL);
218     insert_locales (encodings, "ISO-8859-14", "ga", "cy", NULL);
219     insert_locales (encodings, "ISO-8859-15", "et", NULL);
220 #ifdef G_OS_WIN32
221     insert_locales (encodings, "windows-1251", "ru", NULL);
222 #else /* !G_OS_WIN32 */
223     insert_locales (encodings, "KOI8-R", "ru", NULL);
224 #endif /* !G_OS_WIN32 */
225     insert_locales (encodings, "KOI8-U", "uk", NULL);
226     if (check_locale ("TCVN-5712")) {
227         insert_locales (encodings, "TCVN-5712", "vi", NULL);
228     } else {
229         insert_locales (encodings, "TCVN", "vi", NULL);
230     }
231     insert_locales (encodings, "TIS-620", "th", NULL);
232 #ifdef G_OS_WIN32
233     insert_locales (encodings, "windows-1255", "he", NULL);
234 #endif /* G_OS_WIN32 */
235     /*insert_locales (encodings, "VISCII", NULL);*/
236 }
237 
238 void
Charset_Insert_Locales_Destroy(void)239 Charset_Insert_Locales_Destroy (void)
240 {
241     g_hash_table_destroy (encodings);
242 }
243 
244 /*
245  * get_encoding_from_locale:
246  * @locale: a locale string, of the form
247  *          language[_territory][.codeset][@modifer]
248  *
249  * Get the legacy (pre-Unicode) character encoding for the @locale, falling
250  * back to a hardcoded table if is not part of @locale, and as a last resort
251  * falling back to UTF-8.
252  *
253  * Returns: the legacy character encoding of @locale, or "UTF-8" on failure
254  */
255 const char *
get_encoding_from_locale(const char * locale)256 get_encoding_from_locale (const char *locale)
257 {
258     const char *encoding;
259     GStrv variants;
260     gsize i;
261 
262     g_return_val_if_fail (locale != NULL, NULL);
263 
264     /* Return early if the encoding is part of the locale. */
265     encoding = strchr (locale, '.');
266 
267     if (encoding != NULL)
268     {
269         /* Ignore UTF-8 (and utf8). */
270         if ((strncmp (encoding, ".UTF-8", 6) != 0)
271             && (strncmp (encoding, ".utf8", 5) != 0))
272         {
273             const gchar *modifier;
274 
275             modifier = strchr (encoding, '@');
276 
277             if (modifier != NULL)
278             {
279                 g_warning ("%s",
280                            "Returning modifier in addition to character set");
281             }
282 
283             return encoding;
284         }
285     }
286 
287     /* Loop over variants of the locale, returning the first match. */
288     variants = g_get_locale_variants (locale);
289 
290     for (i = 0; variants[i]; i++)
291     {
292         encoding = g_hash_table_lookup (encodings, variants[i]);
293 
294         if (encoding != NULL)
295         {
296             g_strfreev (variants);
297             return encoding;
298         }
299     }
300 
301     g_strfreev (variants);
302 
303     return "UTF-8";
304 }
305 
306 /*
 * Return the locale from the first of LC_ALL, LC_CTYPE and LANG that is set
 * and not empty, or NULL if none of them is.
308  *
309  * http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap08.html#tag_08_02
310  *
311  * LANG
312  *     This variable shall determine the locale category for native language,
313  *     local customs, and coded character set in the absence of the LC_ALL and
314  *     other LC_* ( LC_COLLATE , LC_CTYPE , LC_MESSAGES , LC_MONETARY , LC_NUMERIC ,
315  *     LC_TIME ) environment variables. This can be used by applications to
316  *     determine the language to use for error messages and instructions, collating
317  *     sequences, date formats, and so on.
318  * LC_ALL
319  *     This variable shall determine the values for all locale categories. The
320  *     value of the LC_ALL environment variable has precedence over any of the
321  *     other environment variables starting with LC_ ( LC_COLLATE , LC_CTYPE ,
322  *     LC_MESSAGES , LC_MONETARY , LC_NUMERIC , LC_TIME ) and the LANG environment
323  *     variable.
324  * LC_COLLATE
325  *     This variable shall determine the locale category for character collation.
326  *     It determines collation information for regular expressions and sorting,
327  *     including equivalence classes and multi-character collating elements, in
328  *     various utilities and the strcoll() and strxfrm() functions. Additional
329  *     semantics of this variable, if any, are implementation-defined.
330  * LC_CTYPE
331  *     This variable shall determine the locale category for character handling
332  *     functions, such as tolower(), toupper(), and isalpha(). This environment
333  *     variable determines the interpretation of sequences of bytes of text data
334  *     as characters (for example, single as opposed to multi-byte characters),
335  *     the classification of characters (for example, alpha, digit, graph), and
336  *     the behavior of character classes. Additional semantics of this variable,
337  *    if any, are implementation-defined.
338  * LC_MESSAGES
339  *     This variable shall determine the locale category for processing affirmative
340  *     and negative responses and the language and cultural conventions in which
341  *     messages should be written. [XSI] [Option Start]  It also affects the behavior
342  *     of the catopen() function in determining the message catalog. [Option End]
343  *     Additional semantics of this variable, if any, are implementation-defined.
344  *     The language and cultural conventions of diagnostic and informative messages
345  *     whose format is unspecified by IEEE Std 1003.1-2001 should be affected by
346  *     the setting of LC_MESSAGES .
347  * LC_MONETARY
348  *     This variable shall determine the locale category for monetary-related
349  *     numeric formatting information. Additional semantics of this variable, if
350  *     any, are implementation-defined.
351  * LC_NUMERIC
352  *     This variable shall determine the locale category for numeric formatting
353  *     (for example, thousands separator and radix character) information in
354  *     various utilities as well as the formatted I/O operations in printf() and
355  *     scanf() and the string conversion functions in strtod(). Additional semantics
356  *     of this variable, if any, are implementation-defined.
357  * LC_TIME
358  *     This variable shall determine the locale category for date and time formatting
359  *     information. It affects the behavior of the time functions in strftime().
360  *     Additional semantics of this variable, if any, are implementation-defined.
361  *
362  *
363  * The values of locale categories shall be determined by a precedence order; the
364  * first condition met below determines the value:
365  *
366  *    1. If the LC_ALL environment variable is defined and is not null, the value
367  *       of LC_ALL shall be used.
368  *    2. If the LC_* environment variable ( LC_COLLATE , LC_CTYPE , LC_MESSAGES ,
369  *       LC_MONETARY , LC_NUMERIC , LC_TIME ) is defined and is not null, the value
370  *       of the environment variable shall be used to initialize the category that
371  *       corresponds to the environment variable.
372  *    3. If the LANG environment variable is defined and is not null, the value of
373  *       the LANG environment variable shall be used.
374  *    4. If the LANG environment variable is not set or is set to the empty string,
375  *       the implementation-defined default locale shall be used.
376  *
377  */
get_locale(void)378 const gchar *get_locale (void)
379 {
380     const gchar *loc;
381 
382     if ((loc = g_getenv("LC_ALL")) && *loc)
383         return loc;
384 
385     else if ((loc = g_getenv("LC_CTYPE")) && *loc)
386         return loc;
387 
388     else if ((loc = g_getenv("LANG")) && *loc)
389         return loc;
390 
391      else
392          return NULL;
393 }
394 
395 
396 
397 /*
398  * convert_string : (don't use with UTF-16 strings)
399  *  - display_error : if TRUE, may return an escaped string and display an error
400  *                    message (if conversion fails).
401  */
convert_string(const gchar * string,const gchar * from_codeset,const gchar * to_codeset,const gboolean display_error)402 gchar *convert_string (const gchar *string, const gchar *from_codeset,
403                        const gchar *to_codeset, const gboolean display_error)
404 {
405     return convert_string_1(string, -1, from_codeset, to_codeset, display_error);
406 }
407 
408 /* Length must be passed, as the string might be Unicode, in which case we can't
409  * count zeroes (see strlen call below). */
410 gchar *
convert_string_1(const gchar * string,gssize length,const gchar * from_codeset,const gchar * to_codeset,const gboolean display_error)411 convert_string_1 (const gchar *string, gssize length, const gchar *from_codeset,
412                          const gchar *to_codeset, const gboolean display_error)
413 {
414     gchar *output;
415     GError *error = NULL;
416     gsize bytes_written;
417 
418     g_return_val_if_fail (string != NULL, NULL);
419 
420     output = g_convert(string, length, to_codeset, from_codeset, NULL, &bytes_written, &error);
421     //output = g_convert_with_fallback(string, length, to_codeset, from_codeset, "?", NULL, &bytes_written, &error);
422 
423     if (output == NULL)
424     {
425         gchar *escaped_str = g_strescape(string, NULL);
426         if (display_error)
427         {
428             Log_Print(LOG_ERROR,"convert_string(): Failed conversion from charset '%s' to '%s'. "
429                       "String '%s'. Errcode %d (%s).",
430                       from_codeset, to_codeset, escaped_str, error->code, error->message);
431         }
432         g_free(escaped_str);
433         g_error_free(error);
434         // Return the input string without converting it. If the string is
435         // displayed in the UI, it must be in UTF-8!
436         if ( (g_ascii_strcasecmp(to_codeset, "UTF-8"))
437         ||   (g_utf8_validate(string, -1, NULL)) )
438         {
439             return g_strdup(string);
440         }
441     }else
442     {
443         // Patch from Alexey Illarionov:
444         //    g_convert returns null-terminated string only with one \0 at the
445         // end. It can cause some garbage at the end of a string for UTF-16.
446         // The second \0 should be set manually.
447         gchar *new_output;
448         new_output = g_realloc (output, bytes_written + 2);
449         if (new_output != NULL)
450         {
451             output = new_output;
452             output[bytes_written] = output[bytes_written + 1] = 0;
453         }
454     }
455 
456     //g_print("from %s => len: %d, string: '%s'\n     (%x %x %x %x %x %x %x %x)\n",from_codeset,length,string,string[0],string[1],string[2],string[3],string[4],string[5],string[6],string[7]);
457     //g_print("to   %s => len: %d, output: '%s'\n     (%x %x %x %x %x %x %x %x)\n\n",to_codeset,bytes_written+2,output,output[0],output[1],output[2],output[3],output[4],output[5],output[6],output[7]);
458 
459     return output;
460 }
461 
462 /*
463  * filename_from_display:
464  * @string: a UTF-8 string
465  *
466  * Convert a string from UTF-8 to the filesystem encoding.
467  *
468  * Returns: a newly-allocated filename in the GLib filename encoding on
469  *          success, or an escaped ASCII string on error
470  */
471 gchar *
filename_from_display(const gchar * string)472 filename_from_display (const gchar *string)
473 {
474     GError *error = NULL;
475     gchar *ret = NULL;
476     const gchar **filename_encodings;
477 
478     g_return_val_if_fail (string != NULL, NULL);
479     g_return_val_if_fail (g_utf8_validate (string, -1, NULL), NULL);
480 
481     ret = g_filename_from_utf8 (string, -1, NULL, NULL, &error);
482 
483     if (!ret)
484     {
485         g_debug ("Error while converting filename from display to GLib encoding: %s",
486                  error->message);
487         g_clear_error (&error);
488     }
489     else
490     {
491         return ret;
492     }
493 
494     /* If the target encoding is not UTF-8, try the user-chosen alternative. */
495     if (!g_get_filename_charsets (&filename_encodings))
496     {
497         EtRenameEncoding enc_option = g_settings_get_enum (MainSettings,
498                                                            "rename-encoding");
499 
500         switch (enc_option)
501         {
502             case ET_RENAME_ENCODING_TRY_ALTERNATIVE:
503                 /* Already called g_filename_from_utf8(). */
504                 break;
505             case ET_RENAME_ENCODING_TRANSLITERATE:
506             {
507                 /* iconv_open (3):
508                  * When the string "//TRANSLIT" is appended to tocode,
509                  * transliteration is activated. This means that when a
510                  * character cannot be represented in the target character set,
511                  * it can be approximated through one or several similarly
512                  * looking characters.
513                  */
514                 /* TODO: Use g_str_to_ascii() in GLib 2.40. */
515                 gchar *enc = g_strconcat (*filename_encodings, "//TRANSLIT", NULL);
516                 ret = g_convert (string, -1, enc, "UTF-8", NULL, NULL, &error);
517 
518                 if (!ret)
519                 {
520                     g_debug ("Error while converting filename from display to transliterated encoding '%s': %s",
521                              enc, error->message);
522                     g_clear_error (&error);
523                 }
524 
525                 g_free (enc);
526                 break;
527             }
528             case ET_RENAME_ENCODING_IGNORE:
529             {
530                 /* iconv_open (3):
531                  * When the string "//IGNORE" is appended to tocode, characters
532                  * that cannot be represented in the target character set will
533                  * be silently discarded.
534                  */
535                 gchar *enc = g_strconcat (*filename_encodings, "//IGNORE", NULL);
536                 ret = g_convert (string, -1, enc, "UTF-8", NULL, NULL, &error);
537 
538                 if (!ret)
539                 {
540                     g_debug ("Error while converting filename from display to encoding with ignored failures '%s': %s",
541                              enc, error->message);
542                     g_clear_error (&error);
543                 }
544 
545                 g_free (enc);
546                 break;
547             }
548             default:
549                 g_assert_not_reached ();
550         }
551     }
552 
553     /* Try alternative encodings. */
554     if (!ret)
555     {
556         const gchar *legacy_encoding;
557 
558         /* Guess the legacy (pre-Unicode) filesystem encoding from the locale.
559          * For example, fr_FR.UTF-8 => fr_FR => ISO-8859-1. */
560         legacy_encoding = get_encoding_from_locale (get_locale ());
561         ret = g_convert (string, -1, legacy_encoding, "UTF-8", NULL, NULL,
562                          &error);
563 
564         if (!ret)
565         {
566             g_debug ("Error while converting filename from display to legacy encoding '%s': %s",
567                      legacy_encoding, error->message);
568             g_clear_error (&error);
569         }
570     }
571 
572     if (!ret)
573     {
574         /* Failing that, try ISO-8859-1. */
575         ret = g_convert (string, -1, "ISO-8859-1", "UTF-8", NULL, NULL,
576                          &error);
577 
578         if (!ret)
579         {
580             g_debug ("Error while converting filename from display to ISO-8859-1: %s",
581                      error->message);
582             g_clear_error (&error);
583         }
584     }
585 
586     /* If all conversions fail, return an escaped version of the supplied UTF-8
587      * string. */
588     if (!ret)
589     {
590         gchar *escaped_str = g_strescape (string, NULL);
591 
592         /* TODO: Improve error string. */
593         Log_Print (LOG_ERROR,
594                    _("The UTF-8 string ‘%s’ could not be converted into filename encoding: %s"),
595                    string, _("Invalid UTF-8"));
596 
597         ret = escaped_str;
598     }
599 
600     return ret;
601 }
602 
603 /*
604  * Try_To_Validate_Utf8_String:
605  * @string: a string in unknown encoding
606  *
607  * Validate that @string is in UTF-8 encoding, or try to convert it to be so.
608  * Several alternative encodings are attempted, based on the current locale and
609  * some hardcoded fallbacks, before falling back to escaping the string.
610  *
611  * Returns: a newly-allocated UTF-8 encoded string
612  */
613 gchar *
Try_To_Validate_Utf8_String(const gchar * string)614 Try_To_Validate_Utf8_String (const gchar *string)
615 {
616     gchar *ret = NULL;
617     GError *error = NULL;
618 
619     g_return_val_if_fail (string != NULL, NULL);
620 
621     if (g_utf8_validate (string, -1, NULL))
622     {
623         /* String already in UTF-8. */
624         ret = g_strdup (string);
625     }
626     else
627     {
628         const gchar *legacy_encoding;
629 
630         /* Guess the legacy (pre-Unicode) encoding associated with the locale.
631          * For example, fr_FR.UTF-8 => fr_FR => ISO-8859-1. */
632         legacy_encoding = get_encoding_from_locale (get_locale ());
633         ret = g_convert (string, -1, "UTF-8", legacy_encoding, NULL, NULL,
634                          &error);
635 
636         if (!ret)
637         {
638             /* Failing that, try ISO-8859-1. */
639             g_debug ("Error converting string to legacy encoding '%s': %s",
640                      legacy_encoding, error->message);
641             g_clear_error (&error);
642             ret = g_convert (string, -1, "UTF-8", "ISO-8859-1", NULL, NULL,
643                              &error);
644         }
645 
646         if (!ret)
647         {
648             gchar *escaped_str = g_strescape (string, NULL);
649 
650             /* TODO: Improve error string. */
651             Log_Print (LOG_ERROR,
652                        _("The string ‘%s’ could not be converted into UTF-8: %s"),
653                        escaped_str, error->message);
654             g_clear_error (&error);
655 
656             ret = escaped_str;
657         }
658     }
659 
660     return ret;
661 }
662 
663 void
Charset_Populate_Combobox(GtkComboBox * combo,gint select_charset)664 Charset_Populate_Combobox (GtkComboBox *combo, gint select_charset)
665 {
666     gsize i;
667 
668     for (i = 0; i < CHARSET_TRANS_ARRAY_LEN; i++)
669     {
670         gtk_combo_box_text_append_text (GTK_COMBO_BOX_TEXT (combo),
671                                         _(charset_trans_array[i].charset_title));
672 
673     }
674 
675     gtk_combo_box_set_active (combo, select_charset);
676 }
677 
678 const gchar *
et_charset_get_name_from_index(guint index)679 et_charset_get_name_from_index (guint index)
680 {
681     g_return_val_if_fail (index <= CHARSET_TRANS_ARRAY_LEN, NULL);
682 
683     return charset_trans_array[index].charset_name;
684 }
685