1 /* GLIB - Library of useful routines for C programming
2  *
3  * gconvert.c: Convert between character sets using iconv
4  * Copyright Red Hat Inc., 2000
5  * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
6  *
7  * This library is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * This library is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 #include "config.h"
22 #include "glibconfig.h"
23 
24 #ifndef G_OS_WIN32
25 #include <iconv.h>
26 #endif
27 #include <errno.h>
28 #include <stdio.h>
29 #include <string.h>
30 #include <stdlib.h>
31 
32 #ifdef G_OS_WIN32
33 #include "win_iconv.c"
34 #endif
35 
36 #ifdef G_PLATFORM_WIN32
37 #define STRICT
38 #include <windows.h>
39 #undef STRICT
40 #endif
41 
42 #include "gconvert.h"
43 #include "gconvertprivate.h"
44 
45 #include "gcharsetprivate.h"
46 #include "gslist.h"
47 #include "gstrfuncs.h"
48 #include "gtestutils.h"
49 #include "gthread.h"
50 #include "gthreadprivate.h"
51 #include "gunicode.h"
52 #include "gfileutils.h"
53 #include "genviron.h"
54 
55 #include "glibintl.h"
56 
57 
58 /**
59  * SECTION:conversions
60  * @title: Character Set Conversion
61  * @short_description: convert strings between different character sets
62  *
 * The g_convert() family of functions wraps the functionality of iconv().
64  * In addition to pure character set conversions, GLib has functions to
65  * deal with the extra complications of encodings for file names.
66  *
67  * ## File Name Encodings
68  *
69  * Historically, UNIX has not had a defined encoding for file names:
70  * a file name is valid as long as it does not have path separators
71  * in it ("/"). However, displaying file names may require conversion:
72  * from the character set in which they were created, to the character
73  * set in which the application operates. Consider the Spanish file name
74  * "Presentación.sxi". If the application which created it uses
75  * ISO-8859-1 for its encoding,
76  * |[
77  * Character:  P  r  e  s  e  n  t  a  c  i  ó  n  .  s  x  i
78  * Hex code:   50 72 65 73 65 6e 74 61 63 69 f3 6e 2e 73 78 69
79  * ]|
 * However, if the application uses UTF-8, the actual file name on
81  * disk would look like this:
82  * |[
83  * Character:  P  r  e  s  e  n  t  a  c  i  ó     n  .  s  x  i
84  * Hex code:   50 72 65 73 65 6e 74 61 63 69 c3 b3 6e 2e 73 78 69
85  * ]|
 * GLib uses UTF-8 for its strings, and GUI toolkits like GTK+ that use
87  * GLib do the same thing. If you get a file name from the file system,
88  * for example, from readdir() or from g_dir_read_name(), and you wish
 * to display the file name to the user, you will need to convert it
90  * into UTF-8. The opposite case is when the user types the name of a
91  * file they wish to save: the toolkit will give you that string in
92  * UTF-8 encoding, and you will need to convert it to the character
93  * set used for file names before you can create the file with open()
94  * or fopen().
95  *
96  * By default, GLib assumes that file names on disk are in UTF-8
97  * encoding. This is a valid assumption for file systems which
98  * were created relatively recently: most applications use UTF-8
99  * encoding for their strings, and that is also what they use for
100  * the file names they create. However, older file systems may
101  * still contain file names created in "older" encodings, such as
102  * ISO-8859-1. In this case, for compatibility reasons, you may want
103  * to instruct GLib to use that particular encoding for file names
104  * rather than UTF-8. You can do this by specifying the encoding for
105  * file names in the [`G_FILENAME_ENCODING`][G_FILENAME_ENCODING]
106  * environment variable. For example, if your installation uses
107  * ISO-8859-1 for file names, you can put this in your `~/.profile`:
108  * |[
109  * export G_FILENAME_ENCODING=ISO-8859-1
110  * ]|
111  * GLib provides the functions g_filename_to_utf8() and
112  * g_filename_from_utf8() to perform the necessary conversions.
113  * These functions convert file names from the encoding specified
114  * in `G_FILENAME_ENCODING` to UTF-8 and vice-versa. This
115  * [diagram][file-name-encodings-diagram] illustrates how
116  * these functions are used to convert between UTF-8 and the
117  * encoding for file names in the file system.
118  *
 * ## Conversion between file name encodings # {#file-name-encodings-diagram}
120  *
121  * ![](file-name-encodings.png)
122  *
123  * ## Checklist for Application Writers
124  *
125  * This section is a practical summary of the detailed
126  * things to do to make sure your applications process file
127  * name encodings correctly.
128  *
129  * 1. If you get a file name from the file system from a function
130  *    such as readdir() or gtk_file_chooser_get_filename(), you do
131  *    not need to do any conversion to pass that file name to
132  *    functions like open(), rename(), or fopen() -- those are "raw"
133  *    file names which the file system understands.
134  *
135  * 2. If you need to display a file name, convert it to UTF-8 first
136  *    by using g_filename_to_utf8(). If conversion fails, display a
137  *    string like "Unknown file name". Do not convert this string back
138  *    into the encoding used for file names if you wish to pass it to
139  *    the file system; use the original file name instead.
140  *
141  *    For example, the document window of a word processor could display
142  *    "Unknown file name" in its title bar but still let the user save
143  *    the file, as it would keep the raw file name internally. This
144  *    can happen if the user has not set the `G_FILENAME_ENCODING`
 *    environment variable even though they have files whose names are
146  *    not encoded in UTF-8.
147  *
148  * 3. If your user interface lets the user type a file name for saving
149  *    or renaming, convert it to the encoding used for file names in
150  *    the file system by using g_filename_from_utf8(). Pass the converted
151  *    file name to functions like fopen(). If conversion fails, ask the
152  *    user to enter a different file name. This can happen if the user
153  *    types Japanese characters when `G_FILENAME_ENCODING` is set to
154  *    `ISO-8859-1`, for example.
155  */
156 
157 /* We try to terminate strings in unknown charsets with this many zero bytes
158  * to ensure that multibyte strings really are nul-terminated when we return
159  * them from g_convert() and friends.
160  */
161 #define NUL_TERMINATOR_LENGTH 4
162 
/* Expands, via GLib's G_DEFINE_QUARK() macro, to the definition of
 * g_convert_error_quark().
 * NOTE(review): G_CONVERT_ERROR, the error domain passed to every
 * g_set_error() call in this file, is expected to resolve to that
 * function -- confirm against gconvert.h.
 */
G_DEFINE_QUARK (g_convert_error, g_convert_error)
164 
165 static gboolean
166 try_conversion (const char *to_codeset,
167 		const char *from_codeset,
168 		iconv_t    *cd)
169 {
170   *cd = iconv_open (to_codeset, from_codeset);
171 
172   if (*cd == (iconv_t)-1 && errno == EINVAL)
173     return FALSE;
174   else
175     return TRUE;
176 }
177 
178 static gboolean
try_to_aliases(const char ** to_aliases,const char * from_codeset,iconv_t * cd)179 try_to_aliases (const char **to_aliases,
180 		const char  *from_codeset,
181 		iconv_t     *cd)
182 {
183   if (to_aliases)
184     {
185       const char **p = to_aliases;
186       while (*p)
187 	{
188 	  if (try_conversion (*p, from_codeset, cd))
189 	    return TRUE;
190 
191 	  p++;
192 	}
193     }
194 
195   return FALSE;
196 }
197 
198 /**
199  * g_iconv_open: (skip)
200  * @to_codeset: destination codeset
201  * @from_codeset: source codeset
202  *
203  * Same as the standard UNIX routine iconv_open(), but
204  * may be implemented via libiconv on UNIX flavors that lack
205  * a native implementation.
206  *
207  * GLib provides g_convert() and g_locale_to_utf8() which are likely
208  * more convenient than the raw iconv wrappers.
209  *
210  * Returns: a "conversion descriptor", or (GIConv)-1 if
211  *  opening the converter failed.
212  **/
213 GIConv
g_iconv_open(const gchar * to_codeset,const gchar * from_codeset)214 g_iconv_open (const gchar  *to_codeset,
215 	      const gchar  *from_codeset)
216 {
217   iconv_t cd;
218 
219   if (!try_conversion (to_codeset, from_codeset, &cd))
220     {
221       const char **to_aliases = _g_charset_get_aliases (to_codeset);
222       const char **from_aliases = _g_charset_get_aliases (from_codeset);
223 
224       if (from_aliases)
225 	{
226 	  const char **p = from_aliases;
227 	  while (*p)
228 	    {
229 	      if (try_conversion (to_codeset, *p, &cd))
230 		goto out;
231 
232 	      if (try_to_aliases (to_aliases, *p, &cd))
233 		goto out;
234 
235 	      p++;
236 	    }
237 	}
238 
239       if (try_to_aliases (to_aliases, from_codeset, &cd))
240 	goto out;
241     }
242 
243  out:
244   return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd;
245 }
246 
247 /**
248  * g_iconv: (skip)
249  * @converter: conversion descriptor from g_iconv_open()
250  * @inbuf: bytes to convert
251  * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf
252  * @outbuf: converted output bytes
253  * @outbytes_left: inout parameter, bytes available to fill in @outbuf
254  *
255  * Same as the standard UNIX routine iconv(), but
256  * may be implemented via libiconv on UNIX flavors that lack
257  * a native implementation.
258  *
259  * GLib provides g_convert() and g_locale_to_utf8() which are likely
260  * more convenient than the raw iconv wrappers.
261  *
262  * Returns: count of non-reversible conversions, or -1 on error
263  **/
264 gsize
g_iconv(GIConv converter,gchar ** inbuf,gsize * inbytes_left,gchar ** outbuf,gsize * outbytes_left)265 g_iconv (GIConv   converter,
266 	 gchar  **inbuf,
267 	 gsize   *inbytes_left,
268 	 gchar  **outbuf,
269 	 gsize   *outbytes_left)
270 {
271   iconv_t cd = (iconv_t)converter;
272 
273   return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
274 }
275 
276 /**
277  * g_iconv_close: (skip)
278  * @converter: a conversion descriptor from g_iconv_open()
279  *
280  * Same as the standard UNIX routine iconv_close(), but
281  * may be implemented via libiconv on UNIX flavors that lack
282  * a native implementation. Should be called to clean up
283  * the conversion descriptor from g_iconv_open() when
284  * you are done converting things.
285  *
286  * GLib provides g_convert() and g_locale_to_utf8() which are likely
287  * more convenient than the raw iconv wrappers.
288  *
289  * Returns: -1 on error, 0 on success
290  **/
291 gint
g_iconv_close(GIConv converter)292 g_iconv_close (GIConv converter)
293 {
294   iconv_t cd = (iconv_t)converter;
295 
296   return iconv_close (cd);
297 }
298 
299 static GIConv
open_converter(const gchar * to_codeset,const gchar * from_codeset,GError ** error)300 open_converter (const gchar *to_codeset,
301 		const gchar *from_codeset,
302 		GError     **error)
303 {
304   GIConv cd;
305 
306   cd = g_iconv_open (to_codeset, from_codeset);
307 
308   if (cd == (GIConv) -1)
309     {
310       /* Something went wrong.  */
311       if (error)
312 	{
313 	  if (errno == EINVAL)
314 	    g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
315 			 _("Conversion from character set “%s” to “%s” is not supported"),
316 			 from_codeset, to_codeset);
317 	  else
318 	    g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
319 			 _("Could not open converter from “%s” to “%s”"),
320 			 from_codeset, to_codeset);
321 	}
322     }
323 
324   return cd;
325 }
326 
327 static int
close_converter(GIConv cd)328 close_converter (GIConv cd)
329 {
330   if (cd == (GIConv) -1)
331     return 0;
332 
333   return g_iconv_close (cd);
334 }
335 
336 /**
337  * g_convert_with_iconv: (skip)
338  * @str:           (array length=len) (element-type guint8):
339  *                 the string to convert.
340  * @len:           the length of the string in bytes, or -1 if the string is
341  *                 nul-terminated (Note that some encodings may allow nul
342  *                 bytes to occur inside strings. In that case, using -1
343  *                 for the @len parameter is unsafe)
344  * @converter:     conversion descriptor from g_iconv_open()
345  * @bytes_read:    (out) (optional): location to store the number of bytes in
346  *                 the input string that were successfully converted, or %NULL.
347  *                 Even if the conversion was successful, this may be
348  *                 less than @len if there were partial characters
349  *                 at the end of the input. If the error
350  *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
351  *                 stored will be the byte offset after the last valid
352  *                 input sequence.
353  * @bytes_written: (out) (optional): the number of bytes stored in
354  *                 the output buffer (not including the terminating nul).
355  * @error:         location to store the error occurring, or %NULL to ignore
356  *                 errors. Any of the errors in #GConvertError may occur.
357  *
358  * Converts a string from one character set to another.
359  *
360  * Note that you should use g_iconv() for streaming conversions.
361  * Despite the fact that @bytes_read can return information about partial
362  * characters, the g_convert_... functions are not generally suitable
363  * for streaming. If the underlying converter maintains internal state,
364  * then this won't be preserved across successive calls to g_convert(),
365  * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
366  * this is the GNU C converter for CP1255 which does not emit a base
367  * character until it knows that the next character is not a mark that
368  * could combine with the base character.)
369  *
370  * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
371  *               If the conversion was successful, a newly allocated buffer
372  *               containing the converted string, which must be freed with
373  *               g_free(). Otherwise %NULL and @error will be set.
374  **/
375 gchar*
g_convert_with_iconv(const gchar * str,gssize len,GIConv converter,gsize * bytes_read,gsize * bytes_written,GError ** error)376 g_convert_with_iconv (const gchar *str,
377 		      gssize       len,
378 		      GIConv       converter,
379 		      gsize       *bytes_read,
380 		      gsize       *bytes_written,
381 		      GError     **error)
382 {
383   gchar *dest;
384   gchar *outp;
385   const gchar *p;
386   gsize inbytes_remaining;
387   gsize outbytes_remaining;
388   gsize err;
389   gsize outbuf_size;
390   gboolean have_error = FALSE;
391   gboolean done = FALSE;
392   gboolean reset = FALSE;
393 
394   g_return_val_if_fail (converter != (GIConv) -1, NULL);
395 
396   if (len < 0)
397     len = strlen (str);
398 
399   p = str;
400   inbytes_remaining = len;
401   outbuf_size = len + NUL_TERMINATOR_LENGTH;
402 
403   outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
404   outp = dest = g_malloc (outbuf_size);
405 
406   while (!done && !have_error)
407     {
408       if (reset)
409         err = g_iconv (converter, NULL, &inbytes_remaining, &outp, &outbytes_remaining);
410       else
411         err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
412 
413       if (err == (gsize) -1)
414 	{
415 	  switch (errno)
416 	    {
417 	    case EINVAL:
418 	      /* Incomplete text, do not report an error */
419 	      done = TRUE;
420 	      break;
421 	    case E2BIG:
422 	      {
423 		gsize used = outp - dest;
424 
425 		outbuf_size *= 2;
426 		dest = g_realloc (dest, outbuf_size);
427 
428 		outp = dest + used;
429 		outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
430 	      }
431 	      break;
432 	    case EILSEQ:
433               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
434                                    _("Invalid byte sequence in conversion input"));
435 	      have_error = TRUE;
436 	      break;
437 	    default:
438               {
439                 int errsv = errno;
440 
441                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
442                              _("Error during conversion: %s"),
443                              g_strerror (errsv));
444               }
445 	      have_error = TRUE;
446 	      break;
447 	    }
448 	}
449       else
450 	{
451 	  if (!reset)
452 	    {
453 	      /* call g_iconv with NULL inbuf to cleanup shift state */
454 	      reset = TRUE;
455 	      inbytes_remaining = 0;
456 	    }
457 	  else
458 	    done = TRUE;
459 	}
460     }
461 
462   memset (outp, 0, NUL_TERMINATOR_LENGTH);
463 
464   if (bytes_read)
465     *bytes_read = p - str;
466   else
467     {
468       if ((p - str) != len)
469 	{
470           if (!have_error)
471             {
472               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
473                                    _("Partial character sequence at end of input"));
474               have_error = TRUE;
475             }
476 	}
477     }
478 
479   if (bytes_written)
480     *bytes_written = outp - dest;	/* Doesn't include '\0' */
481 
482   if (have_error)
483     {
484       g_free (dest);
485       return NULL;
486     }
487   else
488     return dest;
489 }
490 
491 /**
492  * g_convert:
493  * @str:           (array length=len) (element-type guint8):
494  *                 the string to convert.
495  * @len:           the length of the string in bytes, or -1 if the string is
496  *                 nul-terminated (Note that some encodings may allow nul
497  *                 bytes to occur inside strings. In that case, using -1
498  *                 for the @len parameter is unsafe)
499  * @to_codeset:    name of character set into which to convert @str
500  * @from_codeset:  character set of @str.
501  * @bytes_read:    (out) (optional): location to store the number of bytes in
502  *                 the input string that were successfully converted, or %NULL.
503  *                 Even if the conversion was successful, this may be
504  *                 less than @len if there were partial characters
505  *                 at the end of the input. If the error
506  *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
507  *                 stored will be the byte offset after the last valid
508  *                 input sequence.
509  * @bytes_written: (out) (optional): the number of bytes stored in
510  *                 the output buffer (not including the terminating nul).
511  * @error:         location to store the error occurring, or %NULL to ignore
512  *                 errors. Any of the errors in #GConvertError may occur.
513  *
514  * Converts a string from one character set to another.
515  *
516  * Note that you should use g_iconv() for streaming conversions.
517  * Despite the fact that @bytes_read can return information about partial
518  * characters, the g_convert_... functions are not generally suitable
519  * for streaming. If the underlying converter maintains internal state,
520  * then this won't be preserved across successive calls to g_convert(),
521  * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
522  * this is the GNU C converter for CP1255 which does not emit a base
523  * character until it knows that the next character is not a mark that
524  * could combine with the base character.)
525  *
526  * Using extensions such as "//TRANSLIT" may not work (or may not work
527  * well) on many platforms.  Consider using g_str_to_ascii() instead.
528  *
529  * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
530  *          If the conversion was successful, a newly allocated buffer
531  *          containing the converted string, which must be freed with g_free().
532  *          Otherwise %NULL and @error will be set.
533  **/
534 gchar*
g_convert(const gchar * str,gssize len,const gchar * to_codeset,const gchar * from_codeset,gsize * bytes_read,gsize * bytes_written,GError ** error)535 g_convert (const gchar *str,
536            gssize       len,
537            const gchar *to_codeset,
538            const gchar *from_codeset,
539            gsize       *bytes_read,
540 	   gsize       *bytes_written,
541 	   GError     **error)
542 {
543   gchar *res;
544   GIConv cd;
545 
546   g_return_val_if_fail (str != NULL, NULL);
547   g_return_val_if_fail (to_codeset != NULL, NULL);
548   g_return_val_if_fail (from_codeset != NULL, NULL);
549 
550   cd = open_converter (to_codeset, from_codeset, error);
551 
552   if (cd == (GIConv) -1)
553     {
554       if (bytes_read)
555         *bytes_read = 0;
556 
557       if (bytes_written)
558         *bytes_written = 0;
559 
560       return NULL;
561     }
562 
563   res = g_convert_with_iconv (str, len, cd,
564 			      bytes_read, bytes_written,
565 			      error);
566 
567   close_converter (cd);
568 
569   return res;
570 }
571 
572 /**
573  * g_convert_with_fallback:
574  * @str:          (array length=len) (element-type guint8):
575  *                the string to convert.
576  * @len:          the length of the string in bytes, or -1 if the string is
577  *                 nul-terminated (Note that some encodings may allow nul
578  *                 bytes to occur inside strings. In that case, using -1
579  *                 for the @len parameter is unsafe)
580  * @to_codeset:   name of character set into which to convert @str
581  * @from_codeset: character set of @str.
582  * @fallback:     UTF-8 string to use in place of characters not
583  *                present in the target encoding. (The string must be
584  *                representable in the target encoding).
585  *                If %NULL, characters not in the target encoding will
586  *                be represented as Unicode escapes \uxxxx or \Uxxxxyyyy.
587  * @bytes_read:   (out) (optional): location to store the number of bytes in
588  *                the input string that were successfully converted, or %NULL.
589  *                Even if the conversion was successful, this may be
590  *                less than @len if there were partial characters
591  *                at the end of the input.
592  * @bytes_written: (out) (optional): the number of bytes stored in
593  *                 the output buffer (not including the terminating nul).
594  * @error:        location to store the error occurring, or %NULL to ignore
595  *                errors. Any of the errors in #GConvertError may occur.
596  *
597  * Converts a string from one character set to another, possibly
598  * including fallback sequences for characters not representable
599  * in the output. Note that it is not guaranteed that the specification
600  * for the fallback sequences in @fallback will be honored. Some
601  * systems may do an approximate conversion from @from_codeset
602  * to @to_codeset in their iconv() functions,
603  * in which case GLib will simply return that approximate conversion.
604  *
605  * Note that you should use g_iconv() for streaming conversions.
606  * Despite the fact that @bytes_read can return information about partial
607  * characters, the g_convert_... functions are not generally suitable
608  * for streaming. If the underlying converter maintains internal state,
609  * then this won't be preserved across successive calls to g_convert(),
610  * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
611  * this is the GNU C converter for CP1255 which does not emit a base
612  * character until it knows that the next character is not a mark that
613  * could combine with the base character.)
614  *
615  * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
616  *          If the conversion was successful, a newly allocated buffer
617  *          containing the converted string, which must be freed with g_free().
618  *          Otherwise %NULL and @error will be set.
619  **/
gchar*
g_convert_with_fallback (const gchar *str,
			 gssize       len,
			 const gchar *to_codeset,
			 const gchar *from_codeset,
			 const gchar *fallback,
			 gsize       *bytes_read,
			 gsize       *bytes_written,
			 GError     **error)
{
  /* Overview: try a direct conversion first. Only if that fails with
   * G_CONVERT_ERROR_ILLEGAL_SEQUENCE is @str converted to UTF-8 and then
   * fed through a UTF-8 -> @to_codeset converter, substituting a fallback
   * string for each character that converter rejects.
   */
  gchar *utf8;
  gchar *dest;
  gchar *outp;
  /* Text currently being substituted for a rejected character. It is
   * heap-allocated (and must be freed) only when @fallback is NULL.
   */
  const gchar *insert_str = NULL;
  const gchar *p;
  gsize inbytes_remaining;
  /* Resume position and remaining byte count in utf8 while insert_str is
   * being converted. A non-NULL save_p also serves as the "currently
   * converting the fallback" flag.
   */
  const gchar *save_p = NULL;
  gsize save_inbytes = 0;
  gsize outbytes_remaining;
  gsize err;
  GIConv cd;
  gsize outbuf_size;
  gboolean have_error = FALSE;
  gboolean done = FALSE;

  GError *local_error = NULL;

  g_return_val_if_fail (str != NULL, NULL);
  g_return_val_if_fail (to_codeset != NULL, NULL);
  g_return_val_if_fail (from_codeset != NULL, NULL);

  if (len < 0)
    len = strlen (str);

  /* Try an exact conversion; we only proceed if this fails
   * due to an illegal sequence in the input string.
   */
  dest = g_convert (str, len, to_codeset, from_codeset,
		    bytes_read, bytes_written, &local_error);
  if (!local_error)
    return dest;

  /* Any failure other than an illegal sequence is passed to the caller. */
  if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
    {
      g_propagate_error (error, local_error);
      return NULL;
    }
  else
    g_error_free (local_error);

  local_error = NULL;

  /* No go; to proceed, we need a converter from "UTF-8" to
   * to_codeset, and the string as UTF-8.
   */
  cd = open_converter (to_codeset, "UTF-8", error);
  if (cd == (GIConv) -1)
    {
      if (bytes_read)
        *bytes_read = 0;

      if (bytes_written)
        *bytes_written = 0;

      return NULL;
    }

  /* inbytes_remaining is passed in the bytes_written position, so on
   * success it holds the byte length of the UTF-8 copy.
   */
  utf8 = g_convert (str, len, "UTF-8", from_codeset,
		    bytes_read, &inbytes_remaining, error);
  if (!utf8)
    {
      close_converter (cd);
      if (bytes_written)
        *bytes_written = 0;
      return NULL;
    }

  /* Now the heart of the code. We loop through the UTF-8 string, and
   * whenever we hit an offending character, we form fallback, convert
   * the fallback to the target codeset, and then go back to
   * converting the original string after finishing with the fallback.
   *
   * The variables save_p and save_inbytes store the input state
   * for the original string while we are converting the fallback
   */
  p = utf8;

  /* Initial output size is based on the original input length; the last
   * NUL_TERMINATOR_LENGTH bytes are always kept free for the terminator.
   */
  outbuf_size = len + NUL_TERMINATOR_LENGTH;
  outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
  outp = dest = g_malloc (outbuf_size);

  while (!done && !have_error)
    {
      /* g_iconv() works on a local copy of the remaining-byte count, which
       * is written back afterwards.
       * NOTE(review): the reason for the temporary is not evident here.
       */
      gsize inbytes_tmp = inbytes_remaining;
      err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
      inbytes_remaining = inbytes_tmp;

      if (err == (gsize) -1)
	{
	  switch (errno)
	    {
	    case EINVAL:
	      /* Incomplete input is treated as impossible here; presumably
	       * because utf8 was produced by g_convert() above.
	       */
	      g_assert_not_reached();
	      break;
	    case E2BIG:
	      {
		/* Output buffer full: double it and continue where the
		 * converter stopped.
		 */
		gsize used = outp - dest;

		outbuf_size *= 2;
		dest = g_realloc (dest, outbuf_size);

		outp = dest + used;
		outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;

		break;
	      }
	    case EILSEQ:
	      if (save_p)
		{
		  /* Error converting fallback string - fatal
		   */
		  g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
			       _("Cannot convert fallback “%s” to codeset “%s”"),
			       insert_str, to_codeset);
		  have_error = TRUE;
		  break;
		}
	      else if (p)
		{
		  /* p points at the rejected character in utf8. Pick the
		   * replacement: a \uxxxx / \Uxxxxxxxx escape when no
		   * fallback was supplied, otherwise the caller's string.
		   */
		  if (!fallback)
		    {
		      gunichar ch = g_utf8_get_char (p);
		      insert_str = g_strdup_printf (ch < 0x10000 ? "\\u%04x" : "\\U%08x",
						    ch);
		    }
		  else
		    insert_str = fallback;

		  /* Remember where to resume (just past the rejected
		   * character) and switch the input over to insert_str.
		   */
		  save_p = g_utf8_next_char (p);
		  save_inbytes = inbytes_remaining - (save_p - p);
		  p = insert_str;
		  inbytes_remaining = strlen (p);
		  break;
		}
              /* if p is null */
              /* (p is NULL only during the final shift-state cleanup call,
               * so EILSEQ there is reported as a generic failure.)
               */
              G_GNUC_FALLTHROUGH;
	    default:
              {
                int errsv = errno;

                g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
                             _("Error during conversion: %s"),
                             g_strerror (errsv));
              }

	      have_error = TRUE;
	      break;
	    }
	}
      else
	{
	  /* g_iconv() succeeded; which input it just finished decides the
	   * next step.
	   */
	  if (save_p)
	    {
	      /* Fallback string done: release it if we allocated it and
	       * resume the UTF-8 input at the saved position.
	       */
	      if (!fallback)
		g_free ((gchar *)insert_str);
	      p = save_p;
	      inbytes_remaining = save_inbytes;
	      save_p = NULL;
	    }
	  else if (p)
	    {
	      /* call g_iconv with NULL inbuf to cleanup shift state */
	      p = NULL;
	      inbytes_remaining = 0;
	    }
	  else
	    done = TRUE;
	}
    }

  /* Cleanup
   */
  memset (outp, 0, NUL_TERMINATOR_LENGTH);

  close_converter (cd);

  /* Note: this is stored before the have_error check, so it is written on
   * the failure path as well.
   */
  if (bytes_written)
    *bytes_written = outp - dest;	/* Doesn't include '\0' */

  g_free (utf8);

  if (have_error)
    {
      /* save_p still set means a generated escape string was being
       * converted when the error occurred and has not been freed yet.
       */
      if (save_p && !fallback)
	g_free ((gchar *)insert_str);
      g_free (dest);
      return NULL;
    }
  else
    return dest;
}
821 
822 /*
823  * g_locale_to_utf8
824  *
825  *
826  */
827 
828 /*
829  * Validate @string as UTF-8. @len can be negative if @string is
830  * nul-terminated, or a non-negative value in bytes. If @string ends in an
831  * incomplete sequence, or contains any illegal sequences or nul codepoints,
832  * %NULL will be returned and the error set to
833  * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE.
834  * On success, @bytes_read and @bytes_written, if provided, will be set to
835  * the number of bytes in @string up to @len or the terminating nul byte.
836  * On error, @bytes_read will be set to the byte offset after the last valid
837  * and non-nul UTF-8 sequence in @string, and @bytes_written will be set to 0.
838  */
839 static gchar *
strdup_len(const gchar * string,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)840 strdup_len (const gchar *string,
841 	    gssize       len,
842 	    gsize       *bytes_read,
843 	    gsize       *bytes_written,
844 	    GError     **error)
845 {
846   gsize real_len;
847   const gchar *end_valid;
848 
849   if (!g_utf8_validate (string, len, &end_valid))
850     {
851       if (bytes_read)
852 	*bytes_read = end_valid - string;
853       if (bytes_written)
854 	*bytes_written = 0;
855 
856       g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
857                            _("Invalid byte sequence in conversion input"));
858       return NULL;
859     }
860 
861   real_len = end_valid - string;
862 
863   if (bytes_read)
864     *bytes_read = real_len;
865   if (bytes_written)
866     *bytes_written = real_len;
867 
868   return g_strndup (string, real_len);
869 }
870 
/* Bit flags accepted by convert_checked() to request checks for embedded
 * nul bytes. The values are distinct bits and may be OR-ed together.
 */
typedef enum
{
  /* Check the input string for embedded nul bytes. */
  CONVERT_CHECK_NO_NULS_IN_INPUT  = 0x1,
  /* Check the converted output for embedded nul bytes. */
  CONVERT_CHECK_NO_NULS_IN_OUTPUT = 0x2
} ConvertCheckFlags;
876 
877 /*
878  * Convert from @string in the encoding identified by @from_codeset,
879  * returning a string in the encoding identifed by @to_codeset.
880  * @len can be negative if @string is nul-terminated, or a non-negative
881  * value in bytes. Flags defined in #ConvertCheckFlags can be set in @flags
882  * to check the input, the output, or both, for embedded nul bytes.
883  * On success, @bytes_read, if provided, will be set to the number of bytes
884  * in @string up to @len or the terminating nul byte, and @bytes_written, if
885  * provided, will be set to the number of output bytes written into the
886  * returned buffer, excluding the terminating nul sequence.
887  * On error, @bytes_read will be set to the byte offset after the last valid
888  * sequence in @string, and @bytes_written will be set to 0.
889  */
890 static gchar *
convert_checked(const gchar * string,gssize len,const gchar * to_codeset,const gchar * from_codeset,ConvertCheckFlags flags,gsize * bytes_read,gsize * bytes_written,GError ** error)891 convert_checked (const gchar      *string,
892                  gssize            len,
893                  const gchar      *to_codeset,
894                  const gchar      *from_codeset,
895                  ConvertCheckFlags flags,
896                  gsize            *bytes_read,
897                  gsize            *bytes_written,
898                  GError          **error)
899 {
900   gchar *out;
901   gsize outbytes;
902 
903   if ((flags & CONVERT_CHECK_NO_NULS_IN_INPUT) && len > 0)
904     {
905       const gchar *early_nul = memchr (string, '\0', len);
906       if (early_nul != NULL)
907         {
908           if (bytes_read)
909             *bytes_read = early_nul - string;
910           if (bytes_written)
911             *bytes_written = 0;
912 
913           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
914                                _("Embedded NUL byte in conversion input"));
915           return NULL;
916         }
917     }
918 
919   out = g_convert (string, len, to_codeset, from_codeset,
920                    bytes_read, &outbytes, error);
921   if (out == NULL)
922     {
923       if (bytes_written)
924         *bytes_written = 0;
925       return NULL;
926     }
927 
928   if ((flags & CONVERT_CHECK_NO_NULS_IN_OUTPUT)
929       && memchr (out, '\0', outbytes) != NULL)
930     {
931       g_free (out);
932       if (bytes_written)
933         *bytes_written = 0;
934       g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_EMBEDDED_NUL,
935                            _("Embedded NUL byte in conversion output"));
936       return NULL;
937     }
938 
939   if (bytes_written)
940     *bytes_written = outbytes;
941   return out;
942 }
943 
944 /**
945  * g_locale_to_utf8:
946  * @opsysstring:   (array length=len) (element-type guint8): a string in the
947  *                 encoding of the current locale. On Windows
948  *                 this means the system codepage.
949  * @len:           the length of the string, or -1 if the string is
950  *                 nul-terminated (Note that some encodings may allow nul
951  *                 bytes to occur inside strings. In that case, using -1
952  *                 for the @len parameter is unsafe)
953  * @bytes_read: (out) (optional): location to store the number of bytes in the
954  *                 input string that were successfully converted, or %NULL.
955  *                 Even if the conversion was successful, this may be
956  *                 less than @len if there were partial characters
957  *                 at the end of the input. If the error
958  *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
959  *                 stored will be the byte offset after the last valid
960  *                 input sequence.
961  * @bytes_written: (out) (optional): the number of bytes stored in the output
962  *                 buffer (not including the terminating nul).
963  * @error:         location to store the error occurring, or %NULL to ignore
964  *                 errors. Any of the errors in #GConvertError may occur.
965  *
966  * Converts a string which is in the encoding used for strings by
967  * the C runtime (usually the same as that used by the operating
968  * system) in the [current locale][setlocale] into a UTF-8 string.
969  *
970  * If the source encoding is not UTF-8 and the conversion output contains a
971  * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
972  * function returns %NULL.
973  * If the source encoding is UTF-8, an embedded nul character is treated with
974  * the %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error for backward compatibility with
975  * earlier versions of this library. Use g_convert() to produce output that
976  * may contain embedded nul characters.
977  *
978  * Returns: (type utf8): The converted string, or %NULL on an error.
979  **/
980 gchar *
g_locale_to_utf8(const gchar * opsysstring,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)981 g_locale_to_utf8 (const gchar  *opsysstring,
982 		  gssize        len,
983 		  gsize        *bytes_read,
984 		  gsize        *bytes_written,
985 		  GError      **error)
986 {
987   const char *charset;
988 
989   if (g_get_charset (&charset))
990     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
991   else
992     return convert_checked (opsysstring, len, "UTF-8", charset,
993                             CONVERT_CHECK_NO_NULS_IN_OUTPUT,
994                             bytes_read, bytes_written, error);
995 }
996 
997 /*
998  * Do the exact same as g_locale_to_utf8 except that the charset would
999  * be retrieved from _g_get_time_charset (which uses LC_TIME)
1000  *
1001  * Returns: The converted string, or %NULL on an error.
1002  */
1003 gchar *
_g_time_locale_to_utf8(const gchar * opsysstring,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1004 _g_time_locale_to_utf8 (const gchar *opsysstring,
1005                         gssize       len,
1006                         gsize       *bytes_read,
1007                         gsize       *bytes_written,
1008                         GError     **error)
1009 {
1010   const char *charset;
1011 
1012   if (_g_get_time_charset (&charset))
1013     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1014   else
1015     return convert_checked (opsysstring, len, "UTF-8", charset,
1016                             CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1017                             bytes_read, bytes_written, error);
1018 }
1019 
1020 /*
1021  * Do the exact same as g_locale_to_utf8 except that the charset would
1022  * be retrieved from _g_get_ctype_charset (which uses LC_CTYPE)
1023  *
1024  * Returns: The converted string, or %NULL on an error.
1025  */
1026 gchar *
_g_ctype_locale_to_utf8(const gchar * opsysstring,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1027 _g_ctype_locale_to_utf8 (const gchar *opsysstring,
1028                          gssize       len,
1029                          gsize       *bytes_read,
1030                          gsize       *bytes_written,
1031                          GError     **error)
1032 {
1033   const char *charset;
1034 
1035   if (_g_get_ctype_charset (&charset))
1036     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1037   else
1038     return convert_checked (opsysstring, len, "UTF-8", charset,
1039                             CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1040                             bytes_read, bytes_written, error);
1041 }
1042 
1043 /**
1044  * g_locale_from_utf8:
1045  * @utf8string:    a UTF-8 encoded string
1046  * @len:           the length of the string, or -1 if the string is
1047  *                 nul-terminated.
1048  * @bytes_read: (out) (optional): location to store the number of bytes in the
1049  *                 input string that were successfully converted, or %NULL.
1050  *                 Even if the conversion was successful, this may be
1051  *                 less than @len if there were partial characters
1052  *                 at the end of the input. If the error
1053  *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1054  *                 stored will be the byte offset after the last valid
1055  *                 input sequence.
1056  * @bytes_written: (out) (optional): the number of bytes stored in the output
1057  *                 buffer (not including the terminating nul).
1058  * @error:         location to store the error occurring, or %NULL to ignore
1059  *                 errors. Any of the errors in #GConvertError may occur.
1060  *
1061  * Converts a string from UTF-8 to the encoding used for strings by
1062  * the C runtime (usually the same as that used by the operating
1063  * system) in the [current locale][setlocale]. On Windows this means
1064  * the system codepage.
1065  *
1066  * The input string shall not contain nul characters even if the @len
1067  * argument is positive. A nul character found inside the string will result
1068  * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Use g_convert() to convert
1069  * input that may contain embedded nul characters.
1070  *
1071  * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
1072  *          A newly-allocated buffer containing the converted string,
1073  *          or %NULL on an error, and error will be set.
1074  **/
1075 gchar *
g_locale_from_utf8(const gchar * utf8string,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1076 g_locale_from_utf8 (const gchar *utf8string,
1077 		    gssize       len,
1078 		    gsize       *bytes_read,
1079 		    gsize       *bytes_written,
1080 		    GError     **error)
1081 {
1082   const gchar *charset;
1083 
1084   if (g_get_charset (&charset))
1085     return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1086   else
1087     return convert_checked (utf8string, len, charset, "UTF-8",
1088                             CONVERT_CHECK_NO_NULS_IN_INPUT,
1089                             bytes_read, bytes_written, error);
1090 }
1091 
1092 #ifndef G_PLATFORM_WIN32
1093 
/* Cached state behind g_get_filename_charsets(), kept in a GPrivate slot. */
typedef struct _GFilenameCharsetCache GFilenameCharsetCache;

struct _GFilenameCharsetCache {
  /* Value returned by g_get_filename_charsets() */
  gboolean is_utf8;
  /* Copy of the g_get_charset() result the cache was built for; compared
   * on each call to decide whether the list must be rebuilt */
  gchar *charset;
  /* NULL-terminated list of charset names handed out to callers */
  gchar **filename_charsets;
};
1101 
1102 static void
filename_charset_cache_free(gpointer data)1103 filename_charset_cache_free (gpointer data)
1104 {
1105   GFilenameCharsetCache *cache = data;
1106   g_free (cache->charset);
1107   g_strfreev (cache->filename_charsets);
1108   g_free (cache);
1109 }
1110 
1111 /**
1112  * g_get_filename_charsets:
1113  * @filename_charsets: (out) (transfer none) (array zero-terminated=1):
1114  *    return location for the %NULL-terminated list of encoding names
1115  *
1116  * Determines the preferred character sets used for filenames.
1117  * The first character set from the @charsets is the filename encoding, the
1118  * subsequent character sets are used when trying to generate a displayable
1119  * representation of a filename, see g_filename_display_name().
1120  *
1121  * On Unix, the character sets are determined by consulting the
1122  * environment variables `G_FILENAME_ENCODING` and `G_BROKEN_FILENAMES`.
1123  * On Windows, the character set used in the GLib API is always UTF-8
1124  * and said environment variables have no effect.
1125  *
1126  * `G_FILENAME_ENCODING` may be set to a comma-separated list of
1127  * character set names. The special token "\@locale" is taken
1128  * to  mean the character set for the [current locale][setlocale].
1129  * If `G_FILENAME_ENCODING` is not set, but `G_BROKEN_FILENAMES` is,
1130  * the character set of the current locale is taken as the filename
1131  * encoding. If neither environment variable  is set, UTF-8 is taken
1132  * as the filename encoding, but the character set of the current locale
1133  * is also put in the list of encodings.
1134  *
1135  * The returned @charsets belong to GLib and must not be freed.
1136  *
1137  * Note that on Unix, regardless of the locale character set or
1138  * `G_FILENAME_ENCODING` value, the actual file names present
1139  * on a system might be in any random encoding or just gibberish.
1140  *
1141  * Returns: %TRUE if the filename encoding is UTF-8.
1142  *
1143  * Since: 2.6
1144  */
1145 gboolean
g_get_filename_charsets(const gchar *** filename_charsets)1146 g_get_filename_charsets (const gchar ***filename_charsets)
1147 {
1148   static GPrivate cache_private = G_PRIVATE_INIT (filename_charset_cache_free);
1149   GFilenameCharsetCache *cache = g_private_get (&cache_private);
1150   const gchar *charset;
1151 
1152   if (!cache)
1153     cache = g_private_set_alloc0 (&cache_private, sizeof (GFilenameCharsetCache));
1154 
1155   g_get_charset (&charset);
1156 
1157   if (!(cache->charset && strcmp (cache->charset, charset) == 0))
1158     {
1159       const gchar *new_charset;
1160       const gchar *p;
1161       gint i;
1162 
1163       g_free (cache->charset);
1164       g_strfreev (cache->filename_charsets);
1165       cache->charset = g_strdup (charset);
1166 
1167       p = g_getenv ("G_FILENAME_ENCODING");
1168       if (p != NULL && p[0] != '\0')
1169 	{
1170 	  cache->filename_charsets = g_strsplit (p, ",", 0);
1171 	  cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0);
1172 
1173 	  for (i = 0; cache->filename_charsets[i]; i++)
1174 	    {
1175 	      if (strcmp ("@locale", cache->filename_charsets[i]) == 0)
1176 		{
1177 		  g_get_charset (&new_charset);
1178 		  g_free (cache->filename_charsets[i]);
1179 		  cache->filename_charsets[i] = g_strdup (new_charset);
1180 		}
1181 	    }
1182 	}
1183       else if (g_getenv ("G_BROKEN_FILENAMES") != NULL)
1184 	{
1185 	  cache->filename_charsets = g_new0 (gchar *, 2);
1186 	  cache->is_utf8 = g_get_charset (&new_charset);
1187 	  cache->filename_charsets[0] = g_strdup (new_charset);
1188 	}
1189       else
1190 	{
1191 	  cache->filename_charsets = g_new0 (gchar *, 3);
1192 	  cache->is_utf8 = TRUE;
1193 	  cache->filename_charsets[0] = g_strdup ("UTF-8");
1194 	  if (!g_get_charset (&new_charset))
1195 	    cache->filename_charsets[1] = g_strdup (new_charset);
1196 	}
1197     }
1198 
1199   if (filename_charsets)
1200     *filename_charsets = (const gchar **)cache->filename_charsets;
1201 
1202   return cache->is_utf8;
1203 }
1204 
1205 #else /* G_PLATFORM_WIN32 */
1206 
1207 gboolean
g_get_filename_charsets(const gchar *** filename_charsets)1208 g_get_filename_charsets (const gchar ***filename_charsets)
1209 {
1210   static const gchar *charsets[] = {
1211     "UTF-8",
1212     NULL
1213   };
1214 
1215 #ifdef G_OS_WIN32
1216   /* On Windows GLib pretends that the filename charset is UTF-8 */
1217   if (filename_charsets)
1218     *filename_charsets = charsets;
1219 
1220   return TRUE;
1221 #else
1222   gboolean result;
1223 
1224   /* Cygwin works like before */
1225   result = g_get_charset (&(charsets[0]));
1226 
1227   if (filename_charsets)
1228     *filename_charsets = charsets;
1229 
1230   return result;
1231 #endif
1232 }
1233 
1234 #endif /* G_PLATFORM_WIN32 */
1235 
1236 static gboolean
get_filename_charset(const gchar ** filename_charset)1237 get_filename_charset (const gchar **filename_charset)
1238 {
1239   const gchar **charsets;
1240   gboolean is_utf8;
1241 
1242   is_utf8 = g_get_filename_charsets (&charsets);
1243 
1244   if (filename_charset)
1245     *filename_charset = charsets[0];
1246 
1247   return is_utf8;
1248 }
1249 
1250 /**
1251  * g_filename_to_utf8:
1252  * @opsysstring: (type filename): a string in the encoding for filenames
1253  * @len:           the length of the string, or -1 if the string is
1254  *                 nul-terminated (Note that some encodings may allow nul
1255  *                 bytes to occur inside strings. In that case, using -1
1256  *                 for the @len parameter is unsafe)
1257  * @bytes_read: (out) (optional): location to store the number of bytes in the
1258  *                 input string that were successfully converted, or %NULL.
1259  *                 Even if the conversion was successful, this may be
1260  *                 less than @len if there were partial characters
1261  *                 at the end of the input. If the error
1262  *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1263  *                 stored will be the byte offset after the last valid
1264  *                 input sequence.
1265  * @bytes_written: (out) (optional): the number of bytes stored in the output
1266  *                 buffer (not including the terminating nul).
1267  * @error:         location to store the error occurring, or %NULL to ignore
1268  *                 errors. Any of the errors in #GConvertError may occur.
1269  *
1270  * Converts a string which is in the encoding used by GLib for
1271  * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8
1272  * for filenames; on other platforms, this function indirectly depends on
1273  * the [current locale][setlocale].
1274  *
1275  * The input string shall not contain nul characters even if the @len
1276  * argument is positive. A nul character found inside the string will result
1277  * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE.
1278  * If the source encoding is not UTF-8 and the conversion output contains a
1279  * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
1280  * function returns %NULL. Use g_convert() to produce output that
1281  * may contain embedded nul characters.
1282  *
1283  * Returns: (type utf8): The converted string, or %NULL on an error.
1284  **/
1285 gchar*
g_filename_to_utf8(const gchar * opsysstring,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1286 g_filename_to_utf8 (const gchar *opsysstring,
1287 		    gssize       len,
1288 		    gsize       *bytes_read,
1289 		    gsize       *bytes_written,
1290 		    GError     **error)
1291 {
1292   const gchar *charset;
1293 
1294   g_return_val_if_fail (opsysstring != NULL, NULL);
1295 
1296   if (get_filename_charset (&charset))
1297     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1298   else
1299     return convert_checked (opsysstring, len, "UTF-8", charset,
1300                             CONVERT_CHECK_NO_NULS_IN_INPUT |
1301                             CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1302                             bytes_read, bytes_written, error);
1303 }
1304 
1305 /**
1306  * g_filename_from_utf8:
1307  * @utf8string:    (type utf8): a UTF-8 encoded string.
1308  * @len:           the length of the string, or -1 if the string is
1309  *                 nul-terminated.
1310  * @bytes_read:    (out) (optional): location to store the number of bytes in
1311  *                 the input string that were successfully converted, or %NULL.
1312  *                 Even if the conversion was successful, this may be
1313  *                 less than @len if there were partial characters
1314  *                 at the end of the input. If the error
1315  *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1316  *                 stored will be the byte offset after the last valid
1317  *                 input sequence.
1318  * @bytes_written: (out) (optional): the number of bytes stored in
1319  *                 the output buffer (not including the terminating nul).
1320  * @error:         location to store the error occurring, or %NULL to ignore
1321  *                 errors. Any of the errors in #GConvertError may occur.
1322  *
1323  * Converts a string from UTF-8 to the encoding GLib uses for
1324  * filenames. Note that on Windows GLib uses UTF-8 for filenames;
1325  * on other platforms, this function indirectly depends on the
1326  * [current locale][setlocale].
1327  *
1328  * The input string shall not contain nul characters even if the @len
1329  * argument is positive. A nul character found inside the string will result
1330  * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. If the filename encoding is
1331  * not UTF-8 and the conversion output contains a nul character, the error
1332  * %G_CONVERT_ERROR_EMBEDDED_NUL is set and the function returns %NULL.
1333  *
1334  * Returns: (type filename):
1335  *               The converted string, or %NULL on an error.
1336  **/
1337 gchar*
g_filename_from_utf8(const gchar * utf8string,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1338 g_filename_from_utf8 (const gchar *utf8string,
1339 		      gssize       len,
1340 		      gsize       *bytes_read,
1341 		      gsize       *bytes_written,
1342 		      GError     **error)
1343 {
1344   const gchar *charset;
1345 
1346   if (get_filename_charset (&charset))
1347     return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1348   else
1349     return convert_checked (utf8string, len, charset, "UTF-8",
1350                             CONVERT_CHECK_NO_NULS_IN_INPUT |
1351                             CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1352                             bytes_read, bytes_written, error);
1353 }
1354 
1355 /* Test of haystack has the needle prefix, comparing case
1356  * insensitive. haystack may be UTF-8, but needle must
1357  * contain only ascii. */
1358 static gboolean
has_case_prefix(const gchar * haystack,const gchar * needle)1359 has_case_prefix (const gchar *haystack, const gchar *needle)
1360 {
1361   const gchar *h, *n;
1362 
1363   /* Eat one character at a time. */
1364   h = haystack;
1365   n = needle;
1366 
1367   while (*n && *h &&
1368 	 g_ascii_tolower (*n) == g_ascii_tolower (*h))
1369     {
1370       n++;
1371       h++;
1372     }
1373 
1374   return *n == '\0';
1375 }
1376 
/* Escaping modes accepted by g_escape_uri_string(). Each value is a single
 * bit that is tested against the per-character masks in acceptable[]; a
 * character whose mask has the bit set is left unescaped in that mode. */
typedef enum {
  UNSAFE_ALL        = 0x1,  /* Escape all unsafe characters   */
  UNSAFE_ALLOW_PLUS = 0x2,  /* Allows '+'  */
  UNSAFE_PATH       = 0x8,  /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */
  UNSAFE_HOST       = 0x10, /* Allows '/' and ':' and '@' */
  UNSAFE_SLASHES    = 0x20  /* Allows all characters except for '/' and '%' */
} UnsafeCharacterSet;
1384 
/* Per-character escaping masks, indexed by (character - 32). Each entry is
 * a bit set of UnsafeCharacterSet values: the ACCEPTABLE() macro in
 * g_escape_uri_string() leaves a character unescaped when its entry has
 * the bit of the requested mode set. */
static const guchar acceptable[96] = {
  /* A table of the ASCII chars from space (32) to DEL (127) */
  /*      !    "    #    $    %    &    '    (    )    *    +    ,    -    .    / */
  0x00,0x3F,0x20,0x20,0x28,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x2A,0x28,0x3F,0x3F,0x1C,
  /* 0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ? */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x20,
  /* @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O */
  0x38,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
  /* P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _ */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F,
  /* `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o */
  0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
  /* p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~  DEL */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20
};
1400 
/* Upper-case hex digits used to emit %XX escapes. The array holds exactly
 * 16 characters and therefore has no terminating nul: index it, never
 * treat it as a C string. */
static const gchar hex[16] = "0123456789ABCDEF";
1402 
1403 /* Note: This escape function works on file: URIs, but if you want to
1404  * escape something else, please read RFC-2396 */
1405 static gchar *
g_escape_uri_string(const gchar * string,UnsafeCharacterSet mask)1406 g_escape_uri_string (const gchar *string,
1407 		     UnsafeCharacterSet mask)
1408 {
1409 #define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask))
1410 
1411   const gchar *p;
1412   gchar *q;
1413   gchar *result;
1414   int c;
1415   gint unacceptable;
1416   UnsafeCharacterSet use_mask;
1417 
1418   g_return_val_if_fail (mask == UNSAFE_ALL
1419 			|| mask == UNSAFE_ALLOW_PLUS
1420 			|| mask == UNSAFE_PATH
1421 			|| mask == UNSAFE_HOST
1422 			|| mask == UNSAFE_SLASHES, NULL);
1423 
1424   unacceptable = 0;
1425   use_mask = mask;
1426   for (p = string; *p != '\0'; p++)
1427     {
1428       c = (guchar) *p;
1429       if (!ACCEPTABLE (c))
1430 	unacceptable++;
1431     }
1432 
1433   result = g_malloc (p - string + unacceptable * 2 + 1);
1434 
1435   use_mask = mask;
1436   for (q = result, p = string; *p != '\0'; p++)
1437     {
1438       c = (guchar) *p;
1439 
1440       if (!ACCEPTABLE (c))
1441 	{
1442 	  *q++ = '%'; /* means hex coming */
1443 	  *q++ = hex[c >> 4];
1444 	  *q++ = hex[c & 15];
1445 	}
1446       else
1447 	*q++ = *p;
1448     }
1449 
1450   *q = '\0';
1451 
1452   return result;
1453 }
1454 
1455 
1456 static gchar *
g_escape_file_uri(const gchar * hostname,const gchar * pathname)1457 g_escape_file_uri (const gchar *hostname,
1458 		   const gchar *pathname)
1459 {
1460   char *escaped_hostname = NULL;
1461   char *escaped_path;
1462   char *res;
1463 
1464 #ifdef G_OS_WIN32
1465   char *p, *backslash;
1466 
1467   /* Turn backslashes into forward slashes. That's what Netscape
1468    * does, and they are actually more or less equivalent in Windows.
1469    */
1470 
1471   pathname = g_strdup (pathname);
1472   p = (char *) pathname;
1473 
1474   while ((backslash = strchr (p, '\\')) != NULL)
1475     {
1476       *backslash = '/';
1477       p = backslash + 1;
1478     }
1479 #endif
1480 
1481   if (hostname && *hostname != '\0')
1482     {
1483       escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST);
1484     }
1485 
1486   escaped_path = g_escape_uri_string (pathname, UNSAFE_PATH);
1487 
1488   res = g_strconcat ("file://",
1489 		     (escaped_hostname) ? escaped_hostname : "",
1490 		     (*escaped_path != '/') ? "/" : "",
1491 		     escaped_path,
1492 		     NULL);
1493 
1494 #ifdef G_OS_WIN32
1495   g_free ((char *) pathname);
1496 #endif
1497 
1498   g_free (escaped_hostname);
1499   g_free (escaped_path);
1500 
1501   return res;
1502 }
1503 
/* Decode the two hex digits at scanner[0] and scanner[1] into a byte
 * value. Returns -1 if either is not a hex digit; the second character is
 * not examined when the first one is already invalid. */
static int
unescape_character (const char *scanner)
{
  int hi;
  int lo;

  hi = g_ascii_xdigit_value (scanner[0]);
  lo = (hi >= 0) ? g_ascii_xdigit_value (scanner[1]) : -1;

  if (lo < 0)
    return -1;

  return hi * 16 + lo;
}
1520 
/*
 * Decode the %XX escapes in the first @len bytes of @escaped, or in the
 * whole nul-terminated string when @len is negative.
 *
 * Returns a newly allocated, nul-terminated string, or %NULL when @escaped
 * is %NULL or when decoding stops early because of: an escape cut off by
 * the end of the input, an escape with non-hex digits, an escape decoding
 * to NUL, an escaped ASCII byte while @ascii_must_not_be_escaped is set, or
 * an escape decoding to a byte listed in @illegal_escaped_characters.
 */
static gchar *
g_unescape_uri_string (const char *escaped,
		       int         len,
		       const char *illegal_escaped_characters,
		       gboolean    ascii_must_not_be_escaped)
{
  const gchar *in, *in_end;
  gchar *out, *result;
  int c;

  if (escaped == NULL)
    return NULL;

  if (len < 0)
    len = strlen (escaped);

  /* Decoding never lengthens the string, so len + 1 bytes always suffice */
  result = g_malloc (len + 1);

  out = result;
  for (in = escaped, in_end = escaped + len; in < in_end; in++)
    {
      c = *in;

      if (c == '%')
	{
	  /* catch partial escape sequences past the end of the substring */
	  if (in + 3 > in_end)
	    break;

	  c = unescape_character (in + 1);

	  /* catch bad escape sequences and NUL characters */
	  if (c <= 0)
	    break;

	  /* catch escaped ASCII */
	  if (ascii_must_not_be_escaped && c <= 0x7F)
	    break;

	  /* catch other illegal escaped characters */
	  if (strchr (illegal_escaped_characters, c) != NULL)
	    break;

	  /* skip the two hex digits; the loop increment skips the '%' */
	  in += 2;
	}

      *out++ = c;
    }

  g_assert (out - result <= len);
  *out = '\0';

  /* Every early break above leaves in short of in_end: report failure */
  if (in != in_end)
    {
      g_free (result);
      return NULL;
    }

  return result;
}
1581 
1582 static gboolean
is_asciialphanum(gunichar c)1583 is_asciialphanum (gunichar c)
1584 {
1585   return c <= 0x7F && g_ascii_isalnum (c);
1586 }
1587 
1588 static gboolean
is_asciialpha(gunichar c)1589 is_asciialpha (gunichar c)
1590 {
1591   return c <= 0x7F && g_ascii_isalpha (c);
1592 }
1593 
/* Check that @hostname is a sequence of labels separated by '.'.
 *
 * Each label must start with an ASCII letter or digit, may continue with
 * ASCII letters, digits and '-', and must not end with '-'. The final
 * label (optionally followed by one trailing '.') must start with an ASCII
 * letter. Any other character makes the hostname invalid.
 *
 * allows an empty string */
static gboolean
hostname_validate (const char *hostname)
{
  const char *p;
  gunichar c, first_char, last_char;

  p = hostname;
  if (*p == '\0')
    return TRUE;
  do
    {
      /* read in a label */
      c = g_utf8_get_char (p);
      p = g_utf8_next_char (p);
      if (!is_asciialphanum (c))
	return FALSE;
      first_char = c;
      /* consume the rest of the label; on exit c is the first character
       * after the label and last_char is the label's final character */
      do
	{
	  last_char = c;
	  c = g_utf8_get_char (p);
	  p = g_utf8_next_char (p);
	}
      while (is_asciialphanum (c) || c == '-');
      if (last_char == '-')
	return FALSE;

      /* if that was the last label, check that it was a toplabel */
      if (c == '\0' || (c == '.' && *p == '\0'))
	return is_asciialpha (first_char);
    }
  while (c == '.');
  /* the label was followed by something other than '.' or the end */
  return FALSE;
}
1629 
1630 /**
1631  * g_filename_from_uri:
1632  * @uri: a uri describing a filename (escaped, encoded in ASCII).
1633  * @hostname: (out) (optional) (nullable): Location to store hostname for the URI.
1634  *            If there is no hostname in the URI, %NULL will be
1635  *            stored in this location.
1636  * @error: location to store the error occurring, or %NULL to ignore
1637  *         errors. Any of the errors in #GConvertError may occur.
1638  *
1639  * Converts an escaped ASCII-encoded URI to a local filename in the
1640  * encoding used for filenames.
1641  *
1642  * Returns: (type filename): a newly-allocated string holding
1643  *               the resulting filename, or %NULL on an error.
1644  **/
1645 gchar *
g_filename_from_uri(const gchar * uri,gchar ** hostname,GError ** error)1646 g_filename_from_uri (const gchar *uri,
1647 		     gchar      **hostname,
1648 		     GError     **error)
1649 {
1650   const char *path_part;
1651   const char *host_part;
1652   char *unescaped_hostname;
1653   char *result;
1654   char *filename;
1655   int offs;
1656 #ifdef G_OS_WIN32
1657   char *p, *slash;
1658 #endif
1659 
1660   if (hostname)
1661     *hostname = NULL;
1662 
1663   if (!has_case_prefix (uri, "file:/"))
1664     {
1665       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1666 		   _("The URI “%s” is not an absolute URI using the “file” scheme"),
1667 		   uri);
1668       return NULL;
1669     }
1670 
1671   path_part = uri + strlen ("file:");
1672 
1673   if (strchr (path_part, '#') != NULL)
1674     {
1675       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1676 		   _("The local file URI “%s” may not include a “#”"),
1677 		   uri);
1678       return NULL;
1679     }
1680 
1681   if (has_case_prefix (path_part, "///"))
1682     path_part += 2;
1683   else if (has_case_prefix (path_part, "//"))
1684     {
1685       path_part += 2;
1686       host_part = path_part;
1687 
1688       path_part = strchr (path_part, '/');
1689 
1690       if (path_part == NULL)
1691 	{
1692 	  g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1693 		       _("The URI “%s” is invalid"),
1694 		       uri);
1695 	  return NULL;
1696 	}
1697 
1698       unescaped_hostname = g_unescape_uri_string (host_part, path_part - host_part, "", TRUE);
1699 
1700       if (unescaped_hostname == NULL ||
1701 	  !hostname_validate (unescaped_hostname))
1702 	{
1703 	  g_free (unescaped_hostname);
1704 	  g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1705 		       _("The hostname of the URI “%s” is invalid"),
1706 		       uri);
1707 	  return NULL;
1708 	}
1709 
1710       if (hostname)
1711 	*hostname = unescaped_hostname;
1712       else
1713 	g_free (unescaped_hostname);
1714     }
1715 
1716   filename = g_unescape_uri_string (path_part, -1, "/", FALSE);
1717 
1718   if (filename == NULL)
1719     {
1720       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1721 		   _("The URI “%s” contains invalidly escaped characters"),
1722 		   uri);
1723       return NULL;
1724     }
1725 
1726   offs = 0;
1727 #ifdef G_OS_WIN32
1728   /* Drop localhost */
1729   if (hostname && *hostname != NULL &&
1730       g_ascii_strcasecmp (*hostname, "localhost") == 0)
1731     {
1732       g_free (*hostname);
1733       *hostname = NULL;
1734     }
1735 
1736   /* Turn slashes into backslashes, because that's the canonical spelling */
1737   p = filename;
1738   while ((slash = strchr (p, '/')) != NULL)
1739     {
1740       *slash = '\\';
1741       p = slash + 1;
1742     }
1743 
1744   /* Windows URIs with a drive letter can be like "file://host/c:/foo"
1745    * or "file://host/c|/foo" (some Netscape versions). In those cases, start
1746    * the filename from the drive letter.
1747    */
1748   if (g_ascii_isalpha (filename[1]))
1749     {
1750       if (filename[2] == ':')
1751 	offs = 1;
1752       else if (filename[2] == '|')
1753 	{
1754 	  filename[2] = ':';
1755 	  offs = 1;
1756 	}
1757     }
1758 #endif
1759 
1760   result = g_strdup (filename + offs);
1761   g_free (filename);
1762 
1763   return result;
1764 }
1765 
1766 /**
1767  * g_filename_to_uri:
1768  * @filename: (type filename): an absolute filename specified in the GLib file
1769  *     name encoding, which is the on-disk file name bytes on Unix, and UTF-8
1770  *     on Windows
1771  * @hostname: (nullable): A UTF-8 encoded hostname, or %NULL for none.
1772  * @error: location to store the error occurring, or %NULL to ignore
1773  *         errors. Any of the errors in #GConvertError may occur.
1774  *
1775  * Converts an absolute filename to an escaped ASCII-encoded URI, with the path
1776  * component following Section 3.3. of RFC 2396.
1777  *
1778  * Returns: a newly-allocated string holding the resulting
1779  *               URI, or %NULL on an error.
1780  **/
1781 gchar *
g_filename_to_uri(const gchar * filename,const gchar * hostname,GError ** error)1782 g_filename_to_uri (const gchar *filename,
1783 		   const gchar *hostname,
1784 		   GError     **error)
1785 {
1786   char *escaped_uri;
1787 
1788   g_return_val_if_fail (filename != NULL, NULL);
1789 
1790   if (!g_path_is_absolute (filename))
1791     {
1792       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
1793 		   _("The pathname “%s” is not an absolute path"),
1794 		   filename);
1795       return NULL;
1796     }
1797 
1798   if (hostname &&
1799       !(g_utf8_validate (hostname, -1, NULL)
1800 	&& hostname_validate (hostname)))
1801     {
1802       g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1803                            _("Invalid hostname"));
1804       return NULL;
1805     }
1806 
1807 #ifdef G_OS_WIN32
1808   /* Don't use localhost unnecessarily */
1809   if (hostname && g_ascii_strcasecmp (hostname, "localhost") == 0)
1810     hostname = NULL;
1811 #endif
1812 
1813   escaped_uri = g_escape_file_uri (hostname, filename);
1814 
1815   return escaped_uri;
1816 }
1817 
1818 /**
1819  * g_uri_list_extract_uris:
1820  * @uri_list: an URI list
1821  *
1822  * Splits an URI list conforming to the text/uri-list
1823  * mime type defined in RFC 2483 into individual URIs,
1824  * discarding any comments. The URIs are not validated.
1825  *
1826  * Returns: (transfer full): a newly allocated %NULL-terminated list
1827  *   of strings holding the individual URIs. The array should be freed
1828  *   with g_strfreev().
1829  *
1830  * Since: 2.6
1831  */
1832 gchar **
g_uri_list_extract_uris(const gchar * uri_list)1833 g_uri_list_extract_uris (const gchar *uri_list)
1834 {
1835   GPtrArray *uris;
1836   const gchar *p, *q;
1837 
1838   uris = g_ptr_array_new ();
1839 
1840   p = uri_list;
1841 
1842   /* We don't actually try to validate the URI according to RFC
1843    * 2396, or even check for allowed characters - we just ignore
1844    * comments and trim whitespace off the ends.  We also
1845    * allow LF delimination as well as the specified CRLF.
1846    *
1847    * We do allow comments like specified in RFC 2483.
1848    */
1849   while (p)
1850     {
1851       if (*p != '#')
1852 	{
1853 	  while (g_ascii_isspace (*p))
1854 	    p++;
1855 
1856 	  q = p;
1857 	  while (*q && (*q != '\n') && (*q != '\r'))
1858 	    q++;
1859 
1860 	  if (q > p)
1861 	    {
1862 	      q--;
1863 	      while (q > p && g_ascii_isspace (*q))
1864 		q--;
1865 
1866 	      if (q > p)
1867                 g_ptr_array_add (uris, g_strndup (p, q - p + 1));
1868             }
1869         }
1870       p = strchr (p, '\n');
1871       if (p)
1872 	p++;
1873     }
1874 
1875   g_ptr_array_add (uris, NULL);
1876 
1877   return (gchar **) g_ptr_array_free (uris, FALSE);
1878 }
1879 
1880 /**
1881  * g_filename_display_basename:
1882  * @filename: (type filename): an absolute pathname in the
1883  *     GLib file name encoding
1884  *
1885  * Returns the display basename for the particular filename, guaranteed
1886  * to be valid UTF-8. The display name might not be identical to the filename,
1887  * for instance there might be problems converting it to UTF-8, and some files
1888  * can be translated in the display.
1889  *
1890  * If GLib cannot make sense of the encoding of @filename, as a last resort it
1891  * replaces unknown characters with U+FFFD, the Unicode replacement character.
1892  * You can search the result for the UTF-8 encoding of this character (which is
1893  * "\357\277\275" in octal notation) to find out if @filename was in an invalid
1894  * encoding.
1895  *
1896  * You must pass the whole absolute pathname to this functions so that
1897  * translation of well known locations can be done.
1898  *
1899  * This function is preferred over g_filename_display_name() if you know the
1900  * whole path, as it allows translation.
1901  *
1902  * Returns: a newly allocated string containing
1903  *   a rendition of the basename of the filename in valid UTF-8
1904  *
1905  * Since: 2.6
1906  **/
1907 gchar *
g_filename_display_basename(const gchar * filename)1908 g_filename_display_basename (const gchar *filename)
1909 {
1910   char *basename;
1911   char *display_name;
1912 
1913   g_return_val_if_fail (filename != NULL, NULL);
1914 
1915   basename = g_path_get_basename (filename);
1916   display_name = g_filename_display_name (basename);
1917   g_free (basename);
1918   return display_name;
1919 }
1920 
1921 /**
1922  * g_filename_display_name:
1923  * @filename: (type filename): a pathname hopefully in the
1924  *     GLib file name encoding
1925  *
1926  * Converts a filename into a valid UTF-8 string. The conversion is
1927  * not necessarily reversible, so you should keep the original around
1928  * and use the return value of this function only for display purposes.
1929  * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL
1930  * even if the filename actually isn't in the GLib file name encoding.
1931  *
1932  * If GLib cannot make sense of the encoding of @filename, as a last resort it
1933  * replaces unknown characters with U+FFFD, the Unicode replacement character.
1934  * You can search the result for the UTF-8 encoding of this character (which is
1935  * "\357\277\275" in octal notation) to find out if @filename was in an invalid
1936  * encoding.
1937  *
1938  * If you know the whole pathname of the file you should use
1939  * g_filename_display_basename(), since that allows location-based
1940  * translation of filenames.
1941  *
1942  * Returns: a newly allocated string containing
1943  *   a rendition of the filename in valid UTF-8
1944  *
1945  * Since: 2.6
1946  **/
1947 gchar *
g_filename_display_name(const gchar * filename)1948 g_filename_display_name (const gchar *filename)
1949 {
1950   gint i;
1951   const gchar **charsets;
1952   gchar *display_name = NULL;
1953   gboolean is_utf8;
1954 
1955   is_utf8 = g_get_filename_charsets (&charsets);
1956 
1957   if (is_utf8)
1958     {
1959       if (g_utf8_validate (filename, -1, NULL))
1960 	display_name = g_strdup (filename);
1961     }
1962 
1963   if (!display_name)
1964     {
1965       /* Try to convert from the filename charsets to UTF-8.
1966        * Skip the first charset if it is UTF-8.
1967        */
1968       for (i = is_utf8 ? 1 : 0; charsets[i]; i++)
1969 	{
1970 	  display_name = g_convert (filename, -1, "UTF-8", charsets[i],
1971 				    NULL, NULL, NULL);
1972 
1973 	  if (display_name)
1974 	    break;
1975 	}
1976     }
1977 
1978   /* if all conversions failed, we replace invalid UTF-8
1979    * by a question mark
1980    */
1981   if (!display_name)
1982     display_name = g_utf8_make_valid (filename, -1);
1983 
1984   return display_name;
1985 }
1986 
1987 #ifdef G_OS_WIN32
1988 
1989 /* Binary compatibility versions. Not for newly compiled code. */
1990 
1991 _GLIB_EXTERN gchar *g_filename_to_utf8_utf8   (const gchar  *opsysstring,
1992                                                gssize        len,
1993                                                gsize        *bytes_read,
1994                                                gsize        *bytes_written,
1995                                                GError      **error) G_GNUC_MALLOC;
1996 _GLIB_EXTERN gchar *g_filename_from_utf8_utf8 (const gchar  *utf8string,
1997                                                gssize        len,
1998                                                gsize        *bytes_read,
1999                                                gsize        *bytes_written,
2000                                                GError      **error) G_GNUC_MALLOC;
2001 _GLIB_EXTERN gchar *g_filename_from_uri_utf8  (const gchar  *uri,
2002                                                gchar       **hostname,
2003                                                GError      **error) G_GNUC_MALLOC;
2004 _GLIB_EXTERN gchar *g_filename_to_uri_utf8    (const gchar  *filename,
2005                                                const gchar  *hostname,
2006                                                GError      **error) G_GNUC_MALLOC;
2007 
2008 gchar *
g_filename_to_utf8_utf8(const gchar * opsysstring,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)2009 g_filename_to_utf8_utf8 (const gchar *opsysstring,
2010                          gssize       len,
2011                          gsize       *bytes_read,
2012                          gsize       *bytes_written,
2013                          GError     **error)
2014 {
2015   return g_filename_to_utf8 (opsysstring, len, bytes_read, bytes_written, error);
2016 }
2017 
2018 gchar *
g_filename_from_utf8_utf8(const gchar * utf8string,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)2019 g_filename_from_utf8_utf8 (const gchar *utf8string,
2020                            gssize       len,
2021                            gsize       *bytes_read,
2022                            gsize       *bytes_written,
2023                            GError     **error)
2024 {
2025   return g_filename_from_utf8 (utf8string, len, bytes_read, bytes_written, error);
2026 }
2027 
2028 gchar *
g_filename_from_uri_utf8(const gchar * uri,gchar ** hostname,GError ** error)2029 g_filename_from_uri_utf8 (const gchar *uri,
2030                           gchar      **hostname,
2031                           GError     **error)
2032 {
2033   return g_filename_from_uri (uri, hostname, error);
2034 }
2035 
2036 gchar *
g_filename_to_uri_utf8(const gchar * filename,const gchar * hostname,GError ** error)2037 g_filename_to_uri_utf8 (const gchar *filename,
2038                         const gchar *hostname,
2039                         GError     **error)
2040 {
2041   return g_filename_to_uri (filename, hostname, error);
2042 }
2043 
2044 #endif
2045