1 /* PSPP - a program for statistical analysis.
2    Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
16 
17 #include <config.h>
18 
19 #include "libpspp/i18n.h"
20 
21 #include <assert.h>
22 #include <errno.h>
23 #include <iconv.h>
24 #include <langinfo.h>
25 #include <locale.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <unicase.h>
30 #include <unigbrk.h>
31 
32 #include "libpspp/assertion.h"
33 #include "libpspp/compiler.h"
34 #include "libpspp/hmapx.h"
35 #include "libpspp/hash-functions.h"
36 #include "libpspp/pool.h"
37 #include "libpspp/str.h"
38 #include "libpspp/version.h"
39 
40 #include "gl/c-ctype.h"
41 #include "gl/c-strcase.h"
42 #include "gl/localcharset.h"
43 #include <gl/localename.h>
44 #include "gl/minmax.h"
45 #include "gl/xalloc.h"
46 #include "gl/relocatable.h"
47 #include "gl/xstrndup.h"
48 
49 #include "gettext.h"
50 #define _(msgid) gettext (msgid)
51 
/* A cached iconv conversion descriptor together with the names of the
   encodings it was opened for. */
struct converter
{
  char *tocode;          /* Name of the target encoding (owned copy). */
  char *fromcode;        /* Name of the source encoding (owned copy). */
  iconv_t conv;          /* Result of iconv_open (tocode, fromcode). */
  int null_char_width;   /* Bytes produced when an ASCII NUL is converted
                            to TOCODE. */
};

/* Encoding substituted when a caller passes a null encoding name. */
static char *default_encoding;

/* Cache of the converters made by create_iconv().  An entry may be a null
   pointer, which records a conversion that could not be set up. */
static struct hmapx map;
62 
63 /* A wrapper around iconv_open */
64 static struct converter *
create_iconv(const char * tocode,const char * fromcode)65 create_iconv (const char* tocode, const char* fromcode)
66 {
67   size_t hash;
68   struct hmapx_node *node;
69   struct converter *converter;
70   assert (fromcode);
71 
72   hash = hash_string (tocode, hash_string (fromcode, 0));
73   HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
74     {
75       if (!converter)
76 	return NULL;
77 
78       if (!strcmp (tocode, converter->tocode)
79 	  && !strcmp (fromcode, converter->fromcode))
80 	return converter;
81     }
82 
83   converter = xmalloc (sizeof *converter);
84   converter->tocode = xstrdup (tocode);
85   converter->fromcode = xstrdup (fromcode);
86   converter->conv = iconv_open (tocode, fromcode);
87   int error = converter->conv == (iconv_t) ~0 ? errno : 0;
88   /* I don't think it's safe to translate this string or to use messaging
89      as the converters have not yet been set up */
90   if (error && strcmp (tocode, fromcode))
91     {
92       fprintf (stderr,
93                "Warning: "
94                "cannot create a converter for `%s' to `%s': %s\n",
95                fromcode, tocode, strerror (error));
96 
97       free (converter->tocode);
98       free (converter->fromcode);
99       free (converter);
100 
101       hmapx_insert (&map, NULL, hash);
102       return NULL;
103     }
104 
105   /* Find out how many bytes there are in a null char in the target
106      encoding */
107   iconv_t bconv = iconv_open (tocode, "ASCII");
108   if (bconv != (iconv_t) -1)
109     {
110       ICONV_CONST  char *nullstr = strdup ("");
111       ICONV_CONST  char *outbuf = strdup ("XXXXXXXX");
112       ICONV_CONST  char *snullstr = nullstr;
113       ICONV_CONST  char *soutbuf = outbuf;
114 
115       size_t inbytes = 1;
116       const size_t bytes = 8;
117       size_t outbytes = bytes;
118       if (-1 != iconv (bconv, &nullstr, &inbytes, &outbuf, &outbytes))
119 	converter->null_char_width = bytes - outbytes;
120       free (snullstr);
121       free (soutbuf);
122       iconv_close (bconv);
123     }
124 
125   hmapx_insert (&map, converter, hash);
126 
127   return converter;
128 }
129 
130 
/* Recodes the single byte C from encoding FROM to encoding TO and returns
   the first byte of the converted text.

   New code should avoid this function; it survives only because some callers
   still depend on it. */
char
recode_byte (const char *to, const char *from, char c)
{
  char *converted = recode_string (to, from, &c, 1);
  char first = converted[0];

  free (converted);
  return first;
}
145 
/* Heap-allocating variant of recode_string_pool(): the conversion is done
   with a null pool, so the caller owns the returned string and must free
   it. */
char *
recode_string (const char *to, const char *from,
               const char *text, int length)
{
  struct pool *no_pool = NULL;

  return recode_string_pool (to, from, text, length, no_pool);
}
155 
/* Returns the length, in bytes, of the string that a similar recode_string()
   call would return, as measured by strlen().

   Returns 0 if recode_string() yields a null pointer, which it does when TEXT
   is null. */
size_t
recode_string_len (const char *to, const char *from,
                   const char *text, int length)
{
  char *s = recode_string (to, from, text, length);
  if (s == NULL)
    return 0;

  size_t len = strlen (s);
  free (s);
  return len;
}
167 
/* Uses CVTR to convert the INBYTES starting at IN into the OUTBYTES starting
   at OUT_, and appends a null terminator made of CVTR->null_char_width zero
   bytes to the output.

   Input that iconv() rejects is replaced by FALLBACKCHAR.  If FALLBACKCHAR
   is 0, rejected input is an error instead.

   Returns a nonnegative output length if successful.  On failure, returns a
   negated errno value: -E2BIG if the output buffer is too small, or -EINVAL
   or -EILSEQ if iconv() reported that error and FALLBACKCHAR is 0.

   NOTE(review): the length is computed as "out - 1 - out_" after the
   terminator has been written, i.e. as if the terminator were one byte
   wide. */
static ssize_t
try_recode (struct converter *cvtr, char fallbackchar,
            const char *in, size_t inbytes,
            char *out_, size_t outbytes)
{
  char *out = out_;
  int i, j;

  int null_bytes = cvtr->null_char_width;

  /* Put the converter into the initial shift state, in case there was any
     state information left over from its last usage. */
  iconv (cvtr->conv, NULL, 0, NULL, 0);

  /* Do two rounds of iconv() calls:

     - The first round does the bulk of the conversion using the
     caller-supplied input data.

     - The second round flushes any leftover output.  This has a real effect
     with input encodings that use combining diacritics, e.g. without the
     second round the last character tends to get dropped when converting
     from windows-1258 to other encodings.
  */
  for (i = 0; i < 2; i++)
    {
      /* Null input pointers in the second round ask iconv() to flush. */
      ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) &in;
      size_t *inbytesp = i ? NULL : &inbytes;

      while (iconv (cvtr->conv, inp, inbytesp, &out, &outbytes) == -1)
        switch (errno)
          {
          case EINVAL:
            /* iconv() documents EINVAL as an incomplete multibyte sequence
               at the end of the input.  Emit the fallback character plus
               the terminator and stop here. */
            if (outbytes < null_bytes + 1)
              return -E2BIG;
            if (!fallbackchar)
              return -EINVAL;
            *out++ = fallbackchar;
	    for (j = 0 ; j < null_bytes ; ++j)
	      *out++ = '\0';
            return out - 1 - out_;

          case EILSEQ:
            /* Invalid input: substitute the fallback character, skip one
               input byte (first round only), and retry the conversion. */
            if (outbytes == 0)
              return -E2BIG;
            if (!fallbackchar)
              return -EILSEQ;
            *out++ = fallbackchar;
            outbytes--;
            if (inp)
              {
                in++;
                inbytes--;
              }
            break;

          case E2BIG:
            return -E2BIG;

          default:
            /* should never happen */
            fprintf (stderr, "Character conversion error: %s\n",
                     strerror (errno));
            NOT_REACHED ();
            break;
          }
    }

  /* Make sure that the terminator still fits. */
  if (outbytes <= null_bytes - 1)
    return -E2BIG;

  for (i = 0 ; i < null_bytes ; ++i)
    *out++ = '\0';

  return out - 1 - out_;
}
249 
250 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
251    dynamically allocated string in TO-encoding.  Any characters which cannot be
252    converted will be represented by '?'.
253 
254    LENGTH should be the length of the string or -1, if null terminated.
255 
256    The returned string will be allocated on POOL.
257 
258    This function's behaviour differs from that of g_convert_with_fallback
259    provided by GLib.  The GLib function will fail (returns NULL) if any part of
260    the input string is not valid in the declared input encoding.  This function
261    however perseveres even in the presence of badly encoded input. */
262 char *
recode_string_pool(const char * to,const char * from,const char * text,int length,struct pool * pool)263 recode_string_pool (const char *to, const char *from,
264                     const char *text, int length, struct pool *pool)
265 {
266   struct substring out;
267 
268   if (text == NULL)
269     return NULL;
270 
271   if (length == -1)
272     length = strlen (text);
273 
274   out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
275   return out.string;
276 }
277 
/* Returns the name of the encoding to use for file names: "UTF-8" on
   Windows, the locale's character set everywhere else.

   This is meant to match the encoding that GLib's g_filename_from_uri() and
   g_filename_to_uri() use. */
static const char *
filename_encoding (void)
{
#if !(defined _WIN32 || defined __WIN32__)
  return locale_charset ();
#else
  return "UTF-8";
#endif
}
291 
/* Returns a newly allocated, null-terminated string made of the A_LEN bytes
   at A followed by the B_LEN bytes at B. */
static char *
xconcat2 (const char *a, size_t a_len,
          const char *b, size_t b_len)
{
  size_t total = a_len + b_len;
  char *joined = xmalloc (total + 1);
  char *second_half = joined + a_len;

  memcpy (joined, a, a_len);
  memcpy (second_half, b, b_len);
  joined[total] = '\0';
  return joined;
}
302 
/* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
   TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
   ENCODING.  If the re-encoded result is no more than MAX_LEN bytes long, then
   it returns HEAD_LEN.  Otherwise, it drops one character[*] from the end of
   HEAD and tries again, repeating as necessary until the concatenated result
   fits or until HEAD_LEN reaches 0.

   [*] Actually this function drops grapheme clusters instead of characters, so
   that, e.g. a Unicode character followed by a combining accent character
   is either completely included or completely excluded from HEAD_LEN.  See
   UAX #29 at http://unicode.org/reports/tr29/ for more information on
   grapheme clusters.

   A null ENCODING is treated as UTF-8.

   Sometimes this function has to actually construct the concatenated string to
   measure its length.  When this happens, it sets *RESULTP to that
   null-terminated string, allocated with malloc(), for the caller to use if it
   needs it.  Otherwise, it sets *RESULTP to NULL.

   Simple examples for encoding="UTF-8", max_len=6:

   head="abc",  tail="xyz"     => 3
   head="abcd", tail="xyz"     => 3 ("d" dropped).
   head="abc",  tail="uvwxyz"  => 0 ("abc" dropped).
   head="abc",  tail="tuvwxyz" => 0 ("abc" dropped).

   Examples for encoding="ISO-8859-1", max_len=6:

   head="éèä",  tail="xyz"     => 6
   (each letter in head is only 1 byte in ISO-8859-1 even though they
   each take 2 bytes in UTF-8 encoding)
*/
static size_t
utf8_encoding_concat__ (const char *head, size_t head_len,
                        const char *tail, size_t tail_len,
                        const char *encoding, size_t max_len,
                        char **resultp)
{
  *resultp = NULL;
  if (head_len == 0)
    return 0;
  else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
    {
      /* The target encoding is UTF-8 as well, so byte counts can be compared
         directly and nothing has to be re-encoded. */
      if (head_len + tail_len <= max_len)
        return head_len;
      else if (tail_len >= max_len)
        return 0;
      else
        {
          size_t copy_len;
          ucs4_t prev;
          size_t ofs;
          int mblen;

          /* Walk HEAD one character at a time for as long as the prefix plus
             TAIL still fits.  COPY_LEN remembers the last offset that was a
             grapheme cluster boundary. */
          copy_len = 0;
          for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
                                head_len);
               ofs <= max_len - tail_len;
               ofs += mblen)
            {
              ucs4_t next;

              mblen = u8_mbtouc (&next,
                                 CHAR_CAST (const uint8_t *, head + ofs),
                                 head_len - ofs);
              if (uc_is_grapheme_break (prev, next))
                copy_len = ofs;

              prev = next;
            }

          return copy_len;
        }
    }
  else
    {
      char *result;

      /* RESULT is the string to measure.  With an empty TAIL, HEAD itself is
         measured in place and nothing is allocated. */
      result = (tail_len > 0
                ? xconcat2 (head, head_len, tail, tail_len)
                : CONST_CAST (char *, head));
      if (recode_string_len (encoding, "UTF-8", result,
                             head_len + tail_len) <= max_len)
        {
          /* Everything fits.  Hand over the buffer only if one was
             allocated. */
          *resultp = result != head ? result : NULL;
          return head_len;
        }
      else
        {
          /* CORRECT_RESULT tracks whether RESULT currently holds the string
             that corresponds to COPY_LEN. */
          bool correct_result = false;
          size_t copy_len;
          ucs4_t prev;
          size_t ofs;
          int mblen;

          /* Try every grapheme cluster boundary in HEAD, re-encoding the
             prefix plus TAIL at each one.  The scan continues even after a
             candidate fails to fit. */
          copy_len = 0;
          for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
                                head_len);
               ofs <= head_len;
               ofs += mblen)
            {
              ucs4_t next;

              mblen = u8_mbtouc (&next,
                                 CHAR_CAST (const uint8_t *, head + ofs),
                                 head_len - ofs);
              if (uc_is_grapheme_break (prev, next))
                {
                  /* Rebuild the candidate in the scratch buffer. */
                  if (result != head)
                    {
                      memcpy (result, head, ofs);
                      memcpy (result + ofs, tail, tail_len);
                      result[ofs + tail_len] = '\0';
                    }

                  if (recode_string_len (encoding, "UTF-8", result,
                                         ofs + tail_len) <= max_len)
                    {
                      correct_result = true;
                      copy_len = ofs;
                    }
                  else
                    correct_result = false;
                }

              prev = next;
            }

          /* Only pass the buffer on if it matches the returned length;
             otherwise release it. */
          if (result != head)
            {
              if (correct_result)
                *resultp = result;
              else
                free (result);
            }

          return copy_len;
        }
    }
}
444 
/* Returns a null-terminated string, owned by the caller, that consists of a
   prefix of HEAD followed by all of TAIL.  HEAD, TAIL, and the result are all
   UTF-8.  The prefix holds as many leading characters[*] of HEAD as fit
   within MAX_LEN bytes if the result were re-encoded in ENCODING.  TAIL is
   never shortened, even when it alone exceeds MAX_LEN in ENCODING.

   [*] Strictly speaking, whole grapheme clusters are dropped rather than
   single characters, so that, e.g. a character followed by a combining accent
   is kept or dropped as a unit.  See UAX #29 at
   http://unicode.org/reports/tr29/ for more information on grapheme clusters.

   A null ENCODING is treated as UTF-8.

   Simple examples for encoding="UTF-8", max_len=6:

   head="abc",  tail="xyz"     => "abcxyz"
   head="abcd", tail="xyz"     => "abcxyz"
   head="abc",  tail="uvwxyz"  => "uvwxyz"
   head="abc",  tail="tuvwxyz" => "tuvwxyz"

   Examples for encoding="ISO-8859-1", max_len=6:

   head="éèä",  tail="xyz"    => "éèäxyz"
   (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
   each take 2 bytes in UTF-8 encoding)
*/
char *
utf8_encoding_concat (const char *head, const char *tail,
                      const char *encoding, size_t max_len)
{
  size_t head_len = strlen (head);
  size_t tail_len = strlen (tail);
  char *joined;

  size_t keep = utf8_encoding_concat__ (head, head_len, tail, tail_len,
                                        encoding, max_len, &joined);

  /* The helper only sometimes builds the string itself. */
  if (joined == NULL)
    joined = xconcat2 (head, keep, tail, tail_len);
  return joined;
}
487 
/* Returns the length, in bytes, of the string that utf8_encoding_concat()
   would return for the same arguments.  The implementation is often more
   efficient than building that string. */
size_t
utf8_encoding_concat_len (const char *head, const char *tail,
                          const char *encoding, size_t max_len)
{
  size_t head_len = strlen (head);
  size_t tail_len = strlen (tail);
  char *scratch;

  size_t keep = utf8_encoding_concat__ (head, head_len, tail, tail_len,
                                        encoding, max_len, &scratch);

  /* Only the length is wanted, so discard any string the helper built. */
  free (scratch);
  return keep + tail_len;
}
504 
/* Returns an allocated, null-terminated string, owned by the caller, that
   holds as many characters[*] from the start of S as would fit within MAX_LEN
   bytes if the result were re-encoded in ENCODING.  S and the result are both
   UTF-8.

   [*] Strictly speaking, whole grapheme clusters are dropped rather than
   single characters, so that, e.g. a character followed by a combining accent
   is kept or dropped as a unit.  See UAX #29 at
   http://unicode.org/reports/tr29/ for more information on grapheme clusters.

   A null ENCODING is treated as UTF-8.
*/
char *
utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
{
  /* Truncation is concatenation with an empty tail. */
  const char *no_tail = "";

  return utf8_encoding_concat (s, no_tail, encoding, max_len);
}
523 
/* Returns the length, in bytes, of the string that utf8_encoding_trunc()
   would return for the same arguments.  The implementation is often more
   efficient than building that string. */
size_t
utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
{
  const char *no_tail = "";

  return utf8_encoding_concat_len (s, no_tail, encoding, max_len);
}
532 
/* Converts the null-terminated UTF-8 string FILENAME to the filename
   encoding and returns the result.  The filename encoding is UTF-8 on
   Windows and is derived from the current locale elsewhere. */
char *
utf8_to_filename (const char *filename)
{
  const char *target = filename_encoding ();

  return recode_string (target, "UTF-8", filename, -1);
}
541 
/* Converts the null-terminated string FILENAME from the filename encoding to
   UTF-8 and returns the result.  The filename encoding is UTF-8 on Windows
   and is derived from the current locale elsewhere. */
char *
filename_to_utf8 (const char *filename)
{
  const char *source = filename_encoding ();

  return recode_string ("UTF-8", source, filename, -1);
}
550 
551 static int
recode_substring_pool__(const char * to,const char * from,struct substring text,char fallbackchar,struct pool * pool,struct substring * out)552 recode_substring_pool__ (const char *to, const char *from,
553                          struct substring text, char fallbackchar,
554                          struct pool *pool, struct substring *out)
555 {
556   size_t bufsize;
557   struct converter *conv;
558 
559   if (to == NULL)
560     to = default_encoding;
561 
562   if (from == NULL)
563     from = default_encoding;
564 
565   conv = create_iconv (to, from);
566 
567   if (NULL == conv)
568     {
569       if (fallbackchar)
570         {
571           out->string = pool_malloc (pool, text.length + 1);
572           out->length = text.length;
573           memcpy (out->string, text.string, text.length);
574           out->string[out->length] = '\0';
575           return 0;
576         }
577       else
578         return EPROTO;
579     }
580 
581   for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2)
582     {
583       char *output = pool_malloc (pool, bufsize);
584       ssize_t retval;
585 
586       retval = try_recode (conv, fallbackchar, text.string, text.length,
587                            output, bufsize);
588       if (retval >= 0)
589         {
590           *out = ss_buffer (output, retval);
591           return 0;
592         }
593       pool_free (pool, output);
594 
595       if (retval != -E2BIG)
596         return -retval;
597     }
598 
599   NOT_REACHED ();
600 }
601 
602 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
603    dynamically allocated string in TO-encoding.  Any characters which cannot be
604    converted will be represented by '?'.
605 
606    The returned string will be null-terminated and allocated on POOL with
607    pool_malloc().
608 
609    This function's behaviour differs from that of g_convert_with_fallback
610    provided by GLib.  The GLib function will fail (returns NULL) if any part of
611    the input string is not valid in the declared input encoding.  This function
612    however perseveres even in the presence of badly encoded input. */
613 struct substring
recode_substring_pool(const char * to,const char * from,struct substring text,struct pool * pool)614 recode_substring_pool (const char *to, const char *from,
615                        struct substring text, struct pool *pool)
616 {
617   struct substring out;
618 
619   recode_substring_pool__ (to, from, text, '?', pool, &out);
620   return out;
621 }
622 
623 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
624    dynamically allocated string in TO-encoding.  On success, returns 0, and the
625    converted null-terminated string, allocated from POOL with pool_malloc(), is
626    stored in *OUT.  On failure, returns a positive errno value.
627 
628    The function fails with an error if any part of the input string is not
629    valid in the declared input encoding. */
630 int
recode_pedantically(const char * to,const char * from,struct substring text,struct pool * pool,struct substring * out)631 recode_pedantically (const char *to, const char *from,
632                      struct substring text, struct pool *pool,
633                      struct substring *out)
634 {
635   int error;
636 
637   error = recode_substring_pool__ (to, from, text, 0, pool, out);
638   if (error)
639     *out = ss_empty ();
640   return error;
641 }
642 
643 void
i18n_init(void)644 i18n_init (void)
645 {
646   setlocale (LC_ALL, "");
647   bindtextdomain (PACKAGE, relocate(locale_dir));
648   textdomain (PACKAGE);
649 
650   assert (default_encoding == NULL);
651   default_encoding = xstrdup (locale_charset ());
652 
653   hmapx_init (&map);
654 }
655 
656 const char *
get_default_encoding(void)657 get_default_encoding (void)
658 {
659   return default_encoding;
660 }
661 
662 void
set_default_encoding(const char * enc)663 set_default_encoding (const char *enc)
664 {
665   free (default_encoding);
666   default_encoding = xstrdup (enc);
667 }
668 
/* Returns the language part of the current LC_MESSAGES locale name, i.e.
   everything before the first underscore, as a newly allocated string.
   Returns NULL if the locale is "C". */
char *
get_language (void)
{
  const char *locale = gl_locale_name (LC_MESSAGES, "LC_MESSAGES");
  if (strcmp (locale, "C") == 0)
    return NULL;

  char *language = xstrdup (locale);
  char *underscore = strchr (language, '_');
  if (underscore != NULL)
    *underscore = '\0';
  return language;
}
683 
684 
685 /* Attempts to set the encoding from a locale name
686    returns true if successful.
687    This function does not (should not!) alter the current locale.
688 */
689 bool
set_encoding_from_locale(const char * loc)690 set_encoding_from_locale (const char *loc)
691 {
692   bool ok = true;
693   char *c_encoding;
694   char *loc_encoding;
695   char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
696 
697   setlocale (LC_CTYPE, "C");
698   c_encoding = xstrdup (locale_charset ());
699 
700   setlocale (LC_CTYPE, loc);
701   loc_encoding = xstrdup (locale_charset ());
702 
703 
704   if (0 == strcmp (loc_encoding, c_encoding))
705     {
706       ok = false;
707     }
708 
709   setlocale (LC_CTYPE, tmp);
710 
711   free (tmp);
712 
713   if (ok)
714     {
715       free (default_encoding);
716       default_encoding = loc_encoding;
717     }
718   else
719     free (loc_encoding);
720 
721   free (c_encoding);
722 
723   return ok;
724 }
725 
726 void
i18n_done(void)727 i18n_done (void)
728 {
729   struct hmapx_node *node;
730   struct converter *cvtr;
731 
732   HMAPX_FOR_EACH (cvtr, node, &map)
733     {
734       if (cvtr == NULL)
735 	continue;
736       free (cvtr->tocode);
737       free (cvtr->fromcode);
738       if (cvtr->conv != (iconv_t) -1)
739         iconv_close (cvtr->conv);
740       free (cvtr);
741     }
742 
743   hmapx_destroy (&map);
744 
745   free (default_encoding);
746   default_encoding = NULL;
747 }
748 
749 
750 
751 bool
valid_encoding(const char * enc)752 valid_encoding (const char *enc)
753 {
754   iconv_t conv = iconv_open (UTF8, enc);
755 
756   if (conv == (iconv_t) -1)
757     return false;
758 
759   iconv_close (conv);
760 
761   return true;
762 }
763 
764 
/* Returns the system locale's idea of the decimal separator character. */
char
get_system_decimal (void)
{
#if HAVE_NL_LANGINFO
  return nl_langinfo (RADIXCHAR)[0];
#else
  /* Without nl_langinfo, format a known value and pick out the character
     that follows its leading digit. */
  char formatted[10];

  snprintf (formatted, sizeof formatted, "%f", 2.5);
  return formatted[1];
#endif
}
784 
785 const char *
uc_name(ucs4_t uc,char buffer[16])786 uc_name (ucs4_t uc, char buffer[16])
787 {
788   if (uc >= 0x20 && uc < 0x7f)
789     snprintf (buffer, 16, "`%c'", uc);
790   else
791     snprintf (buffer, 16, "U+%04X", uc);
792   return buffer;
793 }
794 
795 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
796 
797 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
798    with lowercase and uppercase letters treated as equal, starting from
799    BASIS. */
800 unsigned int
utf8_hash_case_bytes(const char * s,size_t n,unsigned int basis)801 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
802 {
803   uint8_t folded_buf[2048];
804   size_t folded_len = sizeof folded_buf;
805   uint8_t *folded_s;
806   unsigned int hash;
807 
808   folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
809                           NULL, UNINORM_NFKD, folded_buf, &folded_len);
810   if (folded_s != NULL)
811     {
812       hash = hash_bytes (folded_s, folded_len, basis);
813       if (folded_s != folded_buf)
814         free (folded_s);
815     }
816   else
817     {
818       if (errno == ENOMEM)
819         xalloc_die ();
820       hash = hash_bytes (s, n, basis);
821     }
822 
823   return hash;
824 }
825 
/* Hashes the null-terminated UTF-8 string S, starting from BASIS, in such a
   way that uppercase and lowercase letters hash alike. */
unsigned int
utf8_hash_case_string (const char *s, unsigned int basis)
{
  size_t n = strlen (s);

  return utf8_hash_case_bytes (s, n, basis);
}
833 
/* Case-insensitive comparison of the null-terminated UTF-8 strings A and B.
   The result is negative if A < B, zero if A == B, positive if A > B. */
int
utf8_strcasecmp (const char *a, const char *b)
{
  size_t an = strlen (a);
  size_t bn = strlen (b);

  return utf8_strncasecmp (a, an, b, bn);
}
841 
842 /* Compares UTF-8 strings A (with length AN) and B (with length BN)
843    case-insensitively.
844    Returns a negative value if A < B, zero if A == B, positive if A > B. */
845 int
utf8_strncasecmp(const char * a,size_t an,const char * b,size_t bn)846 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
847 {
848   int result;
849 
850   if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
851                   CHAR_CAST (const uint8_t *, b), bn,
852                   NULL, UNINORM_NFKD, &result))
853     {
854       if (errno == ENOMEM)
855         xalloc_die ();
856 
857       result = memcmp (a, b, MIN (an, bn));
858       if (result == 0)
859         result = an < bn ? -1 : an > bn;
860     }
861 
862   return result;
863 }
864 
/* Returns true if each of the LEN bytes at S is a digit according to
   c_isdigit(), and therefore also when LEN is 0. */
static bool
is_all_digits (const uint8_t *s, size_t len)
{
  size_t i = 0;

  while (i < len && c_isdigit (s[i]))
    i++;
  return i == len;
}
873 
874 /* Compares UTF-8 strings A and B case-insensitively.  If the strings end in a
875    number, then they are compared numerically.  Returns a negative value if A <
876    B, zero if A == B, positive if A > B. */
877 int
utf8_strverscasecmp(const char * a,const char * b)878 utf8_strverscasecmp (const char *a, const char *b)
879 {
880   /* Normalize A. */
881   uint8_t a_stub[64];
882   size_t a_len = sizeof a_stub;
883   uint8_t *a_norm = u8_casefold (CHAR_CAST (uint8_t *, a), strlen (a), NULL,
884                                  UNINORM_NFKD, a_stub, &a_len);
885 
886   /* Normalize B. */
887   uint8_t b_stub[64];
888   size_t b_len = sizeof b_stub;
889   uint8_t *b_norm = u8_casefold (CHAR_CAST (uint8_t *, b), strlen (b), NULL,
890                                  UNINORM_NFKD, b_stub, &b_len);
891 
892   int result;
893   if (!a_norm || !b_norm)
894     {
895       result = strcmp (a, b);
896       goto exit;
897     }
898 
899   size_t len = MIN (a_len, b_len);
900   for (size_t i = 0; i < len; i++)
901     if (a_norm[i] != b_norm[i])
902       {
903         /* If both strings end in digits, compare them numerically. */
904         if (is_all_digits (&a_norm[i], a_len - i)
905             && is_all_digits (&b_norm[i], b_len - i))
906           {
907             /* Start by stripping leading zeros, since those don't matter for
908                numerical comparison. */
909             size_t ap, bp;
910             for (ap = i; ap < a_len; ap++)
911               if (a_norm[ap] != '0')
912                 break;
913             for (bp = i; bp < b_len; bp++)
914               if (b_norm[bp] != '0')
915                 break;
916 
917             /* The number with more digits, if there is one, is larger. */
918             size_t a_digits = a_len - ap;
919             size_t b_digits = b_len - bp;
920             if (a_digits != b_digits)
921               result = a_digits > b_digits ? 1 : -1;
922             else
923               result = memcmp (&a_norm[ap], &b_norm[bp], a_digits);
924           }
925         else
926           result = a_norm[i] > b_norm[i] ? 1 : -1;
927         goto exit;
928       }
929   result = a_len < b_len ? -1 : a_len > b_len;
930 
931 exit:
932   if (a_norm != a_stub)
933     free (a_norm);
934   if (b_norm != b_stub)
935     free (b_norm);
936   return result;
937 }
938 
939 static char *
utf8_casemap(const char * s,uint8_t * (* f)(const uint8_t *,size_t,const char *,uninorm_t,uint8_t *,size_t *))940 utf8_casemap (const char *s,
941               uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
942                              uint8_t *, size_t *))
943 {
944   char *result;
945   size_t size;
946 
947   result = CHAR_CAST (char *,
948                       f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
949                          NULL, NULL, NULL, &size));
950   if (result == NULL)
951     {
952       if (errno == ENOMEM)
953         xalloc_die ();
954 
955       result = xstrdup (s);
956     }
957   return result;
958 }
959 
960 char *
utf8_to_upper(const char * s)961 utf8_to_upper (const char *s)
962 {
963   return utf8_casemap (s, u8_toupper);
964 }
965 
966 char *
utf8_to_lower(const char * s)967 utf8_to_lower (const char *s)
968 {
969   return utf8_casemap (s, u8_tolower);
970 }
971 
972 char *
utf8_to_title(const char * s)973 utf8_to_title (const char *s)
974 {
975   return utf8_casemap (s, u8_totitle);
976 }
977 
978 bool
get_encoding_info(struct encoding_info * e,const char * name)979 get_encoding_info (struct encoding_info *e, const char *name)
980 {
981   const struct substring in = SS_LITERAL_INITIALIZER (
982 						      "\t\n\v\f\r "
983 						      "!\"#$%&'()*+,-./0123456789:;<=>?@"
984 						      "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
985 						      "abcdefghijklmnopqrstuvwxyz{|}~");
986 
987   struct substring out, cr, lf, space;
988   bool ok;
989 
990   memset (e, 0, sizeof *e);
991 
992   cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
993   lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
994   space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
995   ok = (cr.length >= 1
996         && cr.length <= MAX_UNIT
997         && cr.length == lf.length
998         && cr.length == space.length);
999   if (!ok)
1000     {
1001       fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
1002       ss_dealloc (&cr);
1003       ss_dealloc (&lf);
1004       ss_dealloc (&space);
1005       ss_alloc_substring (&cr, ss_cstr ("\r"));
1006       ss_alloc_substring (&lf, ss_cstr ("\n"));
1007       ss_alloc_substring (&space, ss_cstr (" "));
1008     }
1009 
1010   e->unit = cr.length;
1011   memcpy (e->cr, cr.string, e->unit);
1012   memcpy (e->lf, lf.string, e->unit);
1013   memcpy (e->space, space.string, e->unit);
1014 
1015   ss_dealloc (&cr);
1016   ss_dealloc (&lf);
1017   ss_dealloc (&space);
1018 
1019   out = recode_substring_pool ("UTF-8", name, in, NULL);
1020   e->is_ascii_compatible = ss_equals (in, out);
1021   ss_dealloc (&out);
1022 
1023   if (!e->is_ascii_compatible && e->unit == 1)
1024     {
1025       out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
1026       e->is_ebcdic_compatible = (out.length == 1
1027                                  && (uint8_t) out.string[0] == 0xc1);
1028       ss_dealloc (&out);
1029     }
1030   else
1031     e->is_ebcdic_compatible = false;
1032 
1033   return ok;
1034 }
1035 
1036 bool
is_encoding_ascii_compatible(const char * encoding)1037 is_encoding_ascii_compatible (const char *encoding)
1038 {
1039   struct encoding_info e;
1040 
1041   get_encoding_info (&e, encoding);
1042   return e.is_ascii_compatible;
1043 }
1044 
1045 bool
is_encoding_ebcdic_compatible(const char * encoding)1046 is_encoding_ebcdic_compatible (const char *encoding)
1047 {
1048   struct encoding_info e;
1049 
1050   get_encoding_info (&e, encoding);
1051   return e.is_ebcdic_compatible;
1052 }
1053 
/* Reports whether ENCODING can be converted in both directions: returns
   true only if create_iconv() yields a converter from ENCODING to UTF-8 and
   also one from UTF-8 to ENCODING, otherwise false. */
bool
is_encoding_supported (const char *encoding)
{
  /* The reverse direction is only attempted if this one succeeds. */
  if (create_iconv ("UTF-8", encoding) == NULL)
    return false;

  return create_iconv (encoding, "UTF-8") != NULL;
}
1062 
/* Returns true if E names the UTF-8 encoding, that is, if E is "UTF8" or
   "UTF-8" with the letters in any mixture of upper and lower case, and
   false otherwise.

   XXX Possibly we should test not E as a string but its properties via
   iconv. */
bool
is_encoding_utf8 (const char *e)
{
  static const char lower[] = "utf";
  static const char upper[] = "UTF";

  /* Match the three-letter prefix, stopping at the first mismatch so that
     nothing past E's null terminator is ever read. */
  for (size_t i = 0; i < 3; i++)
    if (e[i] != lower[i] && e[i] != upper[i])
      return false;

  /* Allow one optional hyphen, then require exactly "8". */
  const char *rest = e + 3;
  if (*rest == '-')
    rest++;
  return rest[0] == '8' && rest[1] == '\0';
}
1076 
/* Array of encoding categories, grown and filled in by add_category(). */
static struct encoding_category *categories;

/* Number of elements of `categories' that are in use. */
static int n_categories;
1079 
1080 static void SENTINEL (0)
add_category(size_t * allocated_categories,const char * category,...)1081   add_category (size_t *allocated_categories, const char *category, ...)
1082 {
1083   struct encoding_category *c;
1084   const char *encodings[16];
1085   va_list args;
1086   int i, n;
1087 
1088   /* Count encoding arguments. */
1089   va_start (args, category);
1090   n = 0;
1091   while ((encodings[n] = va_arg (args, const char *)) != NULL)
1092     {
1093       const char *encoding = encodings[n];
1094       if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
1095         n++;
1096     }
1097   assert (n < sizeof encodings / sizeof *encodings);
1098   va_end (args);
1099 
1100   if (n == 0)
1101     return;
1102 
1103   if (n_categories >= *allocated_categories)
1104     categories = x2nrealloc (categories,
1105                              allocated_categories, sizeof *categories);
1106 
1107   c = &categories[n_categories++];
1108   c->category = category;
1109   c->encodings = xmalloc (n * sizeof *c->encodings);
1110   for (i = 0; i < n; i++)
1111     c->encodings[i] = encodings[i];
1112   c->n_encodings = n;
1113 }
1114 
/* Builds the file-level list of encoding categories by calling
   add_category() once per category.  Only the first call does any work;
   later calls return immediately.

   Category names other than "Unicode" are passed through gettext for
   translation.  Categories are added in the order listed here, and
   add_category() drops encodings that are not supported, as well as
   categories left with no encodings. */
static void
init_encoding_categories (void)
{
  static bool inited;
  size_t alloc;

  /* Run the initialization only once. */
  if (inited)
    return;
  inited = true;

  /* Allocated size of the categories array, maintained by add_category(). */
  alloc = 0;
  add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
                "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
  add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
                NULL_SENTINEL);
  add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
  add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
                "Windows-1257", NULL_SENTINEL);
  add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
  add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
                "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
  add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
                "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
  add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
                "EUC-TW", NULL_SENTINEL);
  add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
  add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
                "KOI8-R", "MacCyrillic", NULL_SENTINEL);
  add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
  add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
                NULL_SENTINEL);
  add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
  add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
  add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
  add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
  add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
                NULL_SENTINEL);
  add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
  /* NOTE(review): "MacDevangari" may be a misspelling of "MacDevanagari";
     confirm against the encoding names that iconv accepts before changing
     it. */
  add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
  add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
  add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
                NULL_SENTINEL);
  add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
                NULL_SENTINEL);
  add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
  add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
                NULL_SENTINEL);
  add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
  add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
                NULL_SENTINEL);
  add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
                NULL_SENTINEL);
  add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
                "Windows-1258", NULL_SENTINEL);
  add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
                "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
}
1172 
1173 /* Returns an array of "struct encoding_category" that contains only the
1174    categories and encodings that the system supports. */
1175 struct encoding_category *
get_encoding_categories(void)1176 get_encoding_categories (void)
1177 {
1178   init_encoding_categories ();
1179   return categories;
1180 }
1181 
1182 /* Returns the number of elements in the array returned by
1183    get_encoding_categories().  */
1184 size_t
get_n_encoding_categories(void)1185 get_n_encoding_categories (void)
1186 {
1187   init_encoding_categories ();
1188   return n_categories;
1189 }
1190