1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
16
17 #include <config.h>
18
19 #include "libpspp/i18n.h"
20
21 #include <assert.h>
22 #include <errno.h>
23 #include <iconv.h>
24 #include <langinfo.h>
25 #include <locale.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <unicase.h>
30 #include <unigbrk.h>
31
32 #include "libpspp/assertion.h"
33 #include "libpspp/compiler.h"
34 #include "libpspp/hmapx.h"
35 #include "libpspp/hash-functions.h"
36 #include "libpspp/pool.h"
37 #include "libpspp/str.h"
38 #include "libpspp/version.h"
39
40 #include "gl/c-ctype.h"
41 #include "gl/c-strcase.h"
42 #include "gl/localcharset.h"
43 #include <gl/localename.h>
44 #include "gl/minmax.h"
45 #include "gl/xalloc.h"
46 #include "gl/relocatable.h"
47 #include "gl/xstrndup.h"
48
49 #include "gettext.h"
50 #define _(msgid) gettext (msgid)
51
/* A cached iconv conversion descriptor, together with the pair of encoding
   names it was opened for. */
struct converter
  {
    char *tocode;           /* Name of the target encoding (owned). */
    char *fromcode;         /* Name of the source encoding (owned). */
    iconv_t conv;           /* Result of iconv_open (tocode, fromcode). */
    int null_char_width;    /* Bytes in a null character in TOCODE. */
  };
59
60 static char *default_encoding;
61 static struct hmapx map;
62
63 /* A wrapper around iconv_open */
64 static struct converter *
create_iconv(const char * tocode,const char * fromcode)65 create_iconv (const char* tocode, const char* fromcode)
66 {
67 size_t hash;
68 struct hmapx_node *node;
69 struct converter *converter;
70 assert (fromcode);
71
72 hash = hash_string (tocode, hash_string (fromcode, 0));
73 HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
74 {
75 if (!converter)
76 return NULL;
77
78 if (!strcmp (tocode, converter->tocode)
79 && !strcmp (fromcode, converter->fromcode))
80 return converter;
81 }
82
83 converter = xmalloc (sizeof *converter);
84 converter->tocode = xstrdup (tocode);
85 converter->fromcode = xstrdup (fromcode);
86 converter->conv = iconv_open (tocode, fromcode);
87 int error = converter->conv == (iconv_t) ~0 ? errno : 0;
88 /* I don't think it's safe to translate this string or to use messaging
89 as the converters have not yet been set up */
90 if (error && strcmp (tocode, fromcode))
91 {
92 fprintf (stderr,
93 "Warning: "
94 "cannot create a converter for `%s' to `%s': %s\n",
95 fromcode, tocode, strerror (error));
96
97 free (converter->tocode);
98 free (converter->fromcode);
99 free (converter);
100
101 hmapx_insert (&map, NULL, hash);
102 return NULL;
103 }
104
105 /* Find out how many bytes there are in a null char in the target
106 encoding */
107 iconv_t bconv = iconv_open (tocode, "ASCII");
108 if (bconv != (iconv_t) -1)
109 {
110 ICONV_CONST char *nullstr = strdup ("");
111 ICONV_CONST char *outbuf = strdup ("XXXXXXXX");
112 ICONV_CONST char *snullstr = nullstr;
113 ICONV_CONST char *soutbuf = outbuf;
114
115 size_t inbytes = 1;
116 const size_t bytes = 8;
117 size_t outbytes = bytes;
118 if (-1 != iconv (bconv, &nullstr, &inbytes, &outbuf, &outbytes))
119 converter->null_char_width = bytes - outbytes;
120 free (snullstr);
121 free (soutbuf);
122 iconv_close (bconv);
123 }
124
125 hmapx_insert (&map, converter, hash);
126
127 return converter;
128 }
129
130
/* Recodes the single byte C from encoding FROM into encoding TO and returns
   only the first byte of the converted text.

   New code should probably avoid this function, but some callers still rely
   on it. */
char
recode_byte (const char *to, const char *from, char c)
{
  char *recoded = recode_string (to, from, &c, 1);
  char first = recoded[0];

  free (recoded);
  return first;
}
145
/* Heap-allocating variant of recode_string_pool(): the arguments have the
   same meaning, but the result is not attached to any pool, so the caller is
   responsible for freeing it. */
char *
recode_string (const char *to, const char *from,
               const char *text, int length)
{
  struct pool *no_pool = NULL;

  return recode_string_pool (to, from, text, length, no_pool);
}
155
/* Returns the length, in bytes, of the string that a similar recode_string()
   call would return, measured with strlen().

   Returns 0 if TEXT is null, in which case recode_string() would return a
   null pointer. */
size_t
recode_string_len (const char *to, const char *from,
                   const char *text, int length)
{
  char *s = recode_string (to, from, text, length);
  if (s == NULL)
    return 0;

  size_t len = strlen (s);
  free (s);
  return len;
}
167
168 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
169 at OP, and appends a null terminator to the output.
170
171 Returns the output length if successful, -1 if the output buffer is too
172 small. */
173 static ssize_t
try_recode(struct converter * cvtr,char fallbackchar,const char * in,size_t inbytes,char * out_,size_t outbytes)174 try_recode (struct converter *cvtr, char fallbackchar,
175 const char *in, size_t inbytes,
176 char *out_, size_t outbytes)
177 {
178 char *out = out_;
179 int i, j;
180
181 int null_bytes = cvtr->null_char_width;
182
183 /* Put the converter into the initial shift state, in case there was any
184 state information left over from its last usage. */
185 iconv (cvtr->conv, NULL, 0, NULL, 0);
186
187 /* Do two rounds of iconv() calls:
188
189 - The first round does the bulk of the conversion using the
190 caller-supplied input data..
191
192 - The second round flushes any leftover output. This has a real effect
193 with input encodings that use combining diacritics, e.g. without the
194 second round the last character tends to gets dropped when converting
195 from windows-1258 to other encodings.
196 */
197 for (i = 0; i < 2; i++)
198 {
199 ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) ∈
200 size_t *inbytesp = i ? NULL : &inbytes;
201
202 while (iconv (cvtr->conv, inp, inbytesp, &out, &outbytes) == -1)
203 switch (errno)
204 {
205 case EINVAL:
206 if (outbytes < null_bytes + 1)
207 return -E2BIG;
208 if (!fallbackchar)
209 return -EINVAL;
210 *out++ = fallbackchar;
211 for (j = 0 ; j < null_bytes ; ++j)
212 *out++ = '\0';
213 return out - 1 - out_;
214
215 case EILSEQ:
216 if (outbytes == 0)
217 return -E2BIG;
218 if (!fallbackchar)
219 return -EILSEQ;
220 *out++ = fallbackchar;
221 outbytes--;
222 if (inp)
223 {
224 in++;
225 inbytes--;
226 }
227 break;
228
229 case E2BIG:
230 return -E2BIG;
231
232 default:
233 /* should never happen */
234 fprintf (stderr, "Character conversion error: %s\n",
235 strerror (errno));
236 NOT_REACHED ();
237 break;
238 }
239 }
240
241 if (outbytes <= null_bytes - 1)
242 return -E2BIG;
243
244 for (i = 0 ; i < null_bytes ; ++i)
245 *out++ = '\0';
246
247 return out - 1 - out_;
248 }
249
250 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
251 dynamically allocated string in TO-encoding. Any characters which cannot be
252 converted will be represented by '?'.
253
254 LENGTH should be the length of the string or -1, if null terminated.
255
256 The returned string will be allocated on POOL.
257
258 This function's behaviour differs from that of g_convert_with_fallback
259 provided by GLib. The GLib function will fail (returns NULL) if any part of
260 the input string is not valid in the declared input encoding. This function
261 however perseveres even in the presence of badly encoded input. */
262 char *
recode_string_pool(const char * to,const char * from,const char * text,int length,struct pool * pool)263 recode_string_pool (const char *to, const char *from,
264 const char *text, int length, struct pool *pool)
265 {
266 struct substring out;
267
268 if (text == NULL)
269 return NULL;
270
271 if (length == -1)
272 length = strlen (text);
273
274 out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
275 return out.string;
276 }
277
/* Returns the name of the encoding in which file names are expressed: UTF-8
   on Windows, the locale's character set everywhere else.

   This is meant to be the same encoding used by g_filename_from_uri() and
   g_filename_to_uri() in GLib. */
static const char *
filename_encoding (void)
{
  const char *encoding;

#if defined _WIN32 || defined __WIN32__
  encoding = "UTF-8";
#else
  encoding = locale_charset ();
#endif

  return encoding;
}
291
/* Returns a newly allocated, null-terminated string that consists of the
   A_LEN bytes at A followed by the B_LEN bytes at B. */
static char *
xconcat2 (const char *a, size_t a_len,
          const char *b, size_t b_len)
{
  size_t total = a_len + b_len;
  char *joined = xmalloc (total + 1);
  char *second_part = joined + a_len;

  memcpy (joined, a, a_len);
  memcpy (second_part, b, b_len);
  joined[total] = '\0';

  return joined;
}
302
303 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
304 TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
305 ENCODING. If the re-encoded result is no more than MAX_LEN bytes long, then
306 it returns HEAD_LEN. Otherwise, it drops one character[*] from the end of
307 HEAD and tries again, repeating as necessary until the concatenated result
308 fits or until HEAD_LEN reaches 0.
309
310 [*] Actually this function drops grapheme clusters instead of characters, so
311 that, e.g. a Unicode character followed by a combining accent character
312 is either completely included or completely excluded from HEAD_LEN. See
313 UAX #29 at http://unicode.org/reports/tr29/ for more information on
314 grapheme clusters.
315
316 A null ENCODING is treated as UTF-8.
317
318 Sometimes this function has to actually construct the concatenated string to
319 measure its length. When this happens, it sets *RESULTP to that
320 null-terminated string, allocated with malloc(), for the caller to use if it
321 needs it. Otherwise, it sets *RESULTP to NULL.
322
323 Simple examples for encoding="UTF-8", max_len=6:
324
325 head="abc", tail="xyz" => 3
326 head="abcd", tail="xyz" => 3 ("d" dropped).
327 head="abc", tail="uvwxyz" => 0 ("abc" dropped).
328 head="abc", tail="tuvwxyz" => 0 ("abc" dropped).
329
330 Examples for encoding="ISO-8859-1", max_len=6:
331
332 head="éèä", tail="xyz" => 6
333 (each letter in head is only 1 byte in ISO-8859-1 even though they
334 each take 2 bytes in UTF-8 encoding)
335 */
336 static size_t
utf8_encoding_concat__(const char * head,size_t head_len,const char * tail,size_t tail_len,const char * encoding,size_t max_len,char ** resultp)337 utf8_encoding_concat__ (const char *head, size_t head_len,
338 const char *tail, size_t tail_len,
339 const char *encoding, size_t max_len,
340 char **resultp)
341 {
342 *resultp = NULL;
343 if (head_len == 0)
344 return 0;
345 else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
346 {
347 if (head_len + tail_len <= max_len)
348 return head_len;
349 else if (tail_len >= max_len)
350 return 0;
351 else
352 {
353 size_t copy_len;
354 ucs4_t prev;
355 size_t ofs;
356 int mblen;
357
358 copy_len = 0;
359 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
360 head_len);
361 ofs <= max_len - tail_len;
362 ofs += mblen)
363 {
364 ucs4_t next;
365
366 mblen = u8_mbtouc (&next,
367 CHAR_CAST (const uint8_t *, head + ofs),
368 head_len - ofs);
369 if (uc_is_grapheme_break (prev, next))
370 copy_len = ofs;
371
372 prev = next;
373 }
374
375 return copy_len;
376 }
377 }
378 else
379 {
380 char *result;
381
382 result = (tail_len > 0
383 ? xconcat2 (head, head_len, tail, tail_len)
384 : CONST_CAST (char *, head));
385 if (recode_string_len (encoding, "UTF-8", result,
386 head_len + tail_len) <= max_len)
387 {
388 *resultp = result != head ? result : NULL;
389 return head_len;
390 }
391 else
392 {
393 bool correct_result = false;
394 size_t copy_len;
395 ucs4_t prev;
396 size_t ofs;
397 int mblen;
398
399 copy_len = 0;
400 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
401 head_len);
402 ofs <= head_len;
403 ofs += mblen)
404 {
405 ucs4_t next;
406
407 mblen = u8_mbtouc (&next,
408 CHAR_CAST (const uint8_t *, head + ofs),
409 head_len - ofs);
410 if (uc_is_grapheme_break (prev, next))
411 {
412 if (result != head)
413 {
414 memcpy (result, head, ofs);
415 memcpy (result + ofs, tail, tail_len);
416 result[ofs + tail_len] = '\0';
417 }
418
419 if (recode_string_len (encoding, "UTF-8", result,
420 ofs + tail_len) <= max_len)
421 {
422 correct_result = true;
423 copy_len = ofs;
424 }
425 else
426 correct_result = false;
427 }
428
429 prev = next;
430 }
431
432 if (result != head)
433 {
434 if (correct_result)
435 *resultp = result;
436 else
437 free (result);
438 }
439
440 return copy_len;
441 }
442 }
443 }
444
/* Joins a prefix of HEAD to all of TAIL and returns the result as a
   null-terminated string that the caller owns.  HEAD, TAIL, and the returned
   string are all encoded in UTF-8.  The prefix holds as many characters[*]
   from the start of HEAD as fit within MAX_LEN bytes, supposing that the
   returned string were re-encoded in ENCODING.  TAIL is always included in
   full, even when TAIL alone exceeds MAX_LEN bytes in ENCODING.

   [*] Actually this function drops grapheme clusters instead of characters, so
       that, e.g. a Unicode character followed by a combining accent character
       is either completely included or completely excluded from the returned
       string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
       information on grapheme clusters.

   A null ENCODING is treated as UTF-8.

   Simple examples for encoding="UTF-8", max_len=6:

       head="abc",  tail="xyz"     => "abcxyz"
       head="abcd", tail="xyz"     => "abcxyz"
       head="abc",  tail="uvwxyz"  => "uvwxyz"
       head="abc",  tail="tuvwxyz" => "tuvwxyz"

   Examples for encoding="ISO-8859-1", max_len=6:

       head="éèä",  tail="xyz"     => "éèäxyz"
         (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
          each take 2 bytes in UTF-8 encoding)
*/
char *
utf8_encoding_concat (const char *head, const char *tail,
                      const char *encoding, size_t max_len)
{
  size_t head_len = strlen (head);
  size_t tail_len = strlen (tail);
  char *prebuilt;
  size_t keep = utf8_encoding_concat__ (head, head_len, tail, tail_len,
                                        encoding, max_len, &prebuilt);

  /* Reuse the string the helper built while measuring, if it kept one. */
  if (prebuilt != NULL)
    return prebuilt;

  return xconcat2 (head, keep, tail, tail_len);
}
487
/* Returns the number of bytes in the string that utf8_encoding_concat() would
   return for the same arguments.  This is often cheaper than building that
   string just to measure it. */
size_t
utf8_encoding_concat_len (const char *head, const char *tail,
                          const char *encoding, size_t max_len)
{
  size_t head_len = strlen (head);
  size_t tail_len = strlen (tail);
  char *scratch;
  size_t keep = utf8_encoding_concat__ (head, head_len, tail, tail_len,
                                        encoding, max_len, &scratch);

  /* Only the length is wanted, so discard any string the helper built. */
  free (scratch);

  return keep + tail_len;
}
504
/* Returns a newly allocated, null-terminated string, owned by the caller,
   that holds as many characters[*] from the start of S as would fit within
   MAX_LEN bytes if the returned string were re-encoded in ENCODING.  Both S
   and the returned string are encoded in UTF-8.

   [*] Actually this function drops grapheme clusters instead of characters, so
       that, e.g. a Unicode character followed by a combining accent character
       is either completely included or completely excluded from the returned
       string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
       information on grapheme clusters.

   A null ENCODING is treated as UTF-8.
*/
char *
utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
{
  /* Truncating is concatenating with an empty tail. */
  const char *no_tail = "";

  return utf8_encoding_concat (s, no_tail, encoding, max_len);
}
523
/* Returns the number of bytes in the string that utf8_encoding_trunc() would
   return for the same arguments.  This is often cheaper than building that
   string just to measure it. */
size_t
utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
{
  /* Truncating is concatenating with an empty tail. */
  const char *no_tail = "";

  return utf8_encoding_concat_len (s, no_tail, encoding, max_len);
}
532
/* Recodes the null-terminated string FILENAME from UTF-8 into the filename
   encoding and returns the result.  The filename encoding is UTF-8 on
   Windows; elsewhere it is based on the current locale. */
char *
utf8_to_filename (const char *filename)
{
  const char *target = filename_encoding ();

  return recode_string (target, "UTF-8", filename, -1);
}
541
/* Recodes the null-terminated string FILENAME from the filename encoding into
   UTF-8 and returns the result.  The filename encoding is UTF-8 on Windows;
   elsewhere it is based on the current locale. */
char *
filename_to_utf8 (const char *filename)
{
  const char *source = filename_encoding ();

  return recode_string ("UTF-8", source, filename, -1);
}
550
551 static int
recode_substring_pool__(const char * to,const char * from,struct substring text,char fallbackchar,struct pool * pool,struct substring * out)552 recode_substring_pool__ (const char *to, const char *from,
553 struct substring text, char fallbackchar,
554 struct pool *pool, struct substring *out)
555 {
556 size_t bufsize;
557 struct converter *conv;
558
559 if (to == NULL)
560 to = default_encoding;
561
562 if (from == NULL)
563 from = default_encoding;
564
565 conv = create_iconv (to, from);
566
567 if (NULL == conv)
568 {
569 if (fallbackchar)
570 {
571 out->string = pool_malloc (pool, text.length + 1);
572 out->length = text.length;
573 memcpy (out->string, text.string, text.length);
574 out->string[out->length] = '\0';
575 return 0;
576 }
577 else
578 return EPROTO;
579 }
580
581 for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2)
582 {
583 char *output = pool_malloc (pool, bufsize);
584 ssize_t retval;
585
586 retval = try_recode (conv, fallbackchar, text.string, text.length,
587 output, bufsize);
588 if (retval >= 0)
589 {
590 *out = ss_buffer (output, retval);
591 return 0;
592 }
593 pool_free (pool, output);
594
595 if (retval != -E2BIG)
596 return -retval;
597 }
598
599 NOT_REACHED ();
600 }
601
602 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
603 dynamically allocated string in TO-encoding. Any characters which cannot be
604 converted will be represented by '?'.
605
606 The returned string will be null-terminated and allocated on POOL with
607 pool_malloc().
608
609 This function's behaviour differs from that of g_convert_with_fallback
610 provided by GLib. The GLib function will fail (returns NULL) if any part of
611 the input string is not valid in the declared input encoding. This function
612 however perseveres even in the presence of badly encoded input. */
613 struct substring
recode_substring_pool(const char * to,const char * from,struct substring text,struct pool * pool)614 recode_substring_pool (const char *to, const char *from,
615 struct substring text, struct pool *pool)
616 {
617 struct substring out;
618
619 recode_substring_pool__ (to, from, text, '?', pool, &out);
620 return out;
621 }
622
623 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
624 dynamically allocated string in TO-encoding. On success, returns 0, and the
625 converted null-terminated string, allocated from POOL with pool_malloc(), is
626 stored in *OUT. On failure, returns a positive errno value.
627
628 The function fails with an error if any part of the input string is not
629 valid in the declared input encoding. */
630 int
recode_pedantically(const char * to,const char * from,struct substring text,struct pool * pool,struct substring * out)631 recode_pedantically (const char *to, const char *from,
632 struct substring text, struct pool *pool,
633 struct substring *out)
634 {
635 int error;
636
637 error = recode_substring_pool__ (to, from, text, 0, pool, out);
638 if (error)
639 *out = ss_empty ();
640 return error;
641 }
642
643 void
i18n_init(void)644 i18n_init (void)
645 {
646 setlocale (LC_ALL, "");
647 bindtextdomain (PACKAGE, relocate(locale_dir));
648 textdomain (PACKAGE);
649
650 assert (default_encoding == NULL);
651 default_encoding = xstrdup (locale_charset ());
652
653 hmapx_init (&map);
654 }
655
656 const char *
get_default_encoding(void)657 get_default_encoding (void)
658 {
659 return default_encoding;
660 }
661
662 void
set_default_encoding(const char * enc)663 set_default_encoding (const char *enc)
664 {
665 free (default_encoding);
666 default_encoding = xstrdup (enc);
667 }
668
/* Returns the language part of the name of the current LC_MESSAGES locale,
   that is, the locale name up to but not including its first underscore
   (normally the ISO two letter language code).

   Returns a null pointer for the "C" locale.  Otherwise the result is newly
   allocated and the caller must free it. */
char *
get_language (void)
{
  const char *localename = gl_locale_name (LC_MESSAGES, "LC_MESSAGES");

  if (!strcmp (localename, "C"))
    return NULL;

  char *language = xstrdup (localename);

  /* Cut at the first underscore, or at the existing terminator if there is
     none. */
  language[strcspn (language, "_")] = '\0';

  return language;
}
683
684
685 /* Attempts to set the encoding from a locale name
686 returns true if successful.
687 This function does not (should not!) alter the current locale.
688 */
689 bool
set_encoding_from_locale(const char * loc)690 set_encoding_from_locale (const char *loc)
691 {
692 bool ok = true;
693 char *c_encoding;
694 char *loc_encoding;
695 char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
696
697 setlocale (LC_CTYPE, "C");
698 c_encoding = xstrdup (locale_charset ());
699
700 setlocale (LC_CTYPE, loc);
701 loc_encoding = xstrdup (locale_charset ());
702
703
704 if (0 == strcmp (loc_encoding, c_encoding))
705 {
706 ok = false;
707 }
708
709 setlocale (LC_CTYPE, tmp);
710
711 free (tmp);
712
713 if (ok)
714 {
715 free (default_encoding);
716 default_encoding = loc_encoding;
717 }
718 else
719 free (loc_encoding);
720
721 free (c_encoding);
722
723 return ok;
724 }
725
726 void
i18n_done(void)727 i18n_done (void)
728 {
729 struct hmapx_node *node;
730 struct converter *cvtr;
731
732 HMAPX_FOR_EACH (cvtr, node, &map)
733 {
734 if (cvtr == NULL)
735 continue;
736 free (cvtr->tocode);
737 free (cvtr->fromcode);
738 if (cvtr->conv != (iconv_t) -1)
739 iconv_close (cvtr->conv);
740 free (cvtr);
741 }
742
743 hmapx_destroy (&map);
744
745 free (default_encoding);
746 default_encoding = NULL;
747 }
748
749
750
751 bool
valid_encoding(const char * enc)752 valid_encoding (const char *enc)
753 {
754 iconv_t conv = iconv_open (UTF8, enc);
755
756 if (conv == (iconv_t) -1)
757 return false;
758
759 iconv_close (conv);
760
761 return true;
762 }
763
764
/* Returns the system locale's idea of the decimal separator character. */
char
get_system_decimal (void)
{
#if HAVE_NL_LANGINFO
  return nl_langinfo (RADIXCHAR)[0];
#else
  /* Format a number with a single integer digit and pick out the character
     that follows that digit. */
  char formatted[10];

  snprintf (formatted, sizeof formatted, "%f", 2.5);
  return formatted[1];
#endif
}
784
785 const char *
uc_name(ucs4_t uc,char buffer[16])786 uc_name (ucs4_t uc, char buffer[16])
787 {
788 if (uc >= 0x20 && uc < 0x7f)
789 snprintf (buffer, 16, "`%c'", uc);
790 else
791 snprintf (buffer, 16, "U+%04X", uc);
792 return buffer;
793 }
794
795 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
796
797 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
798 with lowercase and uppercase letters treated as equal, starting from
799 BASIS. */
800 unsigned int
utf8_hash_case_bytes(const char * s,size_t n,unsigned int basis)801 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
802 {
803 uint8_t folded_buf[2048];
804 size_t folded_len = sizeof folded_buf;
805 uint8_t *folded_s;
806 unsigned int hash;
807
808 folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
809 NULL, UNINORM_NFKD, folded_buf, &folded_len);
810 if (folded_s != NULL)
811 {
812 hash = hash_bytes (folded_s, folded_len, basis);
813 if (folded_s != folded_buf)
814 free (folded_s);
815 }
816 else
817 {
818 if (errno == ENOMEM)
819 xalloc_die ();
820 hash = hash_bytes (s, n, basis);
821 }
822
823 return hash;
824 }
825
/* Hashes the null-terminated UTF-8 string S, starting from BASIS, in such a
   way that lowercase and uppercase letters hash alike. */
unsigned int
utf8_hash_case_string (const char *s, unsigned int basis)
{
  size_t n = strlen (s);

  return utf8_hash_case_bytes (s, n, basis);
}
833
/* Compares the null-terminated UTF-8 strings A and B without regard to case.
   Returns a negative value if A < B, zero if A == B, positive if A > B. */
int
utf8_strcasecmp (const char *a, const char *b)
{
  size_t an = strlen (a);
  size_t bn = strlen (b);

  return utf8_strncasecmp (a, an, b, bn);
}
841
842 /* Compares UTF-8 strings A (with length AN) and B (with length BN)
843 case-insensitively.
844 Returns a negative value if A < B, zero if A == B, positive if A > B. */
845 int
utf8_strncasecmp(const char * a,size_t an,const char * b,size_t bn)846 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
847 {
848 int result;
849
850 if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
851 CHAR_CAST (const uint8_t *, b), bn,
852 NULL, UNINORM_NFKD, &result))
853 {
854 if (errno == ENOMEM)
855 xalloc_die ();
856
857 result = memcmp (a, b, MIN (an, bn));
858 if (result == 0)
859 result = an < bn ? -1 : an > bn;
860 }
861
862 return result;
863 }
864
/* Returns true if each of the LEN bytes at S is an ASCII digit (trivially
   true if LEN is 0), otherwise false. */
static bool
is_all_digits (const uint8_t *s, size_t len)
{
  for (size_t left = len; left > 0; left--)
    {
      uint8_t byte = s[left - 1];

      if (!c_isdigit (byte))
        return false;
    }

  return true;
}
873
874 /* Compares UTF-8 strings A and B case-insensitively. If the strings end in a
875 number, then they are compared numerically. Returns a negative value if A <
876 B, zero if A == B, positive if A > B. */
877 int
utf8_strverscasecmp(const char * a,const char * b)878 utf8_strverscasecmp (const char *a, const char *b)
879 {
880 /* Normalize A. */
881 uint8_t a_stub[64];
882 size_t a_len = sizeof a_stub;
883 uint8_t *a_norm = u8_casefold (CHAR_CAST (uint8_t *, a), strlen (a), NULL,
884 UNINORM_NFKD, a_stub, &a_len);
885
886 /* Normalize B. */
887 uint8_t b_stub[64];
888 size_t b_len = sizeof b_stub;
889 uint8_t *b_norm = u8_casefold (CHAR_CAST (uint8_t *, b), strlen (b), NULL,
890 UNINORM_NFKD, b_stub, &b_len);
891
892 int result;
893 if (!a_norm || !b_norm)
894 {
895 result = strcmp (a, b);
896 goto exit;
897 }
898
899 size_t len = MIN (a_len, b_len);
900 for (size_t i = 0; i < len; i++)
901 if (a_norm[i] != b_norm[i])
902 {
903 /* If both strings end in digits, compare them numerically. */
904 if (is_all_digits (&a_norm[i], a_len - i)
905 && is_all_digits (&b_norm[i], b_len - i))
906 {
907 /* Start by stripping leading zeros, since those don't matter for
908 numerical comparison. */
909 size_t ap, bp;
910 for (ap = i; ap < a_len; ap++)
911 if (a_norm[ap] != '0')
912 break;
913 for (bp = i; bp < b_len; bp++)
914 if (b_norm[bp] != '0')
915 break;
916
917 /* The number with more digits, if there is one, is larger. */
918 size_t a_digits = a_len - ap;
919 size_t b_digits = b_len - bp;
920 if (a_digits != b_digits)
921 result = a_digits > b_digits ? 1 : -1;
922 else
923 result = memcmp (&a_norm[ap], &b_norm[bp], a_digits);
924 }
925 else
926 result = a_norm[i] > b_norm[i] ? 1 : -1;
927 goto exit;
928 }
929 result = a_len < b_len ? -1 : a_len > b_len;
930
931 exit:
932 if (a_norm != a_stub)
933 free (a_norm);
934 if (b_norm != b_stub)
935 free (b_norm);
936 return result;
937 }
938
939 static char *
utf8_casemap(const char * s,uint8_t * (* f)(const uint8_t *,size_t,const char *,uninorm_t,uint8_t *,size_t *))940 utf8_casemap (const char *s,
941 uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
942 uint8_t *, size_t *))
943 {
944 char *result;
945 size_t size;
946
947 result = CHAR_CAST (char *,
948 f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
949 NULL, NULL, NULL, &size));
950 if (result == NULL)
951 {
952 if (errno == ENOMEM)
953 xalloc_die ();
954
955 result = xstrdup (s);
956 }
957 return result;
958 }
959
960 char *
utf8_to_upper(const char * s)961 utf8_to_upper (const char *s)
962 {
963 return utf8_casemap (s, u8_toupper);
964 }
965
966 char *
utf8_to_lower(const char * s)967 utf8_to_lower (const char *s)
968 {
969 return utf8_casemap (s, u8_tolower);
970 }
971
972 char *
utf8_to_title(const char * s)973 utf8_to_title (const char *s)
974 {
975 return utf8_casemap (s, u8_totitle);
976 }
977
978 bool
get_encoding_info(struct encoding_info * e,const char * name)979 get_encoding_info (struct encoding_info *e, const char *name)
980 {
981 const struct substring in = SS_LITERAL_INITIALIZER (
982 "\t\n\v\f\r "
983 "!\"#$%&'()*+,-./0123456789:;<=>?@"
984 "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
985 "abcdefghijklmnopqrstuvwxyz{|}~");
986
987 struct substring out, cr, lf, space;
988 bool ok;
989
990 memset (e, 0, sizeof *e);
991
992 cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
993 lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
994 space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
995 ok = (cr.length >= 1
996 && cr.length <= MAX_UNIT
997 && cr.length == lf.length
998 && cr.length == space.length);
999 if (!ok)
1000 {
1001 fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
1002 ss_dealloc (&cr);
1003 ss_dealloc (&lf);
1004 ss_dealloc (&space);
1005 ss_alloc_substring (&cr, ss_cstr ("\r"));
1006 ss_alloc_substring (&lf, ss_cstr ("\n"));
1007 ss_alloc_substring (&space, ss_cstr (" "));
1008 }
1009
1010 e->unit = cr.length;
1011 memcpy (e->cr, cr.string, e->unit);
1012 memcpy (e->lf, lf.string, e->unit);
1013 memcpy (e->space, space.string, e->unit);
1014
1015 ss_dealloc (&cr);
1016 ss_dealloc (&lf);
1017 ss_dealloc (&space);
1018
1019 out = recode_substring_pool ("UTF-8", name, in, NULL);
1020 e->is_ascii_compatible = ss_equals (in, out);
1021 ss_dealloc (&out);
1022
1023 if (!e->is_ascii_compatible && e->unit == 1)
1024 {
1025 out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
1026 e->is_ebcdic_compatible = (out.length == 1
1027 && (uint8_t) out.string[0] == 0xc1);
1028 ss_dealloc (&out);
1029 }
1030 else
1031 e->is_ebcdic_compatible = false;
1032
1033 return ok;
1034 }
1035
1036 bool
is_encoding_ascii_compatible(const char * encoding)1037 is_encoding_ascii_compatible (const char *encoding)
1038 {
1039 struct encoding_info e;
1040
1041 get_encoding_info (&e, encoding);
1042 return e.is_ascii_compatible;
1043 }
1044
1045 bool
is_encoding_ebcdic_compatible(const char * encoding)1046 is_encoding_ebcdic_compatible (const char *encoding)
1047 {
1048 struct encoding_info e;
1049
1050 get_encoding_info (&e, encoding);
1051 return e.is_ebcdic_compatible;
1052 }
1053
/* Returns true if converters can be created both from ENCODING to UTF-8 and
   from UTF-8 to ENCODING, otherwise false. */
bool
is_encoding_supported (const char *encoding)
{
  if (create_iconv ("UTF-8", encoding) == NULL)
    return false;

  return create_iconv (encoding, "UTF-8") != NULL;
}
1062
/* Returns true if E is the name of a UTF-8 encoding, that is, if it is
   "UTF-8" or "UTF8" with the letters in any mixture of upper and lower case.

   XXX Possibly we should test not E as a string but its properties via
   iconv. */
bool
is_encoding_utf8 (const char *e)
{
  static const char lower[] = "utf";
  static const char upper[] = "UTF";

  /* A mismatch, including the terminator, stops the scan before any later
     byte is read. */
  for (int i = 0; i < 3; i++)
    if (e[i] != lower[i] && e[i] != upper[i])
      return false;

  /* The hyphen is optional. */
  const char *rest = e + 3;
  if (*rest == '-')
    rest++;

  return rest[0] == '8' && rest[1] == '\0';
}
1076
/* Array of encoding categories; add_category() appends to it and grows it
   as needed. */
static struct encoding_category *categories = NULL;

/* Number of elements of 'categories' currently in use. */
static int n_categories = 0;
1079
1080 static void SENTINEL (0)
add_category(size_t * allocated_categories,const char * category,...)1081 add_category (size_t *allocated_categories, const char *category, ...)
1082 {
1083 struct encoding_category *c;
1084 const char *encodings[16];
1085 va_list args;
1086 int i, n;
1087
1088 /* Count encoding arguments. */
1089 va_start (args, category);
1090 n = 0;
1091 while ((encodings[n] = va_arg (args, const char *)) != NULL)
1092 {
1093 const char *encoding = encodings[n];
1094 if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
1095 n++;
1096 }
1097 assert (n < sizeof encodings / sizeof *encodings);
1098 va_end (args);
1099
1100 if (n == 0)
1101 return;
1102
1103 if (n_categories >= *allocated_categories)
1104 categories = x2nrealloc (categories,
1105 allocated_categories, sizeof *categories);
1106
1107 c = &categories[n_categories++];
1108 c->category = category;
1109 c->encodings = xmalloc (n * sizeof *c->encodings);
1110 for (i = 0; i < n; i++)
1111 c->encodings[i] = encodings[i];
1112 c->n_encodings = n;
1113 }
1114
1115 static void
init_encoding_categories(void)1116 init_encoding_categories (void)
1117 {
1118 static bool inited;
1119 size_t alloc;
1120
1121 if (inited)
1122 return;
1123 inited = true;
1124
1125 alloc = 0;
1126 add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
1127 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
1128 add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
1129 NULL_SENTINEL);
1130 add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
1131 add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
1132 "Windows-1257", NULL_SENTINEL);
1133 add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
1134 add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
1135 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
1136 add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
1137 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
1138 add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
1139 "EUC-TW", NULL_SENTINEL);
1140 add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
1141 add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
1142 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
1143 add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
1144 add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
1145 NULL_SENTINEL);
1146 add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
1147 add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
1148 add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
1149 add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
1150 add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
1151 NULL_SENTINEL);
1152 add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
1153 add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
1154 add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
1155 add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
1156 NULL_SENTINEL);
1157 add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
1158 NULL_SENTINEL);
1159 add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
1160 add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
1161 NULL_SENTINEL);
1162 add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
1163 add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
1164 NULL_SENTINEL);
1165 add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
1166 NULL_SENTINEL);
1167 add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
1168 "Windows-1258", NULL_SENTINEL);
1169 add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
1170 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
1171 }
1172
1173 /* Returns an array of "struct encoding_category" that contains only the
1174 categories and encodings that the system supports. */
1175 struct encoding_category *
get_encoding_categories(void)1176 get_encoding_categories (void)
1177 {
1178 init_encoding_categories ();
1179 return categories;
1180 }
1181
1182 /* Returns the number of elements in the array returned by
1183 get_encoding_categories(). */
1184 size_t
get_n_encoding_categories(void)1185 get_n_encoding_categories (void)
1186 {
1187 init_encoding_categories ();
1188 return n_categories;
1189 }
1190