1 /* Charset handling while reading PO files.
2    Copyright (C) 2001-2007, 2010, 2019-2020 Free Software Foundation, Inc.
3    Written by Bruno Haible <haible@clisp.cons.org>, 2001.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17 
18 
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22 #include <alloca.h>
23 
24 /* Specification.  */
25 #include "po-charset.h"
26 
27 #include <stdlib.h>
28 #include <string.h>
29 
30 #include "xmalloca.h"
31 #include "xvasprintf.h"
32 #include "po-xerror.h"
33 #if !IN_LIBGETTEXTPO
34 # include "basename-lgpl.h"
35 # include "progname.h"
36 #endif
37 #include "c-strstr.h"
38 #include "c-strcase.h"
39 #include "gettext.h"
40 
/* Shorthand used around every translatable message in this file.  */
#define _(str) gettext (str)

/* Number of elements of the array A.  A must be a real array, not a pointer.
   The argument is parenthesized so that any array-valued expression can be
   passed safely.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))

/* Storage for the canonical name of ASCII.  Kept as a named array so that
   other code in this file can refer to the very same object.  */
static const char ascii[] = "ASCII";

/* The canonicalized encoding name for ASCII.  */
const char *po_charset_ascii = ascii;

/* Storage for the canonical name of UTF-8, shared the same way.  */
static const char utf8[] = "UTF-8";

/* The canonicalized encoding name for UTF-8.  */
const char *po_charset_utf8 = utf8;
54 
55 /* Canonicalize an encoding name.  */
56 const char *
po_charset_canonicalize(const char * charset)57 po_charset_canonicalize (const char *charset)
58 {
59   /* The list of charsets supported by glibc's iconv() and by the portable
60      iconv() across platforms.  Taken from intl/localcharset.h.  */
61   static const char *standard_charsets[] =
62   {
63     ascii, "ANSI_X3.4-1968", "US-ASCII",        /* i = 0..2 */
64     "ISO-8859-1", "ISO_8859-1",                 /* i = 3, 4 */
65     "ISO-8859-2", "ISO_8859-2",
66     "ISO-8859-3", "ISO_8859-3",
67     "ISO-8859-4", "ISO_8859-4",
68     "ISO-8859-5", "ISO_8859-5",
69     "ISO-8859-6", "ISO_8859-6",
70     "ISO-8859-7", "ISO_8859-7",
71     "ISO-8859-8", "ISO_8859-8",
72     "ISO-8859-9", "ISO_8859-9",
73     "ISO-8859-13", "ISO_8859-13",
74     "ISO-8859-14", "ISO_8859-14",
75     "ISO-8859-15", "ISO_8859-15",               /* i = 25, 26 */
76     "KOI8-R",
77     "KOI8-U",
78     "KOI8-T",
79     "CP850",
80     "CP866",
81     "CP874",
82     "CP932",
83     "CP949",
84     "CP950",
85     "CP1250",
86     "CP1251",
87     "CP1252",
88     "CP1253",
89     "CP1254",
90     "CP1255",
91     "CP1256",
92     "CP1257",
93     "GB2312",
94     "EUC-JP",
95     "EUC-KR",
96     "EUC-TW",
97     "BIG5",
98     "BIG5-HKSCS",
99     "GBK",
100     "GB18030",
101     "SHIFT_JIS",
102     "JOHAB",
103     "TIS-620",
104     "VISCII",
105     "GEORGIAN-PS",
106     utf8
107   };
108   size_t i;
109 
110   for (i = 0; i < SIZEOF (standard_charsets); i++)
111     if (c_strcasecmp (charset, standard_charsets[i]) == 0)
112       return standard_charsets[i < 3 ? 0 : i < 27 ? ((i - 3) & ~1) + 3 : i];
113   return NULL;
114 }
115 
/* Test for ASCII compatibility.
   CANON_CHARSET is a canonical encoding name.  Returns false for the few
   encodings listed here as exceptions, true for every other name.  */
bool
po_charset_ascii_compatible (const char *canon_charset)
{
  /* There are only a few exceptions to ASCII compatibility.  */
  bool is_exception =
    (strcmp (canon_charset, "SHIFT_JIS") == 0
     || strcmp (canon_charset, "JOHAB") == 0
     || strcmp (canon_charset, "VISCII") == 0);

  return !is_exception;
}
128 
/* Test for a weird encoding, i.e. an encoding which has double-byte
   characters ending in 0x5C.
   CANON_CHARSET is a canonical encoding name; it is compared exactly
   (case-sensitively) against the list below.  */
bool
po_is_charset_weird (const char *canon_charset)
{
  static const char *weird_charsets[] =
  {
    "BIG5",
    "BIG5-HKSCS",
    "GBK",
    "GB18030",
    "SHIFT_JIS",
    "JOHAB"
  };
  const char **candidate = weird_charsets;
  const char **const limit =
    weird_charsets + sizeof weird_charsets / sizeof weird_charsets[0];

  for (; candidate != limit; candidate++)
    {
      if (strcmp (canon_charset, *candidate) == 0)
        return true;
    }
  return false;
}
149 
/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
   An encoding has CJK structure if every valid character stream is composed
   of single bytes in the range 0x{00..7F} and of byte pairs in the range
   0x{80..FF}{30..FF}.
   CANON_CHARSET is a canonical encoding name; it is compared exactly
   (case-sensitively) against the list below.  */
bool
po_is_charset_weird_cjk (const char *canon_charset)
{
  static const char *weird_cjk_charsets[] =
  {                     /* single bytes   double bytes       */
    "BIG5",             /* 0x{00..7F},    0x{A1..F9}{40..FE} */
    "BIG5-HKSCS",       /* 0x{00..7F},    0x{88..FE}{40..FE} */
    "GBK",              /* 0x{00..7F},    0x{81..FE}{40..FE} */
    "GB18030",          /* 0x{00..7F},    0x{81..FE}{30..FE} */
    "SHIFT_JIS",        /* 0x{00..7F},    0x{81..F9}{40..FC} */
    "JOHAB"             /* 0x{00..7F},    0x{84..F9}{31..FE} */
  };
  const char **candidate = weird_cjk_charsets;
  const char **const limit =
    weird_cjk_charsets
    + sizeof weird_cjk_charsets / sizeof weird_cjk_charsets[0];

  for (; candidate != limit; candidate++)
    {
      if (strcmp (canon_charset, *candidate) == 0)
        return true;
    }
  return false;
}
172 
173 /* Hardcoded iterator functions for all kinds of encodings.
174    We could also implement a general iterator function with iconv(),
175    but we need a fast one.  */
176 
/* Character iterator for 8-bit encodings.
   Every character occupies exactly one byte, so S is never examined.  */
static size_t
char_iterator (const char *s)
{
  (void) s;
  return 1;
}
183 
/* Character iterator for GB2312.  See libiconv/lib/euc_cn.h.  */
/* Character iterator for EUC-KR.  See libiconv/lib/euc_kr.h.  */
/* Returns 2 if S starts with two bytes that both lie in 0xA1..0xFE,
   otherwise 1.  The second byte is looked at only after the first one
   qualified, so a terminating NUL is never stepped over.  */
static size_t
euc_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;

  if (bytes[0] >= 0xa1 && bytes[0] <= 0xfe
      && bytes[1] >= 0xa1 && bytes[1] <= 0xfe)
    return 2;
  return 1;
}
198 
/* Character iterator for EUC-JP.  See libiconv/lib/euc_jp.h.
   Recognized forms, by lead byte:
     0xA1..0xFE  followed by 0xA1..0xFE              -> 2 bytes
     0x8E        followed by 0xA1..0xDF              -> 2 bytes
     0x8F        followed by two bytes 0xA1..0xFE    -> 3 bytes
   Anything else, including a truncated sequence, counts as 1 byte.  */
static size_t
euc_jp_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;
  unsigned char lead = bytes[0];

  if (lead == 0x8e)
    return (bytes[1] >= 0xa1 && bytes[1] <= 0xdf) ? 2 : 1;

  if (lead == 0x8f)
    {
      /* Later bytes are tested only after the earlier ones passed.  */
      if (bytes[1] >= 0xa1 && bytes[1] <= 0xfe
          && bytes[2] >= 0xa1 && bytes[2] <= 0xfe)
        return 3;
      return 1;
    }

  if (lead >= 0xa1 && lead <= 0xfe
      && bytes[1] >= 0xa1 && bytes[1] <= 0xfe)
    return 2;

  return 1;
}
228 
/* Character iterator for EUC-TW.  See libiconv/lib/euc_tw.h.
   Recognized forms, by lead byte:
     0xA1..0xFE  followed by 0xA1..0xFE                        -> 2 bytes
     0x8E        followed by 0xA1..0xB0 and two bytes
                 in 0xA1..0xFE                                 -> 4 bytes
   Anything else, including a truncated sequence, counts as 1 byte.  */
static size_t
euc_tw_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;
  unsigned char lead = bytes[0];

  if (lead == 0x8e)
    {
      /* Later bytes are tested only after the earlier ones passed.  */
      if (bytes[1] >= 0xa1 && bytes[1] <= 0xb0
          && bytes[2] >= 0xa1 && bytes[2] <= 0xfe
          && bytes[3] >= 0xa1 && bytes[3] <= 0xfe)
        return 4;
      return 1;
    }

  if (lead >= 0xa1 && lead <= 0xfe
      && bytes[1] >= 0xa1 && bytes[1] <= 0xfe)
    return 2;

  return 1;
}
256 
/* Character iterator for BIG5.  See libiconv/lib/ces_big5.h.
   Returns 2 for a lead byte in 0xA1..0xFE followed by a byte in 0x40..0x7E
   or 0xA1..0xFE, otherwise 1.  */
static size_t
big5_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;
  unsigned char trail;

  if (bytes[0] < 0xa1 || bytes[0] > 0xfe)
    return 1;

  /* Only now is it safe to look at the second byte.  */
  trail = bytes[1];
  if (trail >= 0x40 && trail <= 0x7e)
    return 2;
  if (trail >= 0xa1 && trail <= 0xfe)
    return 2;
  return 1;
}
270 
/* Character iterator for BIG5-HKSCS.  See libiconv/lib/big5hkscs.h.
   Returns 2 for a lead byte in 0x88..0xFE followed by a byte in 0x40..0x7E
   or 0xA1..0xFE, otherwise 1.  */
static size_t
big5hkscs_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;
  unsigned char trail;

  if (bytes[0] < 0x88 || bytes[0] > 0xfe)
    return 1;

  /* Only now is it safe to look at the second byte.  */
  trail = bytes[1];
  if (trail >= 0x40 && trail <= 0x7e)
    return 2;
  if (trail >= 0xa1 && trail <= 0xfe)
    return 2;
  return 1;
}
284 
/* Character iterator for GBK.  See libiconv/lib/ces_gbk.h and
   libiconv/lib/gbk.h.
   Returns 2 for a lead byte in 0x81..0xFE followed by a byte in 0x40..0x7E
   or 0x80..0xFE, otherwise 1.  */
static size_t
gbk_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;
  unsigned char trail;

  if (bytes[0] < 0x81 || bytes[0] > 0xfe)
    return 1;

  /* Only now is it safe to look at the second byte.  */
  trail = bytes[1];
  if (trail >= 0x40 && trail <= 0x7e)
    return 2;
  if (trail >= 0x80 && trail <= 0xfe)
    return 2;
  return 1;
}
299 
/* Character iterator for GB18030.  See libiconv/lib/gb18030.h.
   Recognized forms, for a lead byte in 0x81..0xFE:
     second byte in 0x40..0x7E or 0x80..0xFE                    -> 2 bytes
     second byte in 0x30..0x39, third in 0x81..0xFE,
     fourth in 0x30..0x39                                       -> 4 bytes
   Anything else, including a truncated sequence, counts as 1 byte.
   The four-byte form is accepted for every lead byte of the GB18030
   four-byte code space, not only 0x81..0x84, so that characters outside the
   Basic Multilingual Plane (lead bytes 0x90..0xE3) are kept together
   instead of being split into bogus one- and two-byte pieces.  */
static size_t
gb18030_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;
  unsigned char second;

  if (bytes[0] < 0x81 || bytes[0] > 0xfe)
    return 1;

  /* Only now is it safe to look at the second byte.  */
  second = bytes[1];
  if ((second >= 0x40 && second <= 0x7e)
      || (second >= 0x80 && second <= 0xfe))
    return 2;

  /* The third and fourth bytes are examined only after the preceding byte
     was found to be a valid, hence non-NUL, part of the sequence.  */
  if (second >= 0x30 && second <= 0x39
      && bytes[2] >= 0x81 && bytes[2] <= 0xfe
      && bytes[3] >= 0x30 && bytes[3] <= 0x39)
    return 4;

  return 1;
}
327 
/* Character iterator for SHIFT_JIS.  See libiconv/lib/sjis.h.
   Returns 2 for a lead byte in 0x81..0x9F or 0xE0..0xF9 followed by a byte
   in 0x40..0x7E or 0x80..0xFC, otherwise 1.  */
static size_t
shift_jis_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;
  unsigned char lead = bytes[0];
  unsigned char trail;

  if (!((lead >= 0x81 && lead <= 0x9f) || (lead >= 0xe0 && lead <= 0xf9)))
    return 1;

  /* Only now is it safe to look at the second byte.  */
  trail = bytes[1];
  if (trail >= 0x40 && trail <= 0x7e)
    return 2;
  if (trail >= 0x80 && trail <= 0xfc)
    return 2;
  return 1;
}
341 
/* Character iterator for JOHAB.  See libiconv/lib/johab.h and
   libiconv/lib/johab_hangul.h.
   Recognized forms, by lead byte:
     0x84..0xD3  followed by 0x41..0x7E or 0x81..0xFE   -> 2 bytes
     0xD9..0xF9  followed by 0x31..0x7E or 0x91..0xFE   -> 2 bytes
   Anything else counts as 1 byte.  */
static size_t
johab_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;
  unsigned char lead = bytes[0];
  unsigned char low_start;
  unsigned char high_start;
  unsigned char trail;

  /* The two lead-byte ranges differ only in where the two ranges of valid
     second bytes begin.  */
  if (lead >= 0x84 && lead <= 0xd3)
    {
      low_start = 0x41;
      high_start = 0x81;
    }
  else if (lead >= 0xd9 && lead <= 0xf9)
    {
      low_start = 0x31;
      high_start = 0x91;
    }
  else
    return 1;

  /* Only now is it safe to look at the second byte.  */
  trail = bytes[1];
  if ((trail >= low_start && trail <= 0x7e)
      || (trail >= high_start && trail <= 0xfe))
    return 2;
  return 1;
}
362 
/* Character iterator for UTF-8.  See libiconv/lib/utf8.h.
   The lead byte determines the expected length: 0xC2..0xDF -> 2,
   0xE0..0xEF -> 3, 0xF0..0xF7 -> 4.  That length is returned if every
   following byte of the sequence lies in 0x80..0xBF; in all other cases
   the result is 1.  */
static size_t
utf8_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;
  unsigned char lead = bytes[0];
  size_t expected;
  size_t k;

  if (lead < 0xc2 || lead >= 0xf8)
    return 1;

  if (lead < 0xe0)
    expected = 2;
  else if (lead < 0xf0)
    expected = 3;
  else
    expected = 4;

  /* Check the continuation bytes in order, stopping at the first one that
     does not fit, so that nothing past a terminating NUL is read.  */
  for (k = 1; k < expected; k++)
    if (bytes[k] < 0x80 || bytes[k] >= 0xc0)
      return 1;

  return expected;
}
403 
404 /* Returns a character iterator for a given encoding.
405    Given a pointer into a string, it returns the number occupied by the next
406    single character.  If the piece of string is not valid or if the *s == '\0',
407    it returns 1.  */
408 character_iterator_t
po_charset_character_iterator(const char * canon_charset)409 po_charset_character_iterator (const char *canon_charset)
410 {
411   if (canon_charset == utf8)
412     return utf8_character_iterator;
413   if (strcmp (canon_charset, "GB2312") == 0
414       || strcmp (canon_charset, "EUC-KR") == 0)
415     return euc_character_iterator;
416   if (strcmp (canon_charset, "EUC-JP") == 0)
417     return euc_jp_character_iterator;
418   if (strcmp (canon_charset, "EUC-TW") == 0)
419     return euc_tw_character_iterator;
420   if (strcmp (canon_charset, "BIG5") == 0)
421     return big5_character_iterator;
422   if (strcmp (canon_charset, "BIG5-HKSCS") == 0)
423     return big5hkscs_character_iterator;
424   if (strcmp (canon_charset, "GBK") == 0)
425     return gbk_character_iterator;
426   if (strcmp (canon_charset, "GB18030") == 0)
427     return gb18030_character_iterator;
428   if (strcmp (canon_charset, "SHIFT_JIS") == 0)
429     return shift_jis_character_iterator;
430   if (strcmp (canon_charset, "JOHAB") == 0)
431     return johab_character_iterator;
432   return char_iterator;
433 }
434 
435 
/* The PO file's encoding, as specified in the header entry.
   NULL after po_lex_charset_init and po_lex_charset_close; set to a
   canonical encoding name by po_lex_charset_set when the header names a
   recognized charset.  */
const char *po_lex_charset;

#if HAVE_ICONV
/* Converter from the PO file's encoding to UTF-8.
   Holds (iconv_t)(-1) while no converter is open.  */
iconv_t po_lex_iconv;
#endif
/* If no converter is available, some information about the structure of the
   PO file's encoding.
   Assigned from po_is_charset_weird_cjk in po_lex_charset_set; false
   otherwise.  */
bool po_lex_weird_cjk;
446 
447 void
po_lex_charset_init()448 po_lex_charset_init ()
449 {
450   po_lex_charset = NULL;
451 #if HAVE_ICONV
452   po_lex_iconv = (iconv_t)(-1);
453 #endif
454   po_lex_weird_cjk = false;
455 }
456 
/* Determine the PO file's encoding from its header entry and set up the
   charset-related lexer state.
   HEADER_ENTRY is searched for a "charset=" field; the charset name extends
   up to the next space, tab or newline.
   FILENAME is used in warning messages and to recognize POT files (names
   ending in ".pot"), for which some warnings are suppressed.
   If the charset is recognized, po_lex_charset is set to its canonical name.
   Then, unless the environment variable OLD_PO_FILE_INPUT is set to a
   non-empty value, either a converter to UTF-8 is opened into po_lex_iconv
   (when built with iconv) or po_lex_weird_cjk is determined.
   If the charset is missing or not recognized, po_lex_charset is left
   unchanged.  All problems are reported through po_xerror with
   PO_SEVERITY_WARNING.  */
void
po_lex_charset_set (const char *header_entry, const char *filename)
{
  /* Verify the validity of CHARSET.  It is necessary
     1. for the correct treatment of multibyte characters containing
        0x5C bytes in the PO lexer,
     2. so that at run time, gettext() can call iconv() to convert
        msgstr.  */
  const char *charsetstr = c_strstr (header_entry, "charset=");

  if (charsetstr != NULL)
    {
      size_t len;
      char *charset;
      const char *canon_charset;

      /* Copy the charset name, i.e. everything up to the next space, tab or
         newline, into a NUL-terminated scratch buffer.  */
      charsetstr += strlen ("charset=");
      len = strcspn (charsetstr, " \t\n");
      charset = (char *) xmalloca (len + 1);
      memcpy (charset, charsetstr, len);
      charset[len] = '\0';

      canon_charset = po_charset_canonicalize (charset);
      if (canon_charset == NULL)
        {
          /* Don't warn for POT files, because POT files usually contain
             only ASCII msgids.  */
          size_t filenamelen = strlen (filename);

          /* The warning is skipped only for the placeholder name "CHARSET"
             in a file whose name ends in ".pot".  */
          if (!(filenamelen >= 4
                && memcmp (filename + filenamelen - 4, ".pot", 4) == 0
                && strcmp (charset, "CHARSET") == 0))
            {
              char *warning_message =
                xasprintf (_("\
Charset \"%s\" is not a portable encoding name.\n\
Message conversion to user's charset might not work.\n"),
                           charset);
              po_xerror (PO_SEVERITY_WARNING, NULL,
                         filename, (size_t)(-1), (size_t)(-1), true,
                         warning_message);
              free (warning_message);
            }
        }
      else
        {
          const char *envval;

          po_lex_charset = canon_charset;
#if HAVE_ICONV
          /* Release a converter left open by an earlier call.  Both branches
             below assign po_lex_iconv afresh.  */
          if (po_lex_iconv != (iconv_t)(-1))
            iconv_close (po_lex_iconv);
#endif

          /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35
             don't know about multibyte encodings, and require a spurious
             backslash after every multibyte character whose last byte is
             0x5C.  Some programs, like vim, distribute PO files in this
             broken format.  GNU msgfmt must continue to support this old
             PO file format when the Makefile requests it.  */
          envval = getenv ("OLD_PO_FILE_INPUT");
          if (envval != NULL && *envval != '\0')
            {
              /* Assume the PO file is in old format, with extraneous
                 backslashes.  */
#if HAVE_ICONV
              po_lex_iconv = (iconv_t)(-1);
#endif
              po_lex_weird_cjk = false;
            }
          else
            {
              /* Use iconv() to parse multibyte characters.  */
#if HAVE_ICONV
              /* Avoid glibc-2.1 bug with EUC-KR.  */
# if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
     && !defined _LIBICONV_VERSION
              if (strcmp (po_lex_charset, "EUC-KR") == 0)
                po_lex_iconv = (iconv_t)(-1);
              else
# endif
              /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS,
                 GBK, GB18030.  */
# if defined __sun && !defined _LIBICONV_VERSION
              if (   strcmp (po_lex_charset, "GB2312") == 0
                  || strcmp (po_lex_charset, "EUC-TW") == 0
                  || strcmp (po_lex_charset, "BIG5") == 0
                  || strcmp (po_lex_charset, "BIG5-HKSCS") == 0
                  || strcmp (po_lex_charset, "GBK") == 0
                  || strcmp (po_lex_charset, "GB18030") == 0)
                po_lex_iconv = (iconv_t)(-1);
              else
# endif
              /* This statement is the final else-branch of whichever of the
                 workarounds above are compiled in.  */
              po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
              if (po_lex_iconv == (iconv_t)(-1))
                {
                  const char *progname;
                  char *warning_message;
                  const char *recommendation;
                  const char *note;
                  char *whole_message;

# if IN_LIBGETTEXTPO
                  progname = "libgettextpo";
# else
                  progname = last_component (program_name);
# endif

                  warning_message =
                    xasprintf (_("\
Charset \"%s\" is not supported. %s relies on iconv(),\n\
and iconv() does not support \"%s\".\n"),
                               po_lex_charset, progname, po_lex_charset);

# if !defined _LIBICONV_VERSION
                  recommendation = _("\
Installing GNU libiconv and then reinstalling GNU gettext\n\
would fix this problem.\n");
# else
                  recommendation = "";
# endif

                  /* Test for a charset which has double-byte characters
                     ending in 0x5C.  For these encodings, the string parser
                     is likely to be confused if it can't see the character
                     boundaries.  */
                  po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
                  if (po_is_charset_weird (po_lex_charset)
                      && !po_lex_weird_cjk)
                    note = _("Continuing anyway, expect parse errors.");
                  else
                    note = _("Continuing anyway.");

                  /* Join the three parts into the single message that is
                     reported.  */
                  whole_message =
                    xasprintf ("%s%s%s\n",
                               warning_message, recommendation, note);

                  po_xerror (PO_SEVERITY_WARNING, NULL,
                             filename, (size_t)(-1), (size_t)(-1), true,
                             whole_message);

                  free (whole_message);
                  free (warning_message);
                }
#else
              /* Test for a charset which has double-byte characters
                 ending in 0x5C.  For these encodings, the string parser
                 is likely to be confused if it can't see the character
                 boundaries.  */
              po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
              if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk)
                {
                  const char *progname;
                  char *warning_message;
                  const char *recommendation;
                  const char *note;
                  char *whole_message;

# if IN_LIBGETTEXTPO
                  progname = "libgettextpo";
# else
                  progname = last_component (program_name);
# endif

                  warning_message =
                    xasprintf (_("\
Charset \"%s\" is not supported. %s relies on iconv().\n\
This version was built without iconv().\n"),
                               po_lex_charset, progname);

                  recommendation = _("\
Installing GNU libiconv and then reinstalling GNU gettext\n\
would fix this problem.\n");

                  note = _("Continuing anyway, expect parse errors.");

                  /* Join the three parts into the single message that is
                     reported.  */
                  whole_message =
                    xasprintf ("%s%s%s\n",
                               warning_message, recommendation, note);

                  po_xerror (PO_SEVERITY_WARNING, NULL,
                             filename, (size_t)(-1), (size_t)(-1), true,
                             whole_message);

                  free (whole_message);
                  free (warning_message);
                }
#endif
            }
        }
      /* Release the scratch copy of the charset name obtained from
         xmalloca.  */
      freea (charset);
    }
  else
    {
      /* Don't warn for POT files, because POT files usually contain
         only ASCII msgids.  */
      size_t filenamelen = strlen (filename);

      if (!(filenamelen >= 4
            && memcmp (filename + filenamelen - 4, ".pot", 4) == 0))
        po_xerror (PO_SEVERITY_WARNING,
                   NULL, filename, (size_t)(-1), (size_t)(-1), true,
                   _("\
Charset missing in header.\n\
Message conversion to user's charset will not work.\n"));
    }
}
664 
665 void
po_lex_charset_close()666 po_lex_charset_close ()
667 {
668   po_lex_charset = NULL;
669 #if HAVE_ICONV
670   if (po_lex_iconv != (iconv_t)(-1))
671     {
672       iconv_close (po_lex_iconv);
673       po_lex_iconv = (iconv_t)(-1);
674     }
675 #endif
676   po_lex_weird_cjk = false;
677 }
678