1 /* Reading Java .properties files.
2    Copyright (C) 2003, 2005-2007, 2009, 2018, 2020 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2003.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17 
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
21 
22 /* Specification.  */
23 #include "read-properties.h"
24 
25 #include <assert.h>
26 #include <errno.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 
32 #include "error.h"
33 #include "error-progname.h"
34 #include "message.h"
35 #include "read-catalog-abstract.h"
36 #include "xalloc.h"
37 #include "xvasprintf.h"
38 #include "po-xerror.h"
39 #include "msgl-ascii.h"
40 #include "read-file.h"
41 #include "unistr.h"
42 #include "gettext.h"
43 
44 #define _(str) gettext (str)
45 
46 /* For compiling this file in C++ mode.  */
47 #ifdef __cplusplus
48 # define this thiss
49 #endif
50 
51 
/* The format of the Java .properties files is documented in the JDK
   documentation for class java.util.Properties.  In the case of .properties
   files for PropertyResourceBundle, each non-comment line contains a
   key/value pair in the form "key = value" or "key : value" or "key value",
   where the key is the msgid and the value is the msgstr.  Messages with
   plurals are not supported in this format.

   The encoding of Java .properties files is:
     - ASCII with Java \uxxxx escape sequences,
     - ISO-8859-1 if non-ASCII bytes are encountered,
     - UTF-8 if non-ASCII bytes are encountered and the entire file is
       valid UTF-8 (in Java 9 or newer), see
       https://docs.oracle.com/javase/9/intl/internationalization-enhancements-jdk-9.htm */
65 
66 /* Handling of comments: We copy all comments from the .properties file to
67    the PO file. This is not really needed; it's a service for translators
68    who don't like PO files and prefer to maintain the .properties file.  */
69 
70 /* Real filename, used in error messages about the input file.  */
71 static const char *real_file_name;
72 
73 /* File name and line number.  */
74 extern lex_pos_ty gram_pos;
75 
76 /* The contents of the input file.  */
77 static char *contents;
78 static size_t contents_length;
79 
80 /* True if the input file is assumed to be in UTF-8 encoding.
81    False if it is assumed to be in ISO-8859-1 encoding.  */
82 static bool assume_utf8;
83 
84 /* Current position in contents.  */
85 static size_t position;
86 
87 /* Phase 1: Read an input byte.
88    Max. 1 pushback byte.  */
89 
90 static int
phase1_getc()91 phase1_getc ()
92 {
93   if (position == contents_length)
94     return EOF;
95 
96   return (unsigned char) contents[position++];
97 }
98 
99 static inline void
phase1_ungetc(int c)100 phase1_ungetc (int c)
101 {
102   if (c != EOF)
103     position--;
104 }
105 
106 
107 /* Phase 2: Read an input byte, treating CR/LF like a single LF.
108    Max. 2 pushback bytes.  */
109 
110 static unsigned char phase2_pushback[2];
111 static int phase2_pushback_length;
112 
113 static int
phase2_getc()114 phase2_getc ()
115 {
116   int c;
117 
118   if (phase2_pushback_length)
119     c = phase2_pushback[--phase2_pushback_length];
120   else
121     {
122       c = phase1_getc ();
123 
124       if (c == '\r')
125         {
126           int c2 = phase1_getc ();
127           if (c2 == '\n')
128             c = c2;
129           else
130             phase1_ungetc (c2);
131         }
132     }
133 
134   if (c == '\n')
135     gram_pos.line_number++;
136 
137   return c;
138 }
139 
140 static void
phase2_ungetc(int c)141 phase2_ungetc (int c)
142 {
143   if (c == '\n')
144     --gram_pos.line_number;
145   if (c != EOF)
146     phase2_pushback[phase2_pushback_length++] = c;
147 }
148 
149 
/* Phase 3: Read an input byte, treating CR/LF like a single LF,
   with handling of continuation lines.
   Max. 1 pushback character.  */

/* Returns the next byte of the logical line, or EOF.
   A backslash that is immediately followed by a newline is dropped together
   with the newline and the blanks (space, tab, CR, form feed) that follow
   it.  A backslash followed by anything else is returned as '\\', and the
   byte after it is pushed back to phase 2.  */
static int
phase3_getc ()
{
  int c = phase2_getc ();

  while (c == '\\')
    {
      int next = phase2_getc ();

      if (next != '\n')
        {
          /* Not a continuation line: hand out the backslash itself.  */
          phase2_ungetc (next);
          return '\\';
        }

      /* Continuation line: swallow the leading whitespace of the next
         physical line, then look at the first byte after it.  */
      do
        c = phase2_getc ();
      while (c == ' ' || c == '\t' || c == '\r' || c == '\f');
    }

  return c;
}
177 
/* Pushes back the byte C returned by phase3_getc.  Phase 3 keeps no
   pushback storage of its own; the byte goes onto the phase 2 stack.  */
static inline void
phase3_ungetc (int c)
{
  phase2_ungetc (c);
}
183 
184 
185 /* Converts a string from ISO-8859-1 encoding to UTF-8 encoding.  */
186 static char *
conv_from_iso_8859_1(char * string)187 conv_from_iso_8859_1 (char *string)
188 {
189   if (is_ascii_string (string))
190     return string;
191   else
192     {
193       size_t length = strlen (string);
194       /* Each ISO-8859-1 character needs 2 bytes at worst.  */
195       unsigned char *utf8_string = XNMALLOC (2 * length + 1, unsigned char);
196       unsigned char *q = utf8_string;
197       const char *str = string;
198       const char *str_limit = str + length;
199 
200       while (str < str_limit)
201         {
202           unsigned int uc = (unsigned char) *str++;
203           int n = u8_uctomb (q, uc, 6);
204           assert (n > 0);
205           q += n;
206         }
207       *q = '\0';
208       assert (q - utf8_string <= 2 * length);
209 
210       return (char *) utf8_string;
211     }
212 }
213 
214 
/* Parses exactly four hexadecimal digits starting at S.
   On success stores their value in *VALUE and returns true.
   Returns false at the first byte that is not a hexadecimal digit (this
   includes the terminating NUL), without reading any byte after it.  */
static bool
java_hex4_value (const char *s, unsigned int *value)
{
  unsigned int n = 0;
  int i;

  for (i = 0; i < 4; i++)
    {
      int c1 = (unsigned char) s[i];

      if (c1 >= '0' && c1 <= '9')
        n = (n << 4) + (c1 - '0');
      else if (c1 >= 'A' && c1 <= 'F')
        n = (n << 4) + (c1 - 'A' + 10);
      else if (c1 >= 'a' && c1 <= 'f')
        n = (n << 4) + (c1 - 'a' + 10);
      else
        return false;
    }

  *value = n;
  return true;
}

/* Converts a string from JAVA encoding (with \uxxxx sequences) to UTF-8
   encoding.  Destructively modifies the argument string and returns it.

   A \uxxxx sequence is replaced by the UTF-8 encoding of the character.
   Two consecutive sequences that form a UTF-16 surrogate pair are replaced
   by the single character they denote.  A sequence that is malformed, or
   that denotes a surrogate which is not part of such a pair, is left in the
   string as literal text.  */
static char *
conv_from_java (char *string)
{
  /* This conversion can only shrink the string, never increase its size:
     a 6-byte escape yields at most 3 bytes, a 12-byte pair yields 4 bytes.
     So there is no need to xmalloc the result freshly.  */
  const char *p = string;
  unsigned char *q = (unsigned char *) string;

  while (*p != '\0')
    {
      unsigned int n;

      if (p[0] == '\\' && p[1] == 'u' && java_hex4_value (p + 2, &n))
        {
          unsigned int uc = 0;
          size_t consumed = 0;

          if (n >= 0xd800 && n < 0xdc00)
            {
              /* High surrogate: only usable together with a following
                 low surrogate escape.  */
              unsigned int m;

              if (p[6] == '\\' && p[7] == 'u'
                  && java_hex4_value (p + 8, &m)
                  && m >= 0xdc00 && m < 0xe000)
                {
                  /* Combine two UTF-16 words to a character.  */
                  uc = 0x10000 + ((n - 0xd800) << 10) + (m - 0xdc00);
                  consumed = 12;
                }
            }
          else if (!(n >= 0xdc00 && n < 0xe000))
            {
              uc = n;
              consumed = 6;
            }
          /* Otherwise N is a low surrogate without a preceding high
             surrogate.  u8_uctomb rejects it with a negative return value,
             which must not be added to Q; keep the escape as literal
             text, like for a lone high surrogate.  */

          if (consumed > 0)
            {
              int len = u8_uctomb (q, uc, 6);

              if (len > 0)
                {
                  q += len;
                  p += consumed;
                  continue;
                }
            }
        }

      /* Copy just one byte.  */
      *q++ = (unsigned char) *p++;
    }
  *q = '\0';
  return string;
}
298 
299 
300 /* Phase 4: Read the next single byte or UTF-16 code point,
301    treating CR/LF like a single LF, with handling of continuation lines
302    and of \uxxxx sequences.  */
303 
304 /* Return value of phase 4 when EOF is reached.  */
305 #define P4_EOF 0xffff
306 
307 /* Convert an UTF-16 code point to a return value that can be distinguished
308    from a single-byte return value.  */
309 #define UNICODE(code) (0x10000 + (code))
310 
311 /* Test a return value of phase 4 whether it designates an UTF-16 code
312    point.  */
313 #define IS_UNICODE(p4_result) ((p4_result) >= 0x10000)
314 
315 /* Extract the UTF-16 code of a return value that satisfies IS_UNICODE.  */
316 #define UTF16_VALUE(p4_result) ((p4_result) - 0x10000)
317 
318 static int
phase4_getuc()319 phase4_getuc ()
320 {
321   int c = phase3_getc ();
322 
323   if (c == EOF)
324     return P4_EOF;
325   if (c == '\\')
326     {
327       int c2 = phase3_getc ();
328 
329       if (c2 == 't')
330         return '\t';
331       if (c2 == 'n')
332         return '\n';
333       if (c2 == 'r')
334         return '\r';
335       if (c2 == 'f')
336         return '\f';
337       if (c2 == 'u')
338         {
339           unsigned int n = 0;
340           int i;
341 
342           for (i = 0; i < 4; i++)
343             {
344               int c1 = phase3_getc ();
345 
346               if (c1 >= '0' && c1 <= '9')
347                 n = (n << 4) + (c1 - '0');
348               else if (c1 >= 'A' && c1 <= 'F')
349                 n = (n << 4) + (c1 - 'A' + 10);
350               else if (c1 >= 'a' && c1 <= 'f')
351                 n = (n << 4) + (c1 - 'a' + 10);
352               else
353                 {
354                   phase3_ungetc (c1);
355                   po_xerror (PO_SEVERITY_ERROR, NULL,
356                              real_file_name, gram_pos.line_number, (size_t)(-1),
357                              false, _("warning: invalid \\uxxxx syntax for Unicode character"));
358                   return 'u';
359                 }
360             }
361           return UNICODE (n);
362         }
363 
364       return c2;
365     }
366   else
367     return c;
368 }
369 
370 
371 /* Reads a key or value string.
372    Returns the string in UTF-8 encoding, or NULL if the end of the logical
373    line is reached.
374    Parsing ends:
375      - when returning NULL, after the end of the logical line,
376      - otherwise, if in_key is true, after the whitespace and possibly the
377        separator that follows after the string,
378      - otherwise, if in_key is false, after the end of the logical line. */
379 
380 static char *
read_escaped_string(bool in_key)381 read_escaped_string (bool in_key)
382 {
383   /* The part of the string that has already been converted to UTF-8.  */
384   static unsigned char *utf8_buffer;
385   static size_t utf8_buflen;
386   static size_t utf8_allocated;
387   /* The first half of an UTF-16 surrogate character.  */
388   unsigned short utf16_surr;
389   /* Line in which this surrogate character occurred.  */
390   size_t utf16_surr_line;
391 
392   /* Ensures utf8_buffer has room for N bytes.  N must be <= 10.  */
393   #define utf8_buffer_ensure_available(n)  \
394     do                                                                        \
395       {                                                                       \
396         if (utf8_buflen + (n) > utf8_allocated)                               \
397           {                                                                   \
398             utf8_allocated = 2 * utf8_allocated + 10;                         \
399             utf8_buffer =                                                     \
400               (unsigned char *) xrealloc (utf8_buffer, utf8_allocated);       \
401           }                                                                   \
402       }                                                                       \
403     while (0)
404 
405   /* Appends a lone surrogate to utf8_buffer.  */
406   /* Note: A half surrogate is invalid in UTF-8:
407      - RFC 3629 says
408          "The definition of UTF-8 prohibits encoding character
409           numbers between U+D800 and U+DFFF".
410      - Unicode 4.0 chapter 3
411        <https://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
412        section 3.9, p.77, says
413          "Because surrogate code points are not Unicode scalar
414           values, any UTF-8 byte sequence that would otherwise
415           map to code points D800..DFFF is ill-formed."
416        and in table 3-6, p. 78, does not mention D800..DFFF.
417      - The unicode.org FAQ question "How do I convert an unpaired
418        UTF-16 surrogate to UTF-8?" has the answer
419          "By representing such an unpaired surrogate on its own
420           as a 3-byte sequence, the resulting UTF-8 data stream
421           would become ill-formed."
422      So use U+FFFD instead.  */
423   #define utf8_buffer_append_lone_surrogate(uc, line) \
424     do                                                                        \
425       {                                                                       \
426         error_with_progname = false;                                          \
427         po_xerror (PO_SEVERITY_ERROR, NULL,                                   \
428                    real_file_name, (line), (size_t)(-1), false,               \
429                    xasprintf (_("warning: lone surrogate U+%04X"), (uc)));    \
430         error_with_progname = true;                                           \
431         utf8_buffer_ensure_available (3);                                     \
432         utf8_buffer[utf8_buflen++] = 0xef;                                    \
433         utf8_buffer[utf8_buflen++] = 0xbf;                                    \
434         utf8_buffer[utf8_buflen++] = 0xbd;                                    \
435       }                                                                       \
436     while (0)
437 
438   int c;
439 
440   /* Skip whitespace before the string.  */
441   do
442     c = phase3_getc ();
443   while (c == ' ' || c == '\t' || c == '\r' || c == '\f');
444 
445   if (c == EOF || c == '\n')
446     /* Empty string.  */
447     return NULL;
448 
449   /* Start accumulating the string.  */
450   utf8_buflen = 0;
451   utf16_surr = 0;
452   utf16_surr_line = 0;
453   for (;;)
454     {
455       if (in_key && (c == '=' || c == ':'
456                      || c == ' ' || c == '\t' || c == '\r' || c == '\f'))
457         {
458           /* Skip whitespace after the string.  */
459           while (c == ' ' || c == '\t' || c == '\r' || c == '\f')
460             c = phase3_getc ();
461           /* Skip '=' or ':' separator.  */
462           if (!(c == '=' || c == ':'))
463             phase3_ungetc (c);
464           break;
465         }
466 
467       phase3_ungetc (c);
468 
469       /* Read the next byte or UTF-16 code point.  */
470       c = phase4_getuc ();
471       if (c == P4_EOF)
472         break;
473 
474       /* Append it to the buffer.  */
475       if (IS_UNICODE (c))
476         {
477           /* Append an UTF-16 code point.  */
478           /* Test whether this character and the previous one form a Unicode
479              surrogate pair.  */
480           if (utf16_surr != 0
481               && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
482             {
483               unsigned short utf16buf[2];
484               ucs4_t uc;
485               int len;
486 
487               utf16buf[0] = utf16_surr;
488               utf16buf[1] = UTF16_VALUE (c);
489               if (u16_mbtouc (&uc, utf16buf, 2) != 2)
490                 abort ();
491 
492               utf8_buffer_ensure_available (6);
493               len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 6);
494               if (len < 0)
495                 {
496                   error_with_progname = false;
497                   po_xerror (PO_SEVERITY_ERROR, NULL,
498                              real_file_name, gram_pos.line_number, (size_t)(-1),
499                              false, _("warning: invalid Unicode character"));
500                   error_with_progname = true;
501                 }
502               else
503                 utf8_buflen += len;
504 
505               utf16_surr = 0;
506             }
507           else
508             {
509               if (utf16_surr != 0)
510                 {
511                   utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);
512                   utf16_surr = 0;
513                 }
514 
515               if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
516                 {
517                   utf16_surr = UTF16_VALUE (c);
518                   utf16_surr_line = gram_pos.line_number;
519                 }
520               else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))
521                 utf8_buffer_append_lone_surrogate (UTF16_VALUE (c), gram_pos.line_number);
522               else
523                 {
524                   ucs4_t uc = UTF16_VALUE (c);
525                   int len;
526 
527                   utf8_buffer_ensure_available (3);
528                   len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 3);
529                   if (len < 0)
530                     {
531                       error_with_progname = false;
532                       po_xerror (PO_SEVERITY_ERROR, NULL,
533                                  real_file_name, gram_pos.line_number, (size_t)(-1),
534                                  false, _("warning: invalid Unicode character"));
535                       error_with_progname = true;
536                     }
537                   else
538                     utf8_buflen += len;
539                 }
540             }
541         }
542       else
543         {
544           /* Append a single byte.  */
545           if (utf16_surr != 0)
546             {
547               utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);
548               utf16_surr = 0;
549             }
550 
551           if (assume_utf8)
552             {
553               /* No conversion needed.  */
554               utf8_buffer_ensure_available (1);
555               utf8_buffer[utf8_buflen++] = c;
556             }
557           else
558             {
559               /* Convert the byte from ISO-8859-1 to UTF-8 on the fly.  */
560               ucs4_t uc = c;
561               int len;
562 
563               utf8_buffer_ensure_available (2);
564               len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 2);
565               if (len < 0)
566                 abort ();
567               utf8_buflen += len;
568             }
569         }
570 
571       c = phase3_getc ();
572       if (c == EOF || c == '\n')
573         {
574           if (in_key)
575             phase3_ungetc (c);
576           break;
577         }
578     }
579   if (utf16_surr != 0)
580     utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);
581 
582   /* Return the result.  */
583   {
584     unsigned char *utf8_string = XNMALLOC (utf8_buflen + 1, unsigned char);
585     if (utf8_buflen > 0)
586       memcpy (utf8_string, utf8_buffer, utf8_buflen);
587     utf8_string[utf8_buflen] = '\0';
588 
589     return (char *) utf8_string;
590   }
591   #undef utf8_buffer_append_lone_surrogate
592   #undef utf8_buffer_ensure_available
593 }
594 
595 
596 /* Read a .properties file from a stream, and dispatch to the various
597    abstract_catalog_reader_class_ty methods.  */
598 static void
properties_parse(abstract_catalog_reader_ty * this,FILE * file,const char * real_filename,const char * logical_filename)599 properties_parse (abstract_catalog_reader_ty *this, FILE *file,
600                   const char *real_filename, const char *logical_filename)
601 {
602   /* Read the file into memory.  */
603   contents = fread_file (file, 0, &contents_length);
604   if (contents == NULL)
605     {
606       const char *errno_description = strerror (errno);
607       po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
608                  xasprintf ("%s: %s",
609                             xasprintf (_("error while reading \"%s\""),
610                                        real_filename),
611                             errno_description));
612       return;
613     }
614 
615   /* Test whether it's valid UTF-8.  */
616   assume_utf8 = (u8_check ((uint8_t *) contents, contents_length) == NULL);
617 
618   position = 0;
619   real_file_name = real_filename;
620   gram_pos.file_name = xstrdup (real_file_name);
621   gram_pos.line_number = 1;
622 
623   for (;;)
624     {
625       int c;
626       bool comment;
627       bool hidden;
628 
629       c = phase2_getc ();
630 
631       if (c == EOF)
632         break;
633 
634       comment = false;
635       hidden = false;
636       if (c == '#')
637         comment = true;
638       else if (c == '!')
639         {
640           /* For compatibility with write-properties.c, we treat '!' not
641              followed by space as a fuzzy or untranslated message.  */
642           int c2 = phase2_getc ();
643           if (c2 == ' ' || c2 == '\n' || c2 == EOF)
644             comment = true;
645           else
646             hidden = true;
647           phase2_ungetc (c2);
648         }
649       else
650         phase2_ungetc (c);
651 
652       if (comment)
653         {
654           /* A comment line.  */
655           static char *buffer;
656           static size_t bufmax;
657           static size_t buflen;
658 
659           buflen = 0;
660           for (;;)
661             {
662               c = phase2_getc ();
663 
664               if (buflen >= bufmax)
665                 {
666                   bufmax += 100;
667                   buffer = xrealloc (buffer, bufmax);
668                 }
669 
670               if (c == EOF || c == '\n')
671                 break;
672 
673               buffer[buflen++] = c;
674             }
675           buffer[buflen] = '\0';
676 
677           po_callback_comment_dispatcher (
678             conv_from_java (
679               assume_utf8 ? buffer : conv_from_iso_8859_1 (buffer)));
680         }
681       else
682         {
683           /* A key/value pair.  */
684           char *msgid;
685           lex_pos_ty msgid_pos;
686 
687           msgid_pos = gram_pos;
688           msgid = read_escaped_string (true);
689           if (msgid == NULL)
690             /* Skip blank line.  */
691             ;
692           else
693             {
694               char *msgstr;
695               lex_pos_ty msgstr_pos;
696               bool force_fuzzy;
697 
698               msgstr_pos = gram_pos;
699               msgstr = read_escaped_string (false);
700               if (msgstr == NULL)
701                 msgstr = xstrdup ("");
702 
703               /* Be sure to make the message fuzzy if it was commented out
704                  and if it is not already header/fuzzy/untranslated.  */
705               force_fuzzy = (hidden && msgid[0] != '\0' && msgstr[0] != '\0');
706 
707               po_callback_message (NULL, msgid, &msgid_pos, NULL,
708                                    msgstr, strlen (msgstr) + 1, &msgstr_pos,
709                                    NULL, NULL, NULL,
710                                    force_fuzzy, false);
711             }
712         }
713     }
714 
715   free (contents);
716   contents = NULL;
717   real_file_name = NULL;
718   gram_pos.line_number = 0;
719 }
720 
721 const struct catalog_input_format input_format_properties =
722 {
723   properties_parse,                     /* parse */
724   true                                  /* produces_utf8 */
725 };
726