1 /* Reading Java .properties files.
2 Copyright (C) 2003, 2005-2007, 2009, 2018, 2020 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
21
22 /* Specification. */
23 #include "read-properties.h"
24
25 #include <assert.h>
26 #include <errno.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31
32 #include "error.h"
33 #include "error-progname.h"
34 #include "message.h"
35 #include "read-catalog-abstract.h"
36 #include "xalloc.h"
37 #include "xvasprintf.h"
38 #include "po-xerror.h"
39 #include "msgl-ascii.h"
40 #include "read-file.h"
41 #include "unistr.h"
42 #include "gettext.h"
43
44 #define _(str) gettext (str)
45
46 /* For compiling this file in C++ mode. */
47 #ifdef __cplusplus
48 # define this thiss
49 #endif
50
51
/* The format of the Java .properties files is documented in the JDK
   documentation for class java.util.Properties.  In the case of .properties
   files for PropertyResourceBundle, each non-comment line contains a
   key/value pair in the form "key = value" or "key : value" or "key value",
   where the key is the msgid and the value is the msgstr.  Messages with
   plurals are not supported in this format.

   The encoding of Java .properties files is:
     - ASCII with Java \uxxxx escape sequences,
     - ISO-8859-1 if non-ASCII bytes are encountered,
     - UTF-8 if non-ASCII bytes are encountered and the entire file is
       valid UTF-8 (in Java 9 or newer), see
       https://docs.oracle.com/javase/9/intl/internationalization-enhancements-jdk-9.htm  */
65
66 /* Handling of comments: We copy all comments from the .properties file to
67 the PO file. This is not really needed; it's a service for translators
68 who don't like PO files and prefer to maintain the .properties file. */
69
70 /* Real filename, used in error messages about the input file. */
71 static const char *real_file_name;
72
73 /* File name and line number. */
74 extern lex_pos_ty gram_pos;
75
76 /* The contents of the input file. */
77 static char *contents;
78 static size_t contents_length;
79
80 /* True if the input file is assumed to be in UTF-8 encoding.
81 False if it is assumed to be in ISO-8859-1 encoding. */
82 static bool assume_utf8;
83
84 /* Current position in contents. */
85 static size_t position;
86
87 /* Phase 1: Read an input byte.
88 Max. 1 pushback byte. */
89
90 static int
phase1_getc()91 phase1_getc ()
92 {
93 if (position == contents_length)
94 return EOF;
95
96 return (unsigned char) contents[position++];
97 }
98
99 static inline void
phase1_ungetc(int c)100 phase1_ungetc (int c)
101 {
102 if (c != EOF)
103 position--;
104 }
105
106
107 /* Phase 2: Read an input byte, treating CR/LF like a single LF.
108 Max. 2 pushback bytes. */
109
110 static unsigned char phase2_pushback[2];
111 static int phase2_pushback_length;
112
113 static int
phase2_getc()114 phase2_getc ()
115 {
116 int c;
117
118 if (phase2_pushback_length)
119 c = phase2_pushback[--phase2_pushback_length];
120 else
121 {
122 c = phase1_getc ();
123
124 if (c == '\r')
125 {
126 int c2 = phase1_getc ();
127 if (c2 == '\n')
128 c = c2;
129 else
130 phase1_ungetc (c2);
131 }
132 }
133
134 if (c == '\n')
135 gram_pos.line_number++;
136
137 return c;
138 }
139
140 static void
phase2_ungetc(int c)141 phase2_ungetc (int c)
142 {
143 if (c == '\n')
144 --gram_pos.line_number;
145 if (c != EOF)
146 phase2_pushback[phase2_pushback_length++] = c;
147 }
148
149
/* Phase 3: Read an input byte, treating CR/LF like a single LF,
   with handling of continuation lines: a backslash immediately followed by
   a newline is dropped, together with the whitespace at the start of the
   following line.
   Max. 1 pushback character.  */

static int
phase3_getc ()
{
  int c = phase2_getc ();

  while (c == '\\')
    {
      int next = phase2_getc ();

      if (next != '\n')
        {
          /* An ordinary backslash: keep what follows it for the next
             call.  */
          phase2_ungetc (next);
          return '\\';
        }

      /* Skip the backslash-newline and all whitespace that follows it.
         The first non-whitespace byte is examined by the enclosing loop,
         since it may start another continuation.  */
      for (;;)
        {
          c = phase2_getc ();
          if (!(c == ' ' || c == '\t' || c == '\r' || c == '\f'))
            break;
        }
    }

  return c;
}
177
/* Pushes back the byte C returned by phase3_getc.  Phase 3 has no pushback
   storage of its own; the byte is handed to the phase 2 pushback.  */
static inline void
phase3_ungetc (int c)
{
  phase2_ungetc (c);
}
183
184
185 /* Converts a string from ISO-8859-1 encoding to UTF-8 encoding. */
186 static char *
conv_from_iso_8859_1(char * string)187 conv_from_iso_8859_1 (char *string)
188 {
189 if (is_ascii_string (string))
190 return string;
191 else
192 {
193 size_t length = strlen (string);
194 /* Each ISO-8859-1 character needs 2 bytes at worst. */
195 unsigned char *utf8_string = XNMALLOC (2 * length + 1, unsigned char);
196 unsigned char *q = utf8_string;
197 const char *str = string;
198 const char *str_limit = str + length;
199
200 while (str < str_limit)
201 {
202 unsigned int uc = (unsigned char) *str++;
203 int n = u8_uctomb (q, uc, 6);
204 assert (n > 0);
205 q += n;
206 }
207 *q = '\0';
208 assert (q - utf8_string <= 2 * length);
209
210 return (char *) utf8_string;
211 }
212 }
213
214
/* Parses exactly four hexadecimal digits starting at S.
   On success stores their value in *VALUE and returns true.
   Returns false at the first byte that is not a hexadecimal digit (this
   includes the terminating NUL), without looking at any byte after it.  */
static bool
java_parse_hex4 (const char *s, unsigned int *value)
{
  unsigned int n = 0;
  int i;

  for (i = 0; i < 4; i++)
    {
      int c1 = (unsigned char) s[i];

      if (c1 >= '0' && c1 <= '9')
        n = (n << 4) + (c1 - '0');
      else if (c1 >= 'A' && c1 <= 'F')
        n = (n << 4) + (c1 - 'A' + 10);
      else if (c1 >= 'a' && c1 <= 'f')
        n = (n << 4) + (c1 - 'a' + 10);
      else
        return false;
    }

  *value = n;
  return true;
}

/* Converts a string from JAVA encoding (with \uxxxx sequences) to UTF-8
   encoding.  May destructively modify the argument string.
   Returns STRING, which has been converted in place.
   A \uxxxx sequence that cannot be converted is copied literally: this
   covers malformed hex digits, a high surrogate that is not followed by a
   low surrogate escape, and a low surrogate that stands alone.  */
static char *
conv_from_java (char *string)
{
  /* This conversion can only shrink the string, never increase its size:
     a 6-byte escape yields at most 3 bytes, a 12-byte surrogate pair
     yields 4 bytes.  So there is no need to xmalloc the result freshly,
     and the write pointer Q never overtakes the read pointer P.  */
  const char *p = string;
  unsigned char *q = (unsigned char *) string;

  while (*p != '\0')
    {
      unsigned int n;

      if (p[0] == '\\' && p[1] == 'u' && java_parse_hex4 (p + 2, &n))
        {
          unsigned int uc = n;
          size_t consumed = 6;
          bool convertible = true;

          if (n >= 0xd800 && n < 0xdc00)
            {
              /* High surrogate: only meaningful together with a low
                 surrogate escape that follows immediately.  */
              unsigned int m;

              if (p[6] == '\\' && p[7] == 'u'
                  && java_parse_hex4 (p + 8, &m)
                  && m >= 0xdc00 && m < 0xe000)
                {
                  /* Combine two UTF-16 words to a character.  */
                  uc = 0x10000 + ((n - 0xd800) << 10) + (m - 0xdc00);
                  consumed = 12;
                }
              else
                convertible = false;
            }
          else if (n >= 0xdc00 && n < 0xe000)
            /* Lone low surrogate: it has no UTF-8 representation, and the
               encoder would report a negative length for it.  */
            convertible = false;

          if (convertible)
            {
              int len = u8_uctomb (q, uc, 6);

              /* Never advance Q by a negative (error) result.  */
              if (len > 0)
                {
                  q += len;
                  p += consumed;
                  continue;
                }
            }
        }

      /* Not a convertible escape sequence: copy just one byte.  */
      *q++ = (unsigned char) *p++;
    }
  *q = '\0';
  return string;
}
298
299
300 /* Phase 4: Read the next single byte or UTF-16 code point,
301 treating CR/LF like a single LF, with handling of continuation lines
302 and of \uxxxx sequences. */
303
304 /* Return value of phase 4 when EOF is reached. */
305 #define P4_EOF 0xffff
306
307 /* Convert an UTF-16 code point to a return value that can be distinguished
308 from a single-byte return value. */
309 #define UNICODE(code) (0x10000 + (code))
310
311 /* Test a return value of phase 4 whether it designates an UTF-16 code
312 point. */
313 #define IS_UNICODE(p4_result) ((p4_result) >= 0x10000)
314
315 /* Extract the UTF-16 code of a return value that satisfies IS_UNICODE. */
316 #define UTF16_VALUE(p4_result) ((p4_result) - 0x10000)
317
318 static int
phase4_getuc()319 phase4_getuc ()
320 {
321 int c = phase3_getc ();
322
323 if (c == EOF)
324 return P4_EOF;
325 if (c == '\\')
326 {
327 int c2 = phase3_getc ();
328
329 if (c2 == 't')
330 return '\t';
331 if (c2 == 'n')
332 return '\n';
333 if (c2 == 'r')
334 return '\r';
335 if (c2 == 'f')
336 return '\f';
337 if (c2 == 'u')
338 {
339 unsigned int n = 0;
340 int i;
341
342 for (i = 0; i < 4; i++)
343 {
344 int c1 = phase3_getc ();
345
346 if (c1 >= '0' && c1 <= '9')
347 n = (n << 4) + (c1 - '0');
348 else if (c1 >= 'A' && c1 <= 'F')
349 n = (n << 4) + (c1 - 'A' + 10);
350 else if (c1 >= 'a' && c1 <= 'f')
351 n = (n << 4) + (c1 - 'a' + 10);
352 else
353 {
354 phase3_ungetc (c1);
355 po_xerror (PO_SEVERITY_ERROR, NULL,
356 real_file_name, gram_pos.line_number, (size_t)(-1),
357 false, _("warning: invalid \\uxxxx syntax for Unicode character"));
358 return 'u';
359 }
360 }
361 return UNICODE (n);
362 }
363
364 return c2;
365 }
366 else
367 return c;
368 }
369
370
371 /* Reads a key or value string.
372 Returns the string in UTF-8 encoding, or NULL if the end of the logical
373 line is reached.
374 Parsing ends:
375 - when returning NULL, after the end of the logical line,
376 - otherwise, if in_key is true, after the whitespace and possibly the
377 separator that follows after the string,
378 - otherwise, if in_key is false, after the end of the logical line. */
379
380 static char *
read_escaped_string(bool in_key)381 read_escaped_string (bool in_key)
382 {
383 /* The part of the string that has already been converted to UTF-8. */
384 static unsigned char *utf8_buffer;
385 static size_t utf8_buflen;
386 static size_t utf8_allocated;
387 /* The first half of an UTF-16 surrogate character. */
388 unsigned short utf16_surr;
389 /* Line in which this surrogate character occurred. */
390 size_t utf16_surr_line;
391
392 /* Ensures utf8_buffer has room for N bytes. N must be <= 10. */
393 #define utf8_buffer_ensure_available(n) \
394 do \
395 { \
396 if (utf8_buflen + (n) > utf8_allocated) \
397 { \
398 utf8_allocated = 2 * utf8_allocated + 10; \
399 utf8_buffer = \
400 (unsigned char *) xrealloc (utf8_buffer, utf8_allocated); \
401 } \
402 } \
403 while (0)
404
405 /* Appends a lone surrogate to utf8_buffer. */
406 /* Note: A half surrogate is invalid in UTF-8:
407 - RFC 3629 says
408 "The definition of UTF-8 prohibits encoding character
409 numbers between U+D800 and U+DFFF".
410 - Unicode 4.0 chapter 3
411 <https://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
412 section 3.9, p.77, says
413 "Because surrogate code points are not Unicode scalar
414 values, any UTF-8 byte sequence that would otherwise
415 map to code points D800..DFFF is ill-formed."
416 and in table 3-6, p. 78, does not mention D800..DFFF.
417 - The unicode.org FAQ question "How do I convert an unpaired
418 UTF-16 surrogate to UTF-8?" has the answer
419 "By representing such an unpaired surrogate on its own
420 as a 3-byte sequence, the resulting UTF-8 data stream
421 would become ill-formed."
422 So use U+FFFD instead. */
423 #define utf8_buffer_append_lone_surrogate(uc, line) \
424 do \
425 { \
426 error_with_progname = false; \
427 po_xerror (PO_SEVERITY_ERROR, NULL, \
428 real_file_name, (line), (size_t)(-1), false, \
429 xasprintf (_("warning: lone surrogate U+%04X"), (uc))); \
430 error_with_progname = true; \
431 utf8_buffer_ensure_available (3); \
432 utf8_buffer[utf8_buflen++] = 0xef; \
433 utf8_buffer[utf8_buflen++] = 0xbf; \
434 utf8_buffer[utf8_buflen++] = 0xbd; \
435 } \
436 while (0)
437
438 int c;
439
440 /* Skip whitespace before the string. */
441 do
442 c = phase3_getc ();
443 while (c == ' ' || c == '\t' || c == '\r' || c == '\f');
444
445 if (c == EOF || c == '\n')
446 /* Empty string. */
447 return NULL;
448
449 /* Start accumulating the string. */
450 utf8_buflen = 0;
451 utf16_surr = 0;
452 utf16_surr_line = 0;
453 for (;;)
454 {
455 if (in_key && (c == '=' || c == ':'
456 || c == ' ' || c == '\t' || c == '\r' || c == '\f'))
457 {
458 /* Skip whitespace after the string. */
459 while (c == ' ' || c == '\t' || c == '\r' || c == '\f')
460 c = phase3_getc ();
461 /* Skip '=' or ':' separator. */
462 if (!(c == '=' || c == ':'))
463 phase3_ungetc (c);
464 break;
465 }
466
467 phase3_ungetc (c);
468
469 /* Read the next byte or UTF-16 code point. */
470 c = phase4_getuc ();
471 if (c == P4_EOF)
472 break;
473
474 /* Append it to the buffer. */
475 if (IS_UNICODE (c))
476 {
477 /* Append an UTF-16 code point. */
478 /* Test whether this character and the previous one form a Unicode
479 surrogate pair. */
480 if (utf16_surr != 0
481 && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
482 {
483 unsigned short utf16buf[2];
484 ucs4_t uc;
485 int len;
486
487 utf16buf[0] = utf16_surr;
488 utf16buf[1] = UTF16_VALUE (c);
489 if (u16_mbtouc (&uc, utf16buf, 2) != 2)
490 abort ();
491
492 utf8_buffer_ensure_available (6);
493 len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 6);
494 if (len < 0)
495 {
496 error_with_progname = false;
497 po_xerror (PO_SEVERITY_ERROR, NULL,
498 real_file_name, gram_pos.line_number, (size_t)(-1),
499 false, _("warning: invalid Unicode character"));
500 error_with_progname = true;
501 }
502 else
503 utf8_buflen += len;
504
505 utf16_surr = 0;
506 }
507 else
508 {
509 if (utf16_surr != 0)
510 {
511 utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);
512 utf16_surr = 0;
513 }
514
515 if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
516 {
517 utf16_surr = UTF16_VALUE (c);
518 utf16_surr_line = gram_pos.line_number;
519 }
520 else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))
521 utf8_buffer_append_lone_surrogate (UTF16_VALUE (c), gram_pos.line_number);
522 else
523 {
524 ucs4_t uc = UTF16_VALUE (c);
525 int len;
526
527 utf8_buffer_ensure_available (3);
528 len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 3);
529 if (len < 0)
530 {
531 error_with_progname = false;
532 po_xerror (PO_SEVERITY_ERROR, NULL,
533 real_file_name, gram_pos.line_number, (size_t)(-1),
534 false, _("warning: invalid Unicode character"));
535 error_with_progname = true;
536 }
537 else
538 utf8_buflen += len;
539 }
540 }
541 }
542 else
543 {
544 /* Append a single byte. */
545 if (utf16_surr != 0)
546 {
547 utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);
548 utf16_surr = 0;
549 }
550
551 if (assume_utf8)
552 {
553 /* No conversion needed. */
554 utf8_buffer_ensure_available (1);
555 utf8_buffer[utf8_buflen++] = c;
556 }
557 else
558 {
559 /* Convert the byte from ISO-8859-1 to UTF-8 on the fly. */
560 ucs4_t uc = c;
561 int len;
562
563 utf8_buffer_ensure_available (2);
564 len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 2);
565 if (len < 0)
566 abort ();
567 utf8_buflen += len;
568 }
569 }
570
571 c = phase3_getc ();
572 if (c == EOF || c == '\n')
573 {
574 if (in_key)
575 phase3_ungetc (c);
576 break;
577 }
578 }
579 if (utf16_surr != 0)
580 utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);
581
582 /* Return the result. */
583 {
584 unsigned char *utf8_string = XNMALLOC (utf8_buflen + 1, unsigned char);
585 if (utf8_buflen > 0)
586 memcpy (utf8_string, utf8_buffer, utf8_buflen);
587 utf8_string[utf8_buflen] = '\0';
588
589 return (char *) utf8_string;
590 }
591 #undef utf8_buffer_append_lone_surrogate
592 #undef utf8_buffer_ensure_available
593 }
594
595
596 /* Read a .properties file from a stream, and dispatch to the various
597 abstract_catalog_reader_class_ty methods. */
598 static void
properties_parse(abstract_catalog_reader_ty * this,FILE * file,const char * real_filename,const char * logical_filename)599 properties_parse (abstract_catalog_reader_ty *this, FILE *file,
600 const char *real_filename, const char *logical_filename)
601 {
602 /* Read the file into memory. */
603 contents = fread_file (file, 0, &contents_length);
604 if (contents == NULL)
605 {
606 const char *errno_description = strerror (errno);
607 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
608 xasprintf ("%s: %s",
609 xasprintf (_("error while reading \"%s\""),
610 real_filename),
611 errno_description));
612 return;
613 }
614
615 /* Test whether it's valid UTF-8. */
616 assume_utf8 = (u8_check ((uint8_t *) contents, contents_length) == NULL);
617
618 position = 0;
619 real_file_name = real_filename;
620 gram_pos.file_name = xstrdup (real_file_name);
621 gram_pos.line_number = 1;
622
623 for (;;)
624 {
625 int c;
626 bool comment;
627 bool hidden;
628
629 c = phase2_getc ();
630
631 if (c == EOF)
632 break;
633
634 comment = false;
635 hidden = false;
636 if (c == '#')
637 comment = true;
638 else if (c == '!')
639 {
640 /* For compatibility with write-properties.c, we treat '!' not
641 followed by space as a fuzzy or untranslated message. */
642 int c2 = phase2_getc ();
643 if (c2 == ' ' || c2 == '\n' || c2 == EOF)
644 comment = true;
645 else
646 hidden = true;
647 phase2_ungetc (c2);
648 }
649 else
650 phase2_ungetc (c);
651
652 if (comment)
653 {
654 /* A comment line. */
655 static char *buffer;
656 static size_t bufmax;
657 static size_t buflen;
658
659 buflen = 0;
660 for (;;)
661 {
662 c = phase2_getc ();
663
664 if (buflen >= bufmax)
665 {
666 bufmax += 100;
667 buffer = xrealloc (buffer, bufmax);
668 }
669
670 if (c == EOF || c == '\n')
671 break;
672
673 buffer[buflen++] = c;
674 }
675 buffer[buflen] = '\0';
676
677 po_callback_comment_dispatcher (
678 conv_from_java (
679 assume_utf8 ? buffer : conv_from_iso_8859_1 (buffer)));
680 }
681 else
682 {
683 /* A key/value pair. */
684 char *msgid;
685 lex_pos_ty msgid_pos;
686
687 msgid_pos = gram_pos;
688 msgid = read_escaped_string (true);
689 if (msgid == NULL)
690 /* Skip blank line. */
691 ;
692 else
693 {
694 char *msgstr;
695 lex_pos_ty msgstr_pos;
696 bool force_fuzzy;
697
698 msgstr_pos = gram_pos;
699 msgstr = read_escaped_string (false);
700 if (msgstr == NULL)
701 msgstr = xstrdup ("");
702
703 /* Be sure to make the message fuzzy if it was commented out
704 and if it is not already header/fuzzy/untranslated. */
705 force_fuzzy = (hidden && msgid[0] != '\0' && msgstr[0] != '\0');
706
707 po_callback_message (NULL, msgid, &msgid_pos, NULL,
708 msgstr, strlen (msgstr) + 1, &msgstr_pos,
709 NULL, NULL, NULL,
710 force_fuzzy, false);
711 }
712 }
713 }
714
715 free (contents);
716 contents = NULL;
717 real_file_name = NULL;
718 gram_pos.line_number = 0;
719 }
720
721 const struct catalog_input_format input_format_properties =
722 {
723 properties_parse, /* parse */
724 true /* produces_utf8 */
725 };
726