1 /* xgettext RST/RSJ backend.
2 Copyright (C) 2001-2003, 2005-2009, 2018-2019 Free Software Foundation, Inc.
3
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22
23 /* Specification. */
24 #include "x-rst.h"
25
26 #include <errno.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stddef.h>
30 #include <stdlib.h>
31
32 #include "c-ctype.h"
33 #include "po-charset.h"
34 #include "message.h"
35 #include "xgettext.h"
36 #include "xg-pos.h"
37 #include "xg-encoding.h"
38 #include "xg-mixed-string.h"
39 #include "xg-message.h"
40 #include "error.h"
41 #include "error-progname.h"
42 #include "xalloc.h"
43 #include "gettext.h"
44
45 #define _(s) gettext(s)
46
/* RST stands for Resource String Table.

   An RST file consists of several string definitions.  A string definition
   starts at the beginning of a line and looks like this:
       ModuleName.ConstName=StringExpression
   A StringExpression consists of string pieces of the form 'xyz',
   single characters of the form #nnn (decimal integer), and +
   at the end of the line to designate continuation on the next line.
   String definitions can be separated by blank lines or comment lines
   beginning with '#'.

   This backend attempts to be functionally equivalent to the 'rstconv'
   program, part of the Free Pascal run time library, written by
   Sebastian Guenther.  Except that
     * the locations are output as "ModuleName.ConstName",
       not "ModuleName:ConstName",
     * we add the flag '#, object-pascal-format' where appropriate.
 */
65
66 void
extract_rst(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)67 extract_rst (FILE *f,
68 const char *real_filename, const char *logical_filename,
69 flag_context_list_table_ty *flag_table,
70 msgdomain_list_ty *mdlp)
71 {
72 static char *buffer;
73 static int bufmax;
74 message_list_ty *mlp = mdlp->item[0]->messages;
75 int line_number;
76
77 line_number = 1;
78 for (;;)
79 {
80 int c;
81 int bufpos;
82 char *location;
83 char *msgid;
84 lex_pos_ty pos;
85
86 c = getc (f);
87 if (c == EOF)
88 break;
89
90 /* Ignore blank line. */
91 if (c == '\n')
92 {
93 line_number++;
94 continue;
95 }
96
97 /* Ignore comment line. */
98 if (c == '#')
99 {
100 do
101 c = getc (f);
102 while (c != EOF && c != '\n');
103 if (c == EOF)
104 break;
105 line_number++;
106 continue;
107 }
108
109 /* Read ModuleName.ConstName. */
110 bufpos = 0;
111 for (;;)
112 {
113 if (c == EOF || c == '\n')
114 {
115 error_with_progname = false;
116 error (EXIT_FAILURE, 0, _("%s:%d: invalid string definition"),
117 logical_filename, line_number);
118 error_with_progname = true;
119 }
120 if (bufpos >= bufmax)
121 {
122 bufmax = 2 * bufmax + 10;
123 buffer = xrealloc (buffer, bufmax);
124 }
125 if (c == '=')
126 break;
127 buffer[bufpos++] = c;
128 c = getc (f);
129 if (c == EOF && ferror (f))
130 goto bomb;
131 }
132 buffer[bufpos] = '\0';
133 location = xstrdup (buffer);
134
135 /* Read StringExpression. */
136 bufpos = 0;
137 for (;;)
138 {
139 c = getc (f);
140 if (c == EOF)
141 break;
142 else if (c == '\n')
143 {
144 line_number++;
145 break;
146 }
147 else if (c == '\'')
148 {
149 for (;;)
150 {
151 c = getc (f);
152 /* Embedded single quotes like 'abc''def' don't occur.
153 See fpc-1.0.4/compiler/cresstr.pas. */
154 if (c == EOF || c == '\n' || c == '\'')
155 break;
156 if (bufpos >= bufmax)
157 {
158 bufmax = 2 * bufmax + 10;
159 buffer = xrealloc (buffer, bufmax);
160 }
161 buffer[bufpos++] = c;
162 }
163 if (c == EOF)
164 break;
165 else if (c == '\n')
166 {
167 line_number++;
168 break;
169 }
170 }
171 else if (c == '#')
172 {
173 int n;
174 c = getc (f);
175 if (c == EOF && ferror (f))
176 goto bomb;
177 if (c == EOF || !c_isdigit (c))
178 {
179 error_with_progname = false;
180 error (EXIT_FAILURE, 0, _("%s:%d: missing number after #"),
181 logical_filename, line_number);
182 error_with_progname = true;
183 }
184 n = (c - '0');
185 for (;;)
186 {
187 c = getc (f);
188 if (c == EOF || !c_isdigit (c))
189 break;
190 n = n * 10 + (c - '0');
191 }
192 if (bufpos >= bufmax)
193 {
194 bufmax = 2 * bufmax + 10;
195 buffer = xrealloc (buffer, bufmax);
196 }
197 buffer[bufpos++] = (unsigned char) n;
198 if (c == EOF)
199 break;
200 ungetc (c, f);
201 }
202 else if (c == '+')
203 {
204 c = getc (f);
205 if (c == EOF)
206 break;
207 if (c == '\n')
208 line_number++;
209 else
210 ungetc (c, f);
211 }
212 else
213 {
214 error_with_progname = false;
215 error (EXIT_FAILURE, 0, _("%s:%d: invalid string expression"),
216 logical_filename, line_number);
217 error_with_progname = true;
218 }
219 }
220 if (bufpos >= bufmax)
221 {
222 bufmax = 2 * bufmax + 10;
223 buffer = xrealloc (buffer, bufmax);
224 }
225 buffer[bufpos] = '\0';
226 msgid = xstrdup (buffer);
227
228 pos.file_name = location;
229 pos.line_number = (size_t)(-1);
230
231 remember_a_message (mlp, NULL, msgid, false, false, null_context, &pos,
232 NULL, NULL, false);
233
234 /* Here c is the last read character: EOF or '\n'. */
235 if (c == EOF)
236 break;
237 }
238
239 if (ferror (f))
240 {
241 bomb:
242 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
243 real_filename);
244 }
245 }
246
247
/* RSJ stands for Resource String Table in JSON.

   An RSJ file is a JSON file that contains several string definitions.
   It has the format (modulo whitespace)
     {
       "version": 1,
       "strings":
         [
           {
             "hash": <integer>,
             "name": <string>,
             "sourcebytes": [ <integer>... ],
             "value": <string>
           },
           ...
         ]
     }
   The sourcebytes array contains the original source bytes, in the
   source encoding (not guaranteed to be ISO-8859-1, see
   <http://wiki.freepascal.org/FPC_Unicode_support#Source_file_codepage>).

   This backend attempts to be functionally equivalent to the 'rstconv'
   program, part of the Free Pascal run time library, written by
   Sebastian Guenther.  Except that
     * we use the "value" as msgid, not the "sourcebytes",
     * the locations are output as "ModuleName.ConstName",
       not "ModuleName:ConstName",
     * we add the flag '#, object-pascal-format' where appropriate.
 */
277
278 /* For the JSON syntax, refer to RFC 8259. */
279
280 /* ======================== Reading of characters. ======================== */
281
282 /* The input file stream. */
283 static FILE *fp;
284
285
286 /* 1. line_number handling. */
287
288 static int
phase1_getc()289 phase1_getc ()
290 {
291 int c = getc (fp);
292
293 if (c == EOF)
294 {
295 if (ferror (fp))
296 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
297 real_file_name);
298 return EOF;
299 }
300
301 if (c == '\n')
302 line_number++;
303
304 return c;
305 }
306
307 /* Supports only one pushback character. */
308 static void
phase1_ungetc(int c)309 phase1_ungetc (int c)
310 {
311 if (c != EOF)
312 {
313 if (c == '\n')
314 --line_number;
315
316 ungetc (c, fp);
317 }
318 }
319
320
321 /* 2. Skipping whitespace. */
322
/* Tests whether a phase1_getc() result is JSON whitespace, that is, one of
   space, horizontal tab, newline, carriage return.  */
static inline bool
is_whitespace (int c)
{
  switch (c)
    {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
    }
}
329
/* Returns the next character from phase1_getc that is not JSON whitespace;
   this may be EOF.  */
static int
phase2_getc ()
{
  int ch = phase1_getc ();

  while (is_whitespace (ch))
    ch = phase1_getc ();

  return ch;
}
341
/* Pushes back a character returned by phase2_getc.
   Simply forwards to phase1_ungetc.  */
static void
phase2_ungetc (int c)
{
  phase1_ungetc (c);
}
347
348
349 /* ========================== Reading of tokens. ========================== */
350
/* Outcome of an attempt to parse a token.  */

enum parse_result
{
  /* The token was successfully parsed.  */
  pr_parsed,
  /* The next token is of a different type.  */
  pr_none,
  /* There is a syntax error inside the token.  */
  pr_syntax
};
359
360 static char *buffer;
361 static int bufmax;
362
363 /* Parses an integer. Returns it in buffer, of length bufmax.
364 Returns pr_parsed or pr_none. */
365 static enum parse_result
parse_integer()366 parse_integer ()
367 {
368 int c;
369 int bufpos;
370
371 c = phase2_getc ();
372 bufpos = 0;
373 for (;;)
374 {
375 if (bufpos >= bufmax)
376 {
377 bufmax = 2 * bufmax + 10;
378 buffer = xrealloc (buffer, bufmax);
379 }
380 if (!(c >= '0' && c <= '9'))
381 break;
382 buffer[bufpos++] = c;
383 c = phase1_getc ();
384 }
385 phase1_ungetc (c);
386 buffer[bufpos] = '\0';
387 return (bufpos == 0 ? pr_none : pr_parsed);
388 }
389
390 static struct mixed_string_buffer stringbuf;
391
392 /* Parses a string. Returns it in stringbuf, in UTF-8 encoding.
393 Returns a parse_result. */
394 static enum parse_result
parse_string()395 parse_string ()
396 {
397 int c;
398
399 c = phase2_getc ();
400 if (c != '"')
401 {
402 phase2_ungetc (c);
403 return pr_none;
404 }
405 mixed_string_buffer_init (&stringbuf, lc_string,
406 logical_file_name, line_number);
407 for (;;)
408 {
409 c = phase1_getc ();
410 /* Keep line_number in sync. */
411 stringbuf.line_number = line_number;
412 if (c == EOF || (c >= 0 && c < 0x20))
413 return pr_syntax;
414 if (c == '"')
415 break;
416 if (c == '\\')
417 {
418 c = phase1_getc ();
419 if (c == 'u')
420 {
421 unsigned int n = 0;
422 int i;
423
424 for (i = 0; i < 4; i++)
425 {
426 c = phase1_getc ();
427
428 if (c >= '0' && c <= '9')
429 n = (n << 4) + (c - '0');
430 else if (c >= 'A' && c <= 'F')
431 n = (n << 4) + (c - 'A' + 10);
432 else if (c >= 'a' && c <= 'f')
433 n = (n << 4) + (c - 'a' + 10);
434 else
435 return pr_syntax;
436 }
437 mixed_string_buffer_append_unicode (&stringbuf, n);
438 }
439 else
440 {
441 switch (c)
442 {
443 case '"':
444 case '\\':
445 case '/':
446 break;
447 case 'b':
448 c = '\b';
449 break;
450 case 'f':
451 c = '\f';
452 break;
453 case 'n':
454 c = '\n';
455 break;
456 case 'r':
457 c = '\r';
458 break;
459 case 't':
460 c = '\t';
461 break;
462 default:
463 return pr_syntax;
464 }
465 mixed_string_buffer_append_char (&stringbuf, c);
466 }
467 }
468 else
469 mixed_string_buffer_append_char (&stringbuf, c);
470 }
471 return pr_parsed;
472 }
473
474 void
extract_rsj(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)475 extract_rsj (FILE *f,
476 const char *real_filename, const char *logical_filename,
477 flag_context_list_table_ty *flag_table,
478 msgdomain_list_ty *mdlp)
479 {
480 message_list_ty *mlp = mdlp->item[0]->messages;
481 int c;
482
483 fp = f;
484 real_file_name = real_filename;
485 logical_file_name = xstrdup (logical_filename);
486 line_number = 1;
487
488 /* JSON is always in UTF-8. */
489 xgettext_current_source_encoding = po_charset_utf8;
490
491 /* Parse the initial opening brace. */
492 c = phase2_getc ();
493 if (c != '{')
494 goto invalid_json;
495
496 c = phase2_getc ();
497 if (c != '}')
498 {
499 phase2_ungetc (c);
500 for (;;)
501 {
502 /* Parse a string. */
503 char *s1;
504 if (parse_string () != pr_parsed)
505 goto invalid_json;
506 s1 = mixed_string_contents_free1 (
507 mixed_string_buffer_result (&stringbuf));
508
509 /* Parse a colon. */
510 c = phase2_getc ();
511 if (c != ':')
512 goto invalid_json;
513
514 if (strcmp (s1, "version") == 0)
515 {
516 /* Parse an integer. */
517 if (parse_integer () != pr_parsed)
518 goto invalid_rsj;
519 if (strcmp (buffer, "1") != 0)
520 goto invalid_rsj_version;
521 }
522 else if (strcmp (s1, "strings") == 0)
523 {
524 /* Parse an array. */
525 c = phase2_getc ();
526 if (c != '[')
527 goto invalid_rsj;
528
529 c = phase2_getc ();
530 if (c != ']')
531 {
532 phase2_ungetc (c);
533 for (;;)
534 {
535 char *location = NULL;
536 char *msgid = NULL;
537 lex_pos_ty pos;
538
539 /* Parse an object. */
540 c = phase2_getc ();
541 if (c != '{')
542 goto invalid_rsj;
543
544 c = phase2_getc ();
545 if (c != '}')
546 {
547 phase2_ungetc (c);
548 for (;;)
549 {
550 /* Parse a string. */
551 char *s2;
552 if (parse_string () != pr_parsed)
553 goto invalid_json;
554 s2 = mixed_string_contents_free1 (
555 mixed_string_buffer_result (&stringbuf));
556
557 /* Parse a colon. */
558 c = phase2_getc ();
559 if (c != ':')
560 goto invalid_json;
561
562 if (strcmp (s2, "hash") == 0)
563 {
564 /* Parse an integer. */
565 if (parse_integer () != pr_parsed)
566 goto invalid_rsj;
567 }
568 else if (strcmp (s2, "name") == 0)
569 {
570 /* Parse a string. */
571 enum parse_result r = parse_string ();
572 if (r == pr_none)
573 goto invalid_rsj;
574 if (r == pr_syntax || location != NULL)
575 goto invalid_json;
576 location =
577 mixed_string_contents_free1 (
578 mixed_string_buffer_result (&stringbuf));
579 }
580 else if (strcmp (s2, "sourcebytes") == 0)
581 {
582 /* Parse an array. */
583 c = phase2_getc ();
584 if (c != '[')
585 goto invalid_rsj;
586
587 c = phase2_getc ();
588 if (c != ']')
589 {
590 phase2_ungetc (c);
591 for (;;)
592 {
593 /* Parse an integer. */
594 if (parse_integer () != pr_parsed)
595 goto invalid_rsj;
596
597 /* Parse a comma. */
598 c = phase2_getc ();
599 if (c == ']')
600 break;
601 if (c != ',')
602 goto invalid_json;
603 }
604 }
605 }
606 else if (strcmp (s2, "value") == 0)
607 {
608 /* Parse a string. */
609 enum parse_result r = parse_string ();
610 if (r == pr_none)
611 goto invalid_rsj;
612 if (r == pr_syntax || msgid != NULL)
613 goto invalid_json;
614 msgid =
615 mixed_string_contents_free1 (
616 mixed_string_buffer_result (&stringbuf));
617 }
618 else
619 goto invalid_rsj;
620
621 free (s2);
622
623 /* Parse a comma. */
624 c = phase2_getc ();
625 if (c == '}')
626 break;
627 if (c != ',')
628 goto invalid_json;
629 }
630 }
631
632 if (location == NULL || msgid == NULL)
633 goto invalid_rsj;
634
635 pos.file_name = location;
636 pos.line_number = (size_t)(-1);
637
638 remember_a_message (mlp, NULL, msgid, true, false,
639 null_context, &pos,
640 NULL, NULL, false);
641
642 /* Parse a comma. */
643 c = phase2_getc ();
644 if (c == ']')
645 break;
646 if (c != ',')
647 goto invalid_json;
648 }
649 }
650 }
651 else
652 goto invalid_rsj;
653
654 /* Parse a comma. */
655 c = phase2_getc ();
656 if (c == '}')
657 break;
658 if (c != ',')
659 goto invalid_json;
660 }
661 }
662
663 /* Seen the closing brace. */
664 c = phase2_getc ();
665 if (c != EOF)
666 goto invalid_json;
667
668 fp = NULL;
669 real_file_name = NULL;
670 logical_file_name = NULL;
671 line_number = 0;
672
673 return;
674
675 invalid_json:
676 error_with_progname = false;
677 error (EXIT_FAILURE, 0, _("%s:%d: invalid JSON syntax"),
678 logical_filename, line_number);
679 error_with_progname = true;
680 return;
681
682 invalid_rsj:
683 error_with_progname = false;
684 error (EXIT_FAILURE, 0, _("%s:%d: invalid RSJ syntax"),
685 logical_filename, line_number);
686 error_with_progname = true;
687 return;
688
689 invalid_rsj_version:
690 error_with_progname = false;
691 error (EXIT_FAILURE, 0,
692 _("%s:%d: invalid RSJ version. Only version 1 is supported."),
693 logical_filename, line_number);
694 error_with_progname = true;
695 return;
696 }
697