1 /* xgettext RST/RSJ backend.
2    Copyright (C) 2001-2003, 2005-2009, 2018-2019 Free Software Foundation, Inc.
3 
4    This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
5 
6    This program is free software: you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
18 
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22 
23 /* Specification.  */
24 #include "x-rst.h"
25 
26 #include <errno.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stddef.h>
30 #include <stdlib.h>
31 
32 #include "c-ctype.h"
33 #include "po-charset.h"
34 #include "message.h"
35 #include "xgettext.h"
36 #include "xg-pos.h"
37 #include "xg-encoding.h"
38 #include "xg-mixed-string.h"
39 #include "xg-message.h"
40 #include "error.h"
41 #include "error-progname.h"
42 #include "xalloc.h"
43 #include "gettext.h"
44 
45 #define _(s) gettext(s)
46 
47 /* RST stands for Resource String Table.
48 
49    An RST file consists of several string definitions.  A string definition
50    starts at the beginning of a line and looks like this:
51        ModuleName.ConstName=StringExpression
52    A StringExpression consists of string pieces of the form 'xyz',
53    single characters of the form #nnn (decimal integer), and +
54    at the end of the line to designate continuation on the next line.
55    String definitions can be separated by blank lines or comment lines
56    beginning with '#'.
57 
58    This backend attempts to be functionally equivalent to the 'rstconv'
59    program, part of the Free Pascal run time library, written by
60    Sebastian Guenther.  Except that
61      * the locations are output as "ModuleName.ConstName",
62        not "ModuleName:ConstName",
63      * we add the flag '#, object-pascal-format' where appropriate.
64  */
65 
66 void
extract_rst(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)67 extract_rst (FILE *f,
68              const char *real_filename, const char *logical_filename,
69              flag_context_list_table_ty *flag_table,
70              msgdomain_list_ty *mdlp)
71 {
72   static char *buffer;
73   static int bufmax;
74   message_list_ty *mlp = mdlp->item[0]->messages;
75   int line_number;
76 
77   line_number = 1;
78   for (;;)
79     {
80       int c;
81       int bufpos;
82       char *location;
83       char *msgid;
84       lex_pos_ty pos;
85 
86       c = getc (f);
87       if (c == EOF)
88         break;
89 
90       /* Ignore blank line.  */
91       if (c == '\n')
92         {
93           line_number++;
94           continue;
95         }
96 
97       /* Ignore comment line.  */
98       if (c == '#')
99         {
100           do
101             c = getc (f);
102           while (c != EOF && c != '\n');
103           if (c == EOF)
104             break;
105           line_number++;
106           continue;
107         }
108 
109       /* Read ModuleName.ConstName.  */
110       bufpos = 0;
111       for (;;)
112         {
113           if (c == EOF || c == '\n')
114             {
115               error_with_progname = false;
116               error (EXIT_FAILURE, 0, _("%s:%d: invalid string definition"),
117                      logical_filename, line_number);
118               error_with_progname = true;
119             }
120           if (bufpos >= bufmax)
121             {
122               bufmax = 2 * bufmax + 10;
123               buffer = xrealloc (buffer, bufmax);
124             }
125           if (c == '=')
126             break;
127           buffer[bufpos++] = c;
128           c = getc (f);
129           if (c == EOF && ferror (f))
130             goto bomb;
131         }
132       buffer[bufpos] = '\0';
133       location = xstrdup (buffer);
134 
135       /* Read StringExpression.  */
136       bufpos = 0;
137       for (;;)
138         {
139           c = getc (f);
140           if (c == EOF)
141             break;
142           else if (c == '\n')
143             {
144               line_number++;
145               break;
146             }
147           else if (c == '\'')
148             {
149               for (;;)
150                 {
151                   c = getc (f);
152                   /* Embedded single quotes like 'abc''def' don't occur.
153                      See fpc-1.0.4/compiler/cresstr.pas.  */
154                   if (c == EOF || c == '\n' || c == '\'')
155                     break;
156                   if (bufpos >= bufmax)
157                     {
158                       bufmax = 2 * bufmax + 10;
159                       buffer = xrealloc (buffer, bufmax);
160                     }
161                   buffer[bufpos++] = c;
162                 }
163               if (c == EOF)
164                 break;
165               else if (c == '\n')
166                 {
167                   line_number++;
168                   break;
169                 }
170             }
171           else if (c == '#')
172             {
173               int n;
174               c = getc (f);
175               if (c == EOF && ferror (f))
176                 goto bomb;
177               if (c == EOF || !c_isdigit (c))
178                 {
179                   error_with_progname = false;
180                   error (EXIT_FAILURE, 0, _("%s:%d: missing number after #"),
181                          logical_filename, line_number);
182                   error_with_progname = true;
183                 }
184               n = (c - '0');
185               for (;;)
186                 {
187                   c = getc (f);
188                   if (c == EOF || !c_isdigit (c))
189                     break;
190                   n = n * 10 + (c - '0');
191                 }
192               if (bufpos >= bufmax)
193                 {
194                   bufmax = 2 * bufmax + 10;
195                   buffer = xrealloc (buffer, bufmax);
196                 }
197               buffer[bufpos++] = (unsigned char) n;
198               if (c == EOF)
199                 break;
200               ungetc (c, f);
201             }
202           else if (c == '+')
203             {
204               c = getc (f);
205               if (c == EOF)
206                 break;
207               if (c == '\n')
208                 line_number++;
209               else
210                 ungetc (c, f);
211             }
212           else
213             {
214               error_with_progname = false;
215               error (EXIT_FAILURE, 0, _("%s:%d: invalid string expression"),
216                      logical_filename, line_number);
217               error_with_progname = true;
218             }
219         }
220       if (bufpos >= bufmax)
221         {
222           bufmax = 2 * bufmax + 10;
223           buffer = xrealloc (buffer, bufmax);
224         }
225       buffer[bufpos] = '\0';
226       msgid = xstrdup (buffer);
227 
228       pos.file_name = location;
229       pos.line_number = (size_t)(-1);
230 
231       remember_a_message (mlp, NULL, msgid, false, false, null_context, &pos,
232                           NULL, NULL, false);
233 
234       /* Here c is the last read character: EOF or '\n'.  */
235       if (c == EOF)
236         break;
237     }
238 
239   if (ferror (f))
240     {
241     bomb:
242       error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
243              real_filename);
244     }
245 }
246 
247 
248 /* RSJ stands for Resource String Table in JSON.
249 
250    An RSJ file is a JSON file that contains several string definitions.
251    It has the format (modulo whitespace)
252      {
253        "version": 1,
254        "strings":
255          [
256            {
257              "hash": <integer>,
258              "name": <string>,
259              "sourcebytes": [ <integer>... ],
260              "value": <string>
261            },
262            ...
263          ]
264      }
265    The sourcebytes array contains the original source bytes, in the
266    source encoding (not guaranteed to be ISO-8859-1, see
267    <http://wiki.freepascal.org/FPC_Unicode_support#Source_file_codepage>).
268 
269    This backend attempts to be functionally equivalent to the 'rstconv'
270    program, part of the Free Pascal run time library, written by
271    Sebastian Guenther.  Except that
272      * we use the "value" as msgid, not the "sourcebytes",
273      * the locations are output as "ModuleName.ConstName",
274        not "ModuleName:ConstName",
275      * we add the flag '#, object-pascal-format' where appropriate.
276  */
277 
278 /* For the JSON syntax, refer to RFC 8259.  */
279 
280 /* ======================== Reading of characters.  ======================== */
281 
/* The input file stream.  Set by extract_rsj before parsing starts and reset
   to NULL once the file has been parsed; read through phase1_getc and
   phase1_ungetc.  */
static FILE *fp;
284 
285 
286 /* 1. line_number handling.  */
287 
288 static int
phase1_getc()289 phase1_getc ()
290 {
291   int c = getc (fp);
292 
293   if (c == EOF)
294     {
295       if (ferror (fp))
296         error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
297                real_file_name);
298       return EOF;
299     }
300 
301   if (c == '\n')
302     line_number++;
303 
304   return c;
305 }
306 
307 /* Supports only one pushback character.  */
308 static void
phase1_ungetc(int c)309 phase1_ungetc (int c)
310 {
311   if (c != EOF)
312     {
313       if (c == '\n')
314         --line_number;
315 
316       ungetc (c, fp);
317     }
318 }
319 
320 
321 /* 2. Skipping whitespace.  */
322 
/* Tests whether C, a phase1_getc() result, is one of the four JSON
   whitespace characters: space, tab, newline, carriage return.
   EOF is not whitespace.  */
static inline bool
is_whitespace (int c)
{
  switch (c)
    {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
    }
}
329 
/* Returns the next phase1_getc() result that is not JSON whitespace.
   This may be EOF.  */
static int
phase2_getc ()
{
  for (;;)
    {
      int ch = phase1_getc ();

      if (!is_whitespace (ch))
        return ch;
    }
}
341 
/* Pushes back a character obtained from phase2_getc().  This simply
   forwards to phase1_ungetc(); whitespace skipped earlier is not
   restored.  */
static void
phase2_ungetc (int c)
{
  phase1_ungetc (c);
}
347 
348 
349 /* ========================== Reading of tokens.  ========================== */
350 
/* Result of parsing a token.  parse_integer yields only pr_parsed or
   pr_none; parse_string can yield any of the three values.  */

enum parse_result
{
  pr_parsed, /* successfully parsed */
  pr_none,   /* the next token is of a different type */
  pr_syntax  /* syntax error inside the token */
};
359 
/* Scratch buffer filled by parse_integer: buffer receives the digits as a
   NUL-terminated string, and bufmax is the size in bytes currently
   allocated for it.  */
static char *buffer;
static int bufmax;
362 
363 /* Parses an integer.  Returns it in buffer, of length bufmax.
364    Returns pr_parsed or pr_none.  */
365 static enum parse_result
parse_integer()366 parse_integer ()
367 {
368   int c;
369   int bufpos;
370 
371   c = phase2_getc ();
372   bufpos = 0;
373   for (;;)
374     {
375       if (bufpos >= bufmax)
376         {
377           bufmax = 2 * bufmax + 10;
378           buffer = xrealloc (buffer, bufmax);
379         }
380       if (!(c >= '0' && c <= '9'))
381         break;
382       buffer[bufpos++] = c;
383       c = phase1_getc ();
384     }
385   phase1_ungetc (c);
386   buffer[bufpos] = '\0';
387   return (bufpos == 0 ? pr_none : pr_parsed);
388 }
389 
/* Accumulator for the string being parsed by parse_string.  After a
   successful parse, extract_rsj collects its contents through
   mixed_string_buffer_result.  */
static struct mixed_string_buffer stringbuf;
391 
392 /* Parses a string.  Returns it in stringbuf, in UTF-8 encoding.
393    Returns a parse_result.  */
394 static enum parse_result
parse_string()395 parse_string ()
396 {
397   int c;
398 
399   c = phase2_getc ();
400   if (c != '"')
401     {
402       phase2_ungetc (c);
403       return pr_none;
404     }
405   mixed_string_buffer_init (&stringbuf, lc_string,
406                             logical_file_name, line_number);
407   for (;;)
408     {
409       c = phase1_getc ();
410       /* Keep line_number in sync.  */
411       stringbuf.line_number = line_number;
412       if (c == EOF || (c >= 0 && c < 0x20))
413         return pr_syntax;
414       if (c == '"')
415         break;
416       if (c == '\\')
417         {
418           c = phase1_getc ();
419           if (c == 'u')
420             {
421               unsigned int n = 0;
422               int i;
423 
424               for (i = 0; i < 4; i++)
425                 {
426                   c = phase1_getc ();
427 
428                   if (c >= '0' && c <= '9')
429                     n = (n << 4) + (c - '0');
430                   else if (c >= 'A' && c <= 'F')
431                     n = (n << 4) + (c - 'A' + 10);
432                   else if (c >= 'a' && c <= 'f')
433                     n = (n << 4) + (c - 'a' + 10);
434                   else
435                     return pr_syntax;
436                 }
437               mixed_string_buffer_append_unicode (&stringbuf, n);
438             }
439           else
440             {
441               switch (c)
442                 {
443                 case '"':
444                 case '\\':
445                 case '/':
446                   break;
447                 case 'b':
448                   c = '\b';
449                   break;
450                 case 'f':
451                   c = '\f';
452                   break;
453                 case 'n':
454                   c = '\n';
455                   break;
456                 case 'r':
457                   c = '\r';
458                   break;
459                 case 't':
460                   c = '\t';
461                   break;
462                 default:
463                   return pr_syntax;
464                 }
465               mixed_string_buffer_append_char (&stringbuf, c);
466             }
467         }
468       else
469         mixed_string_buffer_append_char (&stringbuf, c);
470     }
471   return pr_parsed;
472 }
473 
474 void
extract_rsj(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)475 extract_rsj (FILE *f,
476              const char *real_filename, const char *logical_filename,
477              flag_context_list_table_ty *flag_table,
478              msgdomain_list_ty *mdlp)
479 {
480   message_list_ty *mlp = mdlp->item[0]->messages;
481   int c;
482 
483   fp = f;
484   real_file_name = real_filename;
485   logical_file_name = xstrdup (logical_filename);
486   line_number = 1;
487 
488   /* JSON is always in UTF-8.  */
489   xgettext_current_source_encoding = po_charset_utf8;
490 
491   /* Parse the initial opening brace.  */
492   c = phase2_getc ();
493   if (c != '{')
494     goto invalid_json;
495 
496   c = phase2_getc ();
497   if (c != '}')
498     {
499       phase2_ungetc (c);
500       for (;;)
501         {
502           /* Parse a string.  */
503           char *s1;
504           if (parse_string () != pr_parsed)
505             goto invalid_json;
506           s1 = mixed_string_contents_free1 (
507                  mixed_string_buffer_result (&stringbuf));
508 
509           /* Parse a colon.  */
510           c = phase2_getc ();
511           if (c != ':')
512             goto invalid_json;
513 
514           if (strcmp (s1, "version") == 0)
515             {
516               /* Parse an integer.  */
517               if (parse_integer () != pr_parsed)
518                 goto invalid_rsj;
519               if (strcmp (buffer, "1") != 0)
520                 goto invalid_rsj_version;
521             }
522           else if (strcmp (s1, "strings") == 0)
523             {
524               /* Parse an array.  */
525               c = phase2_getc ();
526               if (c != '[')
527                 goto invalid_rsj;
528 
529               c = phase2_getc ();
530               if (c != ']')
531                 {
532                   phase2_ungetc (c);
533                   for (;;)
534                     {
535                       char *location = NULL;
536                       char *msgid = NULL;
537                       lex_pos_ty pos;
538 
539                       /* Parse an object.  */
540                       c = phase2_getc ();
541                       if (c != '{')
542                         goto invalid_rsj;
543 
544                       c = phase2_getc ();
545                       if (c != '}')
546                         {
547                           phase2_ungetc (c);
548                           for (;;)
549                             {
550                               /* Parse a string.  */
551                               char *s2;
552                               if (parse_string () != pr_parsed)
553                                 goto invalid_json;
554                               s2 = mixed_string_contents_free1 (
555                                      mixed_string_buffer_result (&stringbuf));
556 
557                               /* Parse a colon.  */
558                               c = phase2_getc ();
559                               if (c != ':')
560                                 goto invalid_json;
561 
562                               if (strcmp (s2, "hash") == 0)
563                                 {
564                                   /* Parse an integer.  */
565                                   if (parse_integer () != pr_parsed)
566                                     goto invalid_rsj;
567                                 }
568                               else if (strcmp (s2, "name") == 0)
569                                 {
570                                   /* Parse a string.  */
571                                   enum parse_result r = parse_string ();
572                                   if (r == pr_none)
573                                     goto invalid_rsj;
574                                   if (r == pr_syntax || location != NULL)
575                                     goto invalid_json;
576                                   location =
577                                     mixed_string_contents_free1 (
578                                       mixed_string_buffer_result (&stringbuf));
579                                 }
580                               else if (strcmp (s2, "sourcebytes") == 0)
581                                 {
582                                   /* Parse an array.  */
583                                   c = phase2_getc ();
584                                   if (c != '[')
585                                     goto invalid_rsj;
586 
587                                   c = phase2_getc ();
588                                   if (c != ']')
589                                     {
590                                       phase2_ungetc (c);
591                                       for (;;)
592                                         {
593                                           /* Parse an integer.  */
594                                           if (parse_integer () != pr_parsed)
595                                             goto invalid_rsj;
596 
597                                           /* Parse a comma.  */
598                                           c = phase2_getc ();
599                                           if (c == ']')
600                                             break;
601                                           if (c != ',')
602                                             goto invalid_json;
603                                         }
604                                     }
605                                 }
606                               else if (strcmp (s2, "value") == 0)
607                                 {
608                                   /* Parse a string.  */
609                                   enum parse_result r = parse_string ();
610                                   if (r == pr_none)
611                                     goto invalid_rsj;
612                                   if (r == pr_syntax || msgid != NULL)
613                                     goto invalid_json;
614                                   msgid =
615                                     mixed_string_contents_free1 (
616                                       mixed_string_buffer_result (&stringbuf));
617                                 }
618                               else
619                                 goto invalid_rsj;
620 
621                               free (s2);
622 
623                               /* Parse a comma.  */
624                               c = phase2_getc ();
625                               if (c == '}')
626                                 break;
627                               if (c != ',')
628                                 goto invalid_json;
629                             }
630                         }
631 
632                       if (location == NULL || msgid == NULL)
633                         goto invalid_rsj;
634 
635                       pos.file_name = location;
636                       pos.line_number = (size_t)(-1);
637 
638                       remember_a_message (mlp, NULL, msgid, true, false,
639                                           null_context, &pos,
640                                           NULL, NULL, false);
641 
642                       /* Parse a comma.  */
643                       c = phase2_getc ();
644                       if (c == ']')
645                         break;
646                       if (c != ',')
647                         goto invalid_json;
648                     }
649                 }
650             }
651           else
652             goto invalid_rsj;
653 
654           /* Parse a comma.  */
655           c = phase2_getc ();
656           if (c == '}')
657             break;
658           if (c != ',')
659             goto invalid_json;
660         }
661     }
662 
663   /* Seen the closing brace.  */
664   c = phase2_getc ();
665   if (c != EOF)
666     goto invalid_json;
667 
668   fp = NULL;
669   real_file_name = NULL;
670   logical_file_name = NULL;
671   line_number = 0;
672 
673   return;
674 
675  invalid_json:
676   error_with_progname = false;
677   error (EXIT_FAILURE, 0, _("%s:%d: invalid JSON syntax"),
678          logical_filename, line_number);
679   error_with_progname = true;
680   return;
681 
682  invalid_rsj:
683   error_with_progname = false;
684   error (EXIT_FAILURE, 0, _("%s:%d: invalid RSJ syntax"),
685          logical_filename, line_number);
686   error_with_progname = true;
687   return;
688 
689  invalid_rsj_version:
690   error_with_progname = false;
691   error (EXIT_FAILURE, 0,
692          _("%s:%d: invalid RSJ version. Only version 1 is supported."),
693          logical_filename, line_number);
694   error_with_progname = true;
695   return;
696 }
697