1 /* xgettext Vala backend.
2    Copyright (C) 2013-2014, 2018-2020 Free Software Foundation, Inc.
3 
4    This file was written by Daiki Ueno <ueno@gnu.org>, 2013.
5 
6    This program is free software: you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
18 
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22 
23 /* Specification.  */
24 #include "x-vala.h"
25 
26 #include <assert.h>
27 #include <errno.h>
28 #include <stdbool.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 
33 #include "message.h"
34 #include "rc-str-list.h"
35 #include "xgettext.h"
36 #include "xg-pos.h"
37 #include "xg-encoding.h"
38 #include "xg-mixed-string.h"
39 #include "xg-arglist-context.h"
40 #include "xg-arglist-callshape.h"
41 #include "xg-arglist-parser.h"
42 #include "xg-message.h"
43 #include "error.h"
44 #include "error-progname.h"
45 #include "xalloc.h"
46 #include "xvasprintf.h"
47 #include "mem-hash-map.h"
48 #include "po-charset.h"
49 #include "gettext.h"
50 
/* Shorthand for looking up the translation of a message.  */
#define _(s) gettext(s)

/* Number of elements of the array A.  A must be a real array, not a
   pointer.  The argument is parenthesized so that any array-valued
   expression can be passed safely.  */
#define SIZEOF(a) (sizeof(a) / sizeof((a)[0]))
54 
55 /* The Vala syntax is defined in the Vala Reference Manual
56    https://www.vala-project.org/doc/vala/.
57    See also vala/valascanner.vala.  */
58 
59 /* ====================== Keyword set customization.  ====================== */
60 
61 /* If true extract all strings.  */
62 static bool extract_all = false;
63 
64 static hash_table keywords;
65 static bool default_keywords = true;
66 
67 
68 void
x_vala_extract_all()69 x_vala_extract_all ()
70 {
71   extract_all = true;
72 }
73 
74 
75 static void
add_keyword(const char * name,hash_table * keywords)76 add_keyword (const char *name, hash_table *keywords)
77 {
78   if (name == NULL)
79     default_keywords = false;
80   else
81     {
82       const char *end;
83       struct callshape shape;
84       const char *colon;
85 
86       if (keywords->table == NULL)
87         hash_init (keywords, 100);
88 
89       split_keywordspec (name, &end, &shape);
90 
91       /* The characters between name and end should form a valid C identifier.
92          A colon means an invalid parse in split_keywordspec().  */
93       colon = strchr (name, ':');
94       if (colon == NULL || colon >= end)
95         insert_keyword_callshape (keywords, name, end - name, &shape);
96     }
97 }
98 
99 void
x_vala_keyword(const char * name)100 x_vala_keyword (const char *name)
101 {
102   add_keyword (name, &keywords);
103 }
104 
105 static void
init_keywords()106 init_keywords ()
107 {
108   if (default_keywords)
109     {
110       /* When adding new keywords here, also update the documentation in
111          xgettext.texi!  */
112       x_vala_keyword ("dgettext:2");
113       x_vala_keyword ("dcgettext:2");
114       x_vala_keyword ("ngettext:1,2");
115       x_vala_keyword ("dngettext:2,3");
116       x_vala_keyword ("dpgettext:2g");
117       x_vala_keyword ("dpgettext2:2c,3");
118       x_vala_keyword ("_");
119       x_vala_keyword ("Q_");
120       x_vala_keyword ("N_");
121       x_vala_keyword ("NC_:1c,2");
122 
123       default_keywords = false;
124     }
125 }
126 
/* Record the format string flags of the functions known to this
   backend, by handing each specification to xgettext_record_flag.  */
void
init_flag_table_vala ()
{
  static const char *const flag_specs[] =
    {
      /* gettext-like functions pass the format flag through.  */
      "dgettext:2:pass-c-format",
      "dcgettext:2:pass-c-format",
      "ngettext:1:pass-c-format",
      "ngettext:2:pass-c-format",
      "dngettext:2:pass-c-format",
      "dngettext:3:pass-c-format",
      "dpgettext:2:pass-c-format",
      "dpgettext2:3:pass-c-format",
      "_:1:pass-c-format",
      "Q_:1:pass-c-format",
      "N_:1:pass-c-format",
      "NC_:2:pass-c-format",

      /* Vala leaves string formatting to Glib functions and thus the
         format string is exactly same as C.  See also
         vapi/glib-2.0.vapi.  */
      "printf:1:c-format",
      "vprintf:1:c-format"
    };
  size_t i;

  for (i = 0; i < sizeof flag_specs / sizeof flag_specs[0]; i++)
    xgettext_record_flag (flag_specs[i]);
}
149 
150 
151 /* ======================== Reading of characters.  ======================== */
152 
/* The stream the source file is read from.  */
static FILE *fp;


/* 1. line_number handling.  */

/* Stack of characters pushed back by phase1_ungetc, and the number of
   characters currently on it.  */
#define MAX_PHASE1_PUSHBACK 16
static int phase1_pushback_length;
static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
162 
163 
164 static int
phase1_getc()165 phase1_getc ()
166 {
167   int c;
168 
169   if (phase1_pushback_length)
170     c = phase1_pushback[--phase1_pushback_length];
171   else
172     {
173       c = getc (fp);
174       if (c == EOF)
175         {
176           if (ferror (fp))
177             error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
178                    real_file_name);
179           return EOF;
180         }
181     }
182 
183   if (c == '\n')
184     ++line_number;
185   return c;
186 }
187 
188 
189 /* Supports 2 characters of pushback.  */
190 static void
phase1_ungetc(int c)191 phase1_ungetc (int c)
192 {
193   if (c != EOF)
194     {
195       if (c == '\n')
196         --line_number;
197 
198       if (phase1_pushback_length == SIZEOF (phase1_pushback))
199         abort ();
200       phase1_pushback[phase1_pushback_length++] = c;
201     }
202 }
203 
204 
/* Line numbers used to decide whether a comment counts as standing
   immediately before a keyword.  */
static int last_comment_line;
static int last_non_comment_line;

/* Accumulating comments.  */

/* Growable buffer holding the comment line being collected: its
   storage, the allocated size, and the number of bytes in use.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;
215 
216 static inline void
comment_start()217 comment_start ()
218 {
219   buflen = 0;
220 }
221 
222 static inline void
comment_add(int c)223 comment_add (int c)
224 {
225   if (buflen >= bufmax)
226     {
227       bufmax = 2 * bufmax + 10;
228       buffer = xrealloc (buffer, bufmax);
229     }
230   buffer[buflen++] = c;
231 }
232 
233 static inline void
comment_line_end(size_t chars_to_remove)234 comment_line_end (size_t chars_to_remove)
235 {
236   buflen -= chars_to_remove;
237   while (buflen >= 1
238          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
239     --buflen;
240   if (chars_to_remove == 0 && buflen >= bufmax)
241     {
242       bufmax = 2 * bufmax + 10;
243       buffer = xrealloc (buffer, bufmax);
244     }
245   buffer[buflen] = '\0';
246   savable_comment_add (buffer);
247 }
248 
249 
250 /* 2. Replace each comment that is not inside a character constant or
251    string literal with a space character.  */
252 
253 static int
phase2_getc()254 phase2_getc ()
255 {
256   int c;
257   bool last_was_star;
258 
259   c = phase1_getc ();
260   if (c != '/')
261     return c;
262   c = phase1_getc ();
263   switch (c)
264     {
265     default:
266       phase1_ungetc (c);
267       return '/';
268 
269     case '*':
270       /* C comment.  */
271       comment_start ();
272       last_was_star = false;
273       for (;;)
274         {
275           c = phase1_getc ();
276           if (c == EOF)
277             break;
278           /* We skip all leading white space, but not EOLs.  */
279           if (!(buflen == 0 && (c == ' ' || c == '\t')))
280             comment_add (c);
281           switch (c)
282             {
283             case '\n':
284               comment_line_end (1);
285               comment_start ();
286               last_was_star = false;
287               continue;
288 
289             case '*':
290               last_was_star = true;
291               continue;
292 
293             case '/':
294               if (last_was_star)
295                 {
296                   comment_line_end (2);
297                   break;
298                 }
299               /* FALLTHROUGH */
300 
301             default:
302               last_was_star = false;
303               continue;
304             }
305           break;
306         }
307       last_comment_line = line_number;
308       return ' ';
309 
310     case '/':
311       /* C++ or ISO C 99 comment.  */
312       comment_start ();
313       for (;;)
314         {
315           c = phase1_getc ();
316           if (c == '\n' || c == EOF)
317             break;
318           /* We skip all leading white space, but not EOLs.  */
319           if (!(buflen == 0 && (c == ' ' || c == '\t')))
320             comment_add (c);
321         }
322       comment_line_end (0);
323       last_comment_line = line_number;
324       return '\n';
325     }
326 }
327 
328 
/* Push back a character obtained from phase2_getc.  This simply
   forwards to the phase 1 pushback.  */
static void
phase2_ungetc (int c)
{
  phase1_ungetc (c);
}
334 
335 
336 /* ========================== Reading of tokens.  ========================== */
337 
/* The kinds of tokens produced by the tokenizer.  */
typedef enum token_type_ty
{
  token_type_character_constant,        /* 'x' */
  token_type_eof,
  token_type_lparen,                    /* ( */
  token_type_rparen,                    /* ) */
  token_type_lbrace,                    /* { */
  token_type_rbrace,                    /* } */
  token_type_assign,                    /* = += -= *= /= %= <<= >>= &= |= ^= */
  token_type_return,                    /* return */
  token_type_plus,                      /* + */
  token_type_arithmetic_operator,       /* - * / % << >> & | ^ */
  token_type_equality_test_operator,    /* == < > >= <= != */
  token_type_logic_operator,            /* ! && || */
  token_type_comma,                     /* , */
  token_type_question,                  /* ? */
  token_type_colon,                     /* : */
  token_type_number,                    /* 2.7 */
  token_type_string_literal,            /* "abc" */
  token_type_string_template,           /* @"abc" */
  token_type_regex_literal,             /* /.../ */
  token_type_symbol,                    /* if else etc. */
  token_type_other
} token_type_ty;
363 
364 typedef struct token_ty token_ty;
365 struct token_ty
366 {
367   token_type_ty type;
368   char *string;                         /* for token_type_symbol */
369   mixed_string_ty *mixed_string;        /* for token_type_string_literal */
370   refcounted_string_list_ty *comment;   /* for token_type_string_literal */
371   int line_number;
372 };
373 
374 /* Free the memory pointed to by a 'struct token_ty'.  */
375 static inline void
free_token(token_ty * tp)376 free_token (token_ty *tp)
377 {
378   if (tp->type == token_type_symbol)
379     free (tp->string);
380   if (tp->type == token_type_string_literal)
381     {
382       mixed_string_free (tp->mixed_string);
383       drop_reference (tp->comment);
384     }
385 }
386 
387 
/* Return value of phase7_getc when EOF is reached.  */
#define P7_EOF (-1)
#define P7_STRING_END (-2)

/* phase7_getc replaces escape sequences within character strings with
   their single character equivalents.  These values are what it
   returns for an unescaped double quote, an unescaped single quote,
   and an unescaped newline, respectively.  */
#define P7_QUOTES (-3)
#define P7_QUOTE (-4)
#define P7_NEWLINE (-5)

/* Convert an UTF-16 or UTF-32 code point to a return value that can be
   distinguished from a single-byte return value.  */
#define UNICODE(code) (0x100 + (code))

/* Test a return value of phase7_getuc whether it designates an UTF-16 or
   UTF-32 code point.  */
#define IS_UNICODE(p7_result) ((p7_result) >= 0x100)

/* Extract the UTF-16 or UTF-32 code of a return value that satisfies
   IS_UNICODE.  */
#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
409 
410 
411 static int
phase7_getc()412 phase7_getc ()
413 {
414   int c, n, j;
415 
416   /* Use phase 1, because phase 2 elides comments.  */
417   c = phase1_getc ();
418 
419   /* Return a magic newline indicator, so that we can distinguish
420      between the user requesting a newline in the string (e.g. using
421      "\n" or "\012") from the user failing to terminate the string or
422      character constant.  The ANSI C standard says: 3.1.3.4 Character
423      Constants contain "any character except single quote, backslash or
424      newline; or an escape sequence" and 3.1.4 String Literals contain
425      "any character except double quote, backslash or newline; or an
426      escape sequence".
427 
428      Most compilers give a fatal error in this case, however gcc is
429      stupidly silent, even though this is a very common typo.  OK, so
430      "gcc --pedantic" will tell me, but that gripes about too much other
431      stuff.  Could I have a "gcc -Wnewline-in-string" option, or
432      better yet a "gcc -fno-newline-in-string" option, please?  Gcc is
433      also inconsistent between string literals and character constants:
434      you may not embed newlines in character constants; try it, you get
435      a useful diagnostic.  --PMiller  */
436   if (c == '\n')
437     return P7_NEWLINE;
438 
439   if (c == '"')
440     return P7_QUOTES;
441   if (c == '\'')
442     return P7_QUOTE;
443   if (c != '\\')
444     return c;
445   c = phase1_getc ();
446   switch (c)
447     {
448     default:
449       /* Unknown escape sequences really should be an error, but just
450          ignore them, and let the real compiler complain.  */
451       phase1_ungetc (c);
452       return '\\';
453 
454     case '"':
455     case '\'':
456     case '\\':
457     case '$':
458       return c;
459 
460     case 'b':
461       return '\b';
462 
463     case 'f':
464       return '\f';
465     case 'n':
466       return '\n';
467     case 'r':
468       return '\r';
469     case 't':
470       return '\t';
471     case 'v':
472       return '\v';
473 
474     case 'x':
475       c = phase1_getc ();
476       switch (c)
477         {
478         default:
479           phase1_ungetc (c);
480           phase1_ungetc ('x');
481           return '\\';
482 
483         case '0': case '1': case '2': case '3': case '4':
484         case '5': case '6': case '7': case '8': case '9':
485         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
486         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
487           break;
488         }
489       n = 0;
490       for (;;)
491         {
492           switch (c)
493             {
494             default:
495               phase1_ungetc (c);
496               return n;
497 
498             case '0': case '1': case '2': case '3': case '4':
499             case '5': case '6': case '7': case '8': case '9':
500               n = n * 16 + c - '0';
501               break;
502 
503             case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
504               n = n * 16 + 10 + c - 'A';
505               break;
506 
507             case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
508               n = n * 16 + 10 + c - 'a';
509               break;
510             }
511           c = phase1_getc ();
512         }
513       return n;
514 
515     case '0':
516       n = 0;
517       for (j = 0; j < 3; ++j)
518         {
519           n = n * 8 + c - '0';
520           c = phase1_getc ();
521           switch (c)
522             {
523             default:
524               break;
525 
526             case '0': case '1': case '2': case '3':
527             case '4': case '5': case '6': case '7':
528               continue;
529             }
530           break;
531         }
532       phase1_ungetc (c);
533       return n;
534 
535     case 'u':
536       {
537         unsigned char buf[8];
538 
539         n = 0;
540         for (j = 0; j < 4; j++)
541           {
542             int c1 = phase1_getc ();
543 
544             if (c1 >= '0' && c1 <= '9')
545               n = (n << 4) + (c1 - '0');
546             else if (c1 >= 'A' && c1 <= 'F')
547               n = (n << 4) + (c1 - 'A' + 10);
548             else if (c1 >= 'a' && c1 <= 'f')
549               n = (n << 4) + (c1 - 'a' + 10);
550             else
551               {
552                 phase1_ungetc (c1);
553                 while (--j >= 0)
554                   phase1_ungetc (buf[j]);
555                 phase1_ungetc (c);
556                 return '\\';
557               }
558 
559             buf[j] = c1;
560           }
561 
562         if (n < 0x110000)
563           return UNICODE (n);
564 
565         error_with_progname = false;
566         error (0, 0, _("%s:%d: warning: invalid Unicode character"),
567                logical_file_name, line_number);
568         error_with_progname = true;
569 
570         while (--j >= 0)
571           phase1_ungetc (buf[j]);
572         phase1_ungetc (c);
573         return '\\';
574       }
575     }
576 }
577 
578 
/* Push back a character for the phase 7 reader.  This simply forwards
   to the phase 1 pushback.  */
static void
phase7_ungetc (int c)
{
  phase1_ungetc (c);
}
584 
585 
586 /* 3. Parse each resulting logical line as preprocessing tokens and
587    white space.  Preprocessing tokens and Vala tokens don't always
588    match.  */
589 
590 static token_ty phase3_pushback[2];
591 static int phase3_pushback_length;
592 
593 
594 static token_type_ty last_token_type;
595 
596 static void
phase3_scan_regex()597 phase3_scan_regex ()
598 {
599     int c;
600 
601     for (;;)
602       {
603         c = phase1_getc ();
604         if (c == '/')
605           break;
606         if (c == '\\')
607           {
608             c = phase1_getc ();
609             if (c != EOF)
610               continue;
611           }
612         if (c == EOF)
613           {
614             error_with_progname = false;
615             error (0, 0,
616                    _("%s:%d: warning: regular expression literal terminated too early"),
617                    logical_file_name, line_number);
618             error_with_progname = true;
619             return;
620           }
621       }
622 
623     c = phase2_getc ();
624     if (!(c == 'i' || c == 's' || c == 'm' || c == 'x'))
625       phase2_ungetc (c);
626 }
627 
628 static void
phase3_get(token_ty * tp)629 phase3_get (token_ty *tp)
630 {
631   static char *buffer;
632   static int bufmax;
633   int bufpos;
634 
635 #undef APPEND
636 #define APPEND(c)                               \
637   do                                            \
638     {                                           \
639       if (bufpos >= bufmax)                     \
640         {                                       \
641           bufmax = 2 * bufmax + 10;             \
642           buffer = xrealloc (buffer, bufmax);   \
643         }                                       \
644       buffer[bufpos++] = c;                     \
645     }                                           \
646   while (0)
647 
648   if (phase3_pushback_length)
649     {
650       *tp = phase3_pushback[--phase3_pushback_length];
651       last_token_type = tp->type;
652       return;
653     }
654 
655   for (;;)
656     {
657       bool template;
658       bool verbatim;
659       int c;
660 
661       tp->line_number = line_number;
662       c = phase2_getc ();
663 
664       switch (c)
665         {
666         case EOF:
667           tp->type = last_token_type = token_type_eof;
668           return;
669 
670         case '\n':
671           if (last_non_comment_line > last_comment_line)
672             savable_comment_reset ();
673           /* FALLTHROUGH */
674         case ' ':
675         case '\f':
676         case '\t':
677           /* Ignore whitespace and comments.  */
678           continue;
679         default:
680           break;
681         }
682 
683       last_non_comment_line = tp->line_number;
684       template = false;
685       verbatim = false;
686 
687       switch (c)
688         {
689         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
690         case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
691         case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
692         case 'V': case 'W': case 'X': case 'Y': case 'Z':
693         case '_':
694         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
695         case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
696         case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
697         case 'v': case 'w': case 'x': case 'y': case 'z':
698           bufpos = 0;
699           for (;;)
700             {
701               APPEND (c);
702               c = phase2_getc ();
703               switch (c)
704                 {
705                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
706                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
707                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
708                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
709                 case 'Y': case 'Z':
710                 case '_':
711                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
712                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
713                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
714                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
715                 case 'y': case 'z':
716                 case '0': case '1': case '2': case '3': case '4':
717                 case '5': case '6': case '7': case '8': case '9':
718                   continue;
719 
720                 default:
721                   phase2_ungetc (c);
722                   break;
723                 }
724               break;
725             }
726           APPEND (0);
727           if (strcmp (buffer, "return") == 0)
728             tp->type = last_token_type = token_type_return;
729           else
730             {
731               tp->string = xstrdup (buffer);
732               tp->type = last_token_type = token_type_symbol;
733             }
734           return;
735 
736         case '.':
737           c = phase2_getc ();
738           phase2_ungetc (c);
739           switch (c)
740             {
741             default:
742               tp->string = xstrdup (".");
743               tp->type = last_token_type = token_type_symbol;
744               return;
745 
746             case '0': case '1': case '2': case '3': case '4':
747             case '5': case '6': case '7': case '8': case '9':
748               c = '.';
749               break;
750             }
751           /* FALLTHROUGH */
752 
753         case '0': case '1': case '2': case '3': case '4':
754         case '5': case '6': case '7': case '8': case '9':
755           /* The preprocessing number token is more "generous" than the C
756              number tokens.  This is mostly due to token pasting (another
757              thing we can ignore here).  */
758           bufpos = 0;
759           for (;;)
760             {
761               APPEND (c);
762               c = phase2_getc ();
763               switch (c)
764                 {
765                 case 'e':
766                 case 'E':
767                   APPEND (c);
768                   c = phase2_getc ();
769                   if (c != '+' && c != '-')
770                     {
771                       phase2_ungetc (c);
772                       break;
773                     }
774                   continue;
775 
776                 case 'A': case 'B': case 'C': case 'D':           case 'F':
777                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
778                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
779                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
780                 case 'Y': case 'Z':
781                 case 'a': case 'b': case 'c': case 'd':           case 'f':
782                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
783                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
784                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
785                 case 'y': case 'z':
786                 case '0': case '1': case '2': case '3': case '4':
787                 case '5': case '6': case '7': case '8': case '9':
788                 case '.':
789                   continue;
790 
791                 default:
792                   phase2_ungetc (c);
793                   break;
794                 }
795               break;
796             }
797           APPEND (0);
798           tp->type = last_token_type = token_type_number;
799           return;
800 
801         case '\'':
802           for (;;)
803             {
804               c = phase7_getc ();
805               if (c == P7_NEWLINE)
806                 {
807                   error_with_progname = false;
808                   error (0, 0, _("%s:%d: warning: unterminated character constant"),
809                          logical_file_name, line_number - 1);
810                   error_with_progname = true;
811                   phase7_ungetc ('\n');
812                   break;
813                 }
814               if (c == EOF || c == P7_QUOTE)
815                 break;
816             }
817           tp->type = last_token_type = token_type_character_constant;
818           return;
819 
820           /* Vala provides strings in three different formats.
821 
822              Usual string literals:
823                "..."
824              Verbatim string literals:
825                """...""" (where ... can include newlines and double quotes)
826              String templates.
827                @"...", @"""..."""
828 
829              Note that, with the current implementation string
830              templates are not subject to translation, because they are
831              inspected at compile time.  For example, the following code
832 
833                string bar = "bar";
834                string foo = _(@"foo $bar");
835 
836              will be translated into the C code, like:
837 
838                _(g_strconcat ("foo ", "bar", NULL));  */
839         case '@':
840           c = phase2_getc ();
841           if (c != '"')
842             {
843               phase2_ungetc (c);
844               tp->type = last_token_type = token_type_other;
845               return;
846             }
847           template = true;
848           /* FALLTHROUGH */
849         case '"':
850           {
851             struct mixed_string_buffer msb;
852             int c2 = phase1_getc ();
853 
854             if (c2 == '"')
855               {
856                 int c3 = phase1_getc ();
857                 if (c3 == '"')
858                   verbatim = true;
859                 else
860                   {
861                     phase1_ungetc (c3);
862                     phase1_ungetc (c2);
863                   }
864               }
865             else
866               phase2_ungetc (c2);
867 
868             /* Start accumulating the string.  */
869             mixed_string_buffer_init (&msb, lc_string,
870                                       logical_file_name, line_number);
871             if (verbatim)
872               for (;;)
873                 {
874                   c = phase1_getc ();
875 
876                   /* Keep line_number in sync.  */
877                   msb.line_number = line_number;
878 
879                   if (c == '"')
880                     {
881                       int c2 = phase1_getc ();
882                       if (c2 == '"')
883                         {
884                           int c3 = phase1_getc ();
885                           if (c3 == '"')
886                             break;
887                           phase1_ungetc (c3);
888                         }
889                       phase1_ungetc (c2);
890                     }
891                   if (c == EOF)
892                     break;
893                   mixed_string_buffer_append_char (&msb, c);
894                 }
895             else
896               for (;;)
897                 {
898                   c = phase7_getc ();
899 
900                   /* Keep line_number in sync.  */
901                   msb.line_number = line_number;
902 
903                   if (c == P7_NEWLINE)
904                     {
905                       error_with_progname = false;
906                       error (0, 0,
907                              _("%s:%d: warning: unterminated string literal"),
908                              logical_file_name, line_number - 1);
909                       error_with_progname = true;
910                       phase7_ungetc ('\n');
911                       break;
912                     }
913                   if (c == P7_QUOTES)
914                     break;
915                   if (c == EOF)
916                     break;
917                   if (c == P7_QUOTE)
918                     c = '\'';
919                   if (IS_UNICODE (c))
920                     {
921                       assert (UNICODE_VALUE (c) >= 0
922                               && UNICODE_VALUE (c) < 0x110000);
923                       mixed_string_buffer_append_unicode (&msb,
924                                                           UNICODE_VALUE (c));
925                     }
926                   else
927                     mixed_string_buffer_append_char (&msb, c);
928                 }
929             /* Done accumulating the string.  */
930             if (template)
931               {
932                 tp->type = token_type_string_template;
933                 mixed_string_buffer_destroy (&msb);
934               }
935             else
936               {
937                 tp->type = token_type_string_literal;
938                 tp->mixed_string = mixed_string_buffer_result (&msb);
939                 tp->comment = add_reference (savable_comment);
940               }
941             last_token_type = tp->type;
942             return;
943           }
944 
945         case '/':
946           switch (last_token_type)
947             {
948             case token_type_lparen:
949             case token_type_lbrace:
950             case token_type_assign:
951             case token_type_return:
952             case token_type_plus:
953             case token_type_arithmetic_operator:
954             case token_type_equality_test_operator:
955             case token_type_logic_operator:
956             case token_type_comma:
957             case token_type_question:
958             case token_type_colon:
959               phase3_scan_regex ();
960               tp->type = last_token_type = token_type_regex_literal;
961               break;
962             default:
963               {
964                 int c2 = phase2_getc ();
965                 if (c2 == '=')
966                   tp->type = last_token_type = token_type_assign;
967                 else
968                   {
969                     phase2_ungetc (c2);
970                     tp->type = last_token_type = token_type_arithmetic_operator;
971                   }
972                 break;
973               }
974             }
975           return;
976 
977         case '(':
978           tp->type = last_token_type = token_type_lparen;
979           return;
980 
981         case ')':
982           tp->type = last_token_type = token_type_rparen;
983           return;
984 
985         case '{':
986           tp->type = last_token_type = token_type_lbrace;
987           return;
988 
989         case '}':
990           tp->type = last_token_type = token_type_rbrace;
991           return;
992 
993         case '+':
994           {
995             int c2 = phase2_getc ();
996             switch (c2)
997               {
998               case '+':
999                 tp->type = last_token_type = token_type_other;
1000                 break;
1001               case '=':
1002                 tp->type = last_token_type = token_type_assign;
1003                 break;
1004               default:
1005                 phase2_ungetc (c2);
1006                 tp->type = last_token_type = token_type_plus;
1007                 break;
1008               }
1009             return;
1010           }
1011 
1012         case '-':
1013           {
1014             int c2 = phase2_getc ();
1015             switch (c2)
1016               {
1017               case '-':
1018                 tp->type = last_token_type = token_type_other;
1019                 break;
1020               case '=':
1021                 tp->type = last_token_type = token_type_assign;
1022                 break;
1023               default:
1024                 phase2_ungetc (c2);
1025                 tp->type = last_token_type = token_type_arithmetic_operator;
1026                 break;
1027               }
1028             return;
1029           }
1030 
1031         case '%':
1032         case '^':
1033           {
1034             int c2 = phase2_getc ();
1035             if (c2 == '=')
1036 	      tp->type = last_token_type = token_type_assign;
1037             else
1038               {
1039                 phase2_ungetc (c2);
1040                 tp->type = last_token_type = token_type_logic_operator;
1041               }
1042             return;
1043           }
1044 
1045         case '=':
1046           {
1047             int c2 = phase2_getc ();
1048             switch (c2)
1049               {
1050               case '=':
1051                 tp->type = last_token_type = token_type_equality_test_operator;
1052                 break;
1053               case '>':
1054                 tp->type = last_token_type = token_type_other;
1055                 break;
1056               default:
1057                 phase2_ungetc (c2);
1058                 tp->type = last_token_type = token_type_assign;
1059                 break;
1060               }
1061             return;
1062           }
1063 
1064         case '!':
1065           {
1066             int c2 = phase2_getc ();
1067             if (c2 == '=')
1068               tp->type = last_token_type = token_type_equality_test_operator;
1069             else
1070               {
1071                 phase2_ungetc (c2);
1072                 tp->type = last_token_type = token_type_logic_operator;
1073               }
1074             return;
1075           }
1076 
1077         case '>':
1078         case '<':
1079           {
1080             int c2 = phase2_getc ();
1081             if (c2 == '=')
1082 	      tp->type = last_token_type = token_type_equality_test_operator;
1083             else if (c2 == c)
1084               {
1085                 int c3 = phase2_getc ();
1086                 if (c3 == '=')
1087                   tp->type = last_token_type = token_type_assign;
1088                 else
1089                   {
1090                     phase2_ungetc (c2);
1091                     phase2_ungetc (c3);
1092                     tp->type = last_token_type = token_type_other;
1093                   }
1094               }
1095             else
1096               {
1097                 phase2_ungetc (c2);
1098                 tp->type = last_token_type = token_type_equality_test_operator;
1099               }
1100             return;
1101           }
1102 
1103         case ',':
1104           tp->type = last_token_type = token_type_comma;
1105           return;
1106 
1107         case ':':
1108           tp->type = last_token_type = token_type_colon;
1109           return;
1110 
1111         case '&':
1112         case '|':
1113           {
1114             int c2 = phase2_getc ();
1115             if (c2 == c)
1116 	      tp->type = last_token_type = token_type_logic_operator;
1117             else if (c2 == '=')
1118 	      tp->type = last_token_type = token_type_assign;
1119             else
1120               {
1121                 phase2_ungetc (c2);
1122                 tp->type = last_token_type = token_type_arithmetic_operator;
1123               }
1124             return;
1125           }
1126 
1127         case '?':
1128           {
1129             int c2 = phase2_getc ();
1130             if (c2 == '?')
1131               tp->type = last_token_type = token_type_logic_operator;
1132             else
1133               {
1134                 phase2_ungetc (c2);
1135                 tp->type = last_token_type = token_type_question;
1136               }
1137             return;
1138           }
1139 
1140         default:
1141           tp->type = last_token_type = token_type_other;
1142           return;
1143         }
1144     }
1145 #undef APPEND
1146 }
1147 
1148 static void
phase3_unget(token_ty * tp)1149 phase3_unget (token_ty *tp)
1150 {
1151   if (tp->type != token_type_eof)
1152     {
1153       if (phase3_pushback_length == SIZEOF (phase3_pushback))
1154         abort ();
1155       phase3_pushback[phase3_pushback_length++] = *tp;
1156     }
1157 }
1158 
1159 
1160 /* String concatenation with '+'.  */
1161 
1162 static void
x_vala_lex(token_ty * tp)1163 x_vala_lex (token_ty *tp)
1164 {
1165   phase3_get (tp);
1166   if (tp->type == token_type_string_literal)
1167     {
1168       mixed_string_ty *sum = tp->mixed_string;
1169 
1170       for (;;)
1171         {
1172           token_ty token2;
1173 
1174           phase3_get (&token2);
1175           if (token2.type == token_type_plus)
1176             {
1177               token_ty token3;
1178 
1179               phase3_get (&token3);
1180               if (token3.type == token_type_string_literal)
1181                 {
1182                   sum = mixed_string_concat_free1 (sum, token3.mixed_string);
1183 
1184                   free_token (&token3);
1185                   free_token (&token2);
1186                   continue;
1187                 }
1188               phase3_unget (&token3);
1189             }
1190           phase3_unget (&token2);
1191           break;
1192         }
1193       tp->mixed_string = sum;
1194     }
1195 }
1196 
1197 
1198 /* ========================= Extracting strings.  ========================== */
1199 
1200 
1201 /* Context lookup table.  */
1202 static flag_context_list_table_ty *flag_context_list_table;
1203 
1204 
1205 /* The file is broken into tokens.  Scan the token stream, looking for
1206    a keyword, followed by a left paren, followed by a string.  When we
1207    see this sequence, we have something to remember.  We assume we are
1208    looking at a valid Vala program, and leave the complaints about the
1209    grammar to the compiler.
1210 
1211      Normal handling: Look for
1212        keyword ( ... msgid ... )
1213        keyword msgid
1214      Plural handling: Look for
1215        keyword ( ... msgid ... msgid_plural ... )
1216 
1217    We use recursion because the arguments before msgid or between msgid
1218    and msgid_plural can contain subexpressions of the same form.  */
1219 
1220 /* Extract messages until the next balanced closing parenthesis or bracket.
1221    Extracted messages are added to MLP.
1222    DELIM can be either token_type_rparen or token_type_rbracket, or
1223    token_type_eof to accept both.
1224    Return true upon eof, false upon closing parenthesis or bracket.  */
1225 static bool
extract_balanced(message_list_ty * mlp,token_type_ty delim,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1226 extract_balanced (message_list_ty *mlp, token_type_ty delim,
1227                   flag_context_ty outer_context,
1228                   flag_context_list_iterator_ty context_iter,
1229                   struct arglist_parser *argparser)
1230 {
1231   /* Current argument number.  */
1232   int arg = 1;
1233   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1234   int state;
1235   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1236   const struct callshapes *next_shapes = NULL;
1237   /* Context iterator that will be used if the next token is a '('.  */
1238   flag_context_list_iterator_ty next_context_iter =
1239     passthrough_context_list_iterator;
1240   /* Current context.  */
1241   flag_context_ty inner_context =
1242     inherited_context (outer_context,
1243                        flag_context_list_iterator_advance (&context_iter));
1244 
1245   /* Start state is 0.  */
1246   state = 0;
1247 
1248   for (;;)
1249     {
1250       token_ty token;
1251 
1252       x_vala_lex (&token);
1253 
1254       switch (token.type)
1255         {
1256         case token_type_symbol:
1257           {
1258             void *keyword_value;
1259 
1260             if (hash_find_entry (&keywords, token.string, strlen (token.string),
1261                                  &keyword_value)
1262                 == 0)
1263               {
1264                 next_shapes = (const struct callshapes *) keyword_value;
1265                 state = 1;
1266               }
1267             else
1268               state = 0;
1269           }
1270           next_context_iter =
1271             flag_context_list_iterator (
1272               flag_context_list_table_lookup (
1273                 flag_context_list_table,
1274                 token.string, strlen (token.string)));
1275           free (token.string);
1276           continue;
1277 
1278         case token_type_lparen:
1279           if (extract_balanced (mlp, token_type_rparen,
1280                                 inner_context, next_context_iter,
1281                                 arglist_parser_alloc (mlp,
1282                                                       state ? next_shapes : NULL)))
1283             {
1284               arglist_parser_done (argparser, arg);
1285               return true;
1286             }
1287           next_context_iter = null_context_list_iterator;
1288           state = 0;
1289           break;
1290 
1291         case token_type_rparen:
1292           if (delim == token_type_rparen || delim == token_type_eof)
1293             {
1294               arglist_parser_done (argparser, arg);
1295               return false;
1296             }
1297 
1298           next_context_iter = null_context_list_iterator;
1299           state = 0;
1300           continue;
1301 
1302         case token_type_comma:
1303           arg++;
1304           inner_context =
1305             inherited_context (outer_context,
1306                                flag_context_list_iterator_advance (
1307                                  &context_iter));
1308           next_context_iter = passthrough_context_list_iterator;
1309           state = 0;
1310           continue;
1311 
1312         case token_type_eof:
1313           arglist_parser_done (argparser, arg);
1314           return true;
1315 
1316         case token_type_string_literal:
1317           {
1318             lex_pos_ty pos;
1319 
1320             pos.file_name = logical_file_name;
1321             pos.line_number = token.line_number;
1322 
1323             if (extract_all)
1324               {
1325                 char *string = mixed_string_contents (token.mixed_string);
1326                 mixed_string_free (token.mixed_string);
1327                 remember_a_message (mlp, NULL, string, true, false,
1328                                     inner_context, &pos,
1329                                     NULL, token.comment, false);
1330               }
1331             else
1332               {
1333                 /* A string immediately after a symbol means a function call.  */
1334                 if (state)
1335                   {
1336                     struct arglist_parser *tmp_argparser;
1337                     tmp_argparser = arglist_parser_alloc (mlp, next_shapes);
1338 
1339                     arglist_parser_remember (tmp_argparser, 1,
1340                                              token.mixed_string, inner_context,
1341                                              pos.file_name, pos.line_number,
1342                                              token.comment, false);
1343                     arglist_parser_done (tmp_argparser, 1);
1344                   }
1345                 else
1346                   arglist_parser_remember (argparser, arg,
1347                                            token.mixed_string, inner_context,
1348                                            pos.file_name, pos.line_number,
1349                                            token.comment, false);
1350               }
1351           }
1352           drop_reference (token.comment);
1353           next_context_iter = null_context_list_iterator;
1354           state = 0;
1355           continue;
1356 
1357         case token_type_character_constant:
1358         case token_type_lbrace:
1359         case token_type_rbrace:
1360         case token_type_assign:
1361         case token_type_return:
1362         case token_type_plus:
1363         case token_type_arithmetic_operator:
1364         case token_type_equality_test_operator:
1365         case token_type_logic_operator:
1366         case token_type_question:
1367         case token_type_colon:
1368         case token_type_number:
1369         case token_type_string_template:
1370         case token_type_regex_literal:
1371         case token_type_other:
1372           next_context_iter = null_context_list_iterator;
1373           state = 0;
1374           continue;
1375 
1376         default:
1377           abort ();
1378         }
1379     }
1380 }
1381 
1382 void
extract_vala(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1383 extract_vala (FILE *f,
1384               const char *real_filename, const char *logical_filename,
1385               flag_context_list_table_ty *flag_table,
1386               msgdomain_list_ty *mdlp)
1387 {
1388   message_list_ty *mlp = mdlp->item[0]->messages;
1389 
1390   fp = f;
1391   real_file_name = real_filename;
1392   logical_file_name = xstrdup (logical_filename);
1393   line_number = 1;
1394 
1395   phase1_pushback_length = 0;
1396 
1397   last_comment_line = -1;
1398   last_non_comment_line = -1;
1399 
1400   phase3_pushback_length = 0;
1401   last_token_type = token_type_other;
1402 
1403   flag_context_list_table = flag_table;
1404 
1405   init_keywords ();
1406 
1407   /* Eat tokens until eof is seen.  When extract_parenthesized returns
1408      due to an unbalanced closing parenthesis, just restart it.  */
1409   while (!extract_balanced (mlp, token_type_eof,
1410                             null_context, null_context_list_iterator,
1411                             arglist_parser_alloc (mlp, NULL)))
1412     ;
1413 
1414   fp = NULL;
1415   real_file_name = NULL;
1416   logical_file_name = NULL;
1417   line_number = 0;
1418 }
1419