1 /* xgettext Lua backend.
2    Copyright (C) 2012-2013, 2016, 2018-2020 Free Software Foundation, Inc.
3 
4    This file was written by Ľubomír Remák <lubomirr@lubomirr.eu>, 2012.
5 
6    This program is free software: you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
18 
19 #ifdef HAVE_CONFIG_H
20 #include "config.h"
21 #endif
22 
23 /* Specification.  */
24 #include "x-lua.h"
25 
26 #include <errno.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 
31 #include "message.h"
32 #include "rc-str-list.h"
33 #include "xgettext.h"
34 #include "xg-pos.h"
35 #include "xg-mixed-string.h"
36 #include "xg-arglist-context.h"
37 #include "xg-arglist-callshape.h"
38 #include "xg-arglist-parser.h"
39 #include "xg-message.h"
40 #include "error.h"
41 #include "xalloc.h"
42 #include "gettext.h"
43 #include "po-charset.h"
44 
45 #define _(s) gettext(s)
46 
/* Number of elements of the array A.  A must be a true array, not a
   pointer.  The argument is parenthesized so that any array-valued
   expression can be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
48 
49 /* The Lua syntax is defined in the Lua manual sections 3.1 and 9,
50    which can be found at
51    https://www.lua.org/manual/5.2/manual.html#3.1
52    https://www.lua.org/manual/5.2/manual.html#9  */
53 
54 /* If true extract all strings.  */
55 static bool extract_all = false;
56 
57 /* A hash table for keywords.  */
58 static hash_table keywords;
59 static bool default_keywords = true;
60 
/* Set the extract_all flag, requesting that all strings be extracted
   rather than only those that appear as keyword arguments.  */
void
x_lua_extract_all ()
{
  extract_all = true;
}
67 
68 /* Adds a keyword.  Copied from other lexers.  */
69 void
x_lua_keyword(const char * name)70 x_lua_keyword (const char *name)
71 {
72   if (name == NULL)
73     default_keywords = false;
74   else
75     {
76       const char *end;
77       struct callshape shape;
78       const char *colon;
79 
80       if (keywords.table == NULL)
81         hash_init (&keywords, 100);
82 
83       split_keywordspec (name, &end, &shape);
84 
85       /* The characters between name and end should form a valid C identifier.
86          A colon means an invalid parse in split_keywordspec().  */
87       colon = strchr (name, ':');
88       if (colon == NULL || colon >= end)
89         insert_keyword_callshape (&keywords, name, end - name, &shape);
90     }
91 }
92 
93 /* Finish initializing the keywords hash table.
94    Called after argument processing, before each file is processed.  */
95 static void
init_keywords()96 init_keywords ()
97 {
98   if (default_keywords)
99     {
100       /* When adding new keywords here, also update the documentation in
101          xgettext.texi!  */
102       x_lua_keyword ("_");
103       x_lua_keyword ("gettext.gettext");
104       x_lua_keyword ("gettext.dgettext:2");
105       x_lua_keyword ("gettext.dcgettext:2");
106       x_lua_keyword ("gettext.ngettext:1,2");
107       x_lua_keyword ("gettext.dngettext:2,3");
108       x_lua_keyword ("gettext.dcngettext:2,3");
109       default_keywords = false;
110     }
111 }
112 
/* Record the format-string flags of the default Lua keywords and of
   string.format.  The flags are recorded in the order listed.  */
void
init_flag_table_lua ()
{
  static const char *const flag_specs[] =
    {
      "_:1:pass-lua-format",
      "gettext.gettext:1:pass-lua-format",
      "gettext.dgettext:2:pass-lua-format",
      "gettext.dcgettext:2:pass-lua-format",
      "gettext.ngettext:1:pass-lua-format",
      "gettext.ngettext:2:pass-lua-format",
      "gettext.dngettext:2:pass-lua-format",
      "gettext.dngettext:3:pass-lua-format",
      "gettext.dcngettext:2:pass-lua-format",
      "gettext.dcngettext:3:pass-lua-format",
      "string.format:1:lua-format"
    };
  size_t i;

  for (i = 0; i < sizeof (flag_specs) / sizeof (flag_specs[0]); i++)
    xgettext_record_flag (flag_specs[i]);
}
128 
129 
130 /* ======================== Reading of characters.  ======================== */
131 
132 /* The input file stream.  */
133 static FILE *fp;
134 
135 
/* 1. line_number handling.  */

/* Characters pushed back by phase1_ungetc.  phase1_getc returns them
   most-recently-pushed first, before reading from fp again.  */
static unsigned char phase1_pushback[2];
/* Number of valid entries in phase1_pushback.  */
static int phase1_pushback_length;

/* While true, the next character that phase1_getc reads from fp is checked
   for a leading '#' line, which is then skipped.  phase1_getc clears the
   flag on that read.  */
static bool first_character;
142 
143 static int
phase1_getc()144 phase1_getc ()
145 {
146   int c;
147 
148   if (phase1_pushback_length)
149     c = phase1_pushback[--phase1_pushback_length];
150   else
151     {
152       c = getc (fp);
153 
154       if (first_character)
155         {
156           first_character = false;
157 
158           /* Ignore shebang line.  No pushback required in this case.  */
159           if (c == '#')
160             {
161               while (c != '\n' && c != EOF)
162                 c = getc (fp);
163               if (c == '\n')
164                 {
165                   line_number++;
166                   c = getc (fp);
167                 }
168             }
169         }
170 
171       if (c == EOF)
172         {
173           if (ferror (fp))
174             error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
175                    real_file_name);
176           return EOF;
177         }
178     }
179 
180   if (c == '\n')
181     line_number++;
182 
183   return c;
184 }
185 
186 /* Supports 2 characters of pushback.  */
187 
188 static void
phase1_ungetc(int c)189 phase1_ungetc (int c)
190 {
191   if (c != EOF)
192     {
193       if (c == '\n')
194         --line_number;
195 
196       if (phase1_pushback_length == SIZEOF (phase1_pushback))
197         abort ();
198       phase1_pushback[phase1_pushback_length++] = c;
199     }
200 }
201 
202 
/* These are for tracking whether comments count as immediately before
   keyword.  last_comment_line is set by phase2_getc whenever it has read
   a comment.  */
static int last_comment_line;
static int last_non_comment_line;

/* Accumulating comments.  */

/* Text of the comment line currently being collected (not NUL-terminated
   until comment_line_end is called).  */
static char *buffer;
/* Allocated size of buffer, in bytes.  */
static size_t bufmax;
/* Number of bytes of buffer currently in use.  */
static size_t buflen;
213 
/* Begin collecting a new comment line: discard whatever the comment
   buffer currently holds.  The allocation itself is kept for reuse.  */
static inline void
comment_start ()
{
  buflen = 0;
}
219 
220 static inline void
comment_add(int c)221 comment_add (int c)
222 {
223   if (buflen >= bufmax)
224     {
225       bufmax = 2 * bufmax + 10;
226       buffer = xrealloc (buffer, bufmax);
227     }
228   buffer[buflen++] = c;
229 }
230 
231 static inline void
comment_line_end(size_t chars_to_remove)232 comment_line_end (size_t chars_to_remove)
233 {
234   buflen -= chars_to_remove;
235   while (buflen >= 1
236          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
237     --buflen;
238   if (chars_to_remove == 0 && buflen >= bufmax)
239     {
240       bufmax = 2 * bufmax + 10;
241       buffer = xrealloc (buffer, bufmax);
242     }
243   buffer[buflen] = '\0';
244   savable_comment_add (buffer);
245 }
246 
247 /* Eats characters until '\n' and adds them to the comment.  */
248 static void
eat_comment_line()249 eat_comment_line ()
250 {
251   for (;;)
252     {
253       int c = phase1_getc ();
254       if (c == '\n' || c == EOF)
255         {
256           comment_line_end (0);
257           break;
258         }
259 
260       if (!(buflen == 0 && (c == ' ' || c == '\t')))
261         comment_add (c);
262     }
263 }
264 
265 static int
phase2_getc()266 phase2_getc ()
267 {
268   int c;
269   int lineno;
270 
271   c = phase1_getc ();
272 
273   if (c == '-')
274     {
275       c = phase1_getc ();
276 
277       if (c == '-')
278         {
279           /* It starts with '--', so it must be either a short or a long
280              comment.  */
281           c = phase1_getc ();
282 
283           if (c == '[')
284             {
285               c = phase1_getc ();
286 
287               int esigns = 0;
288               while (c == '=')
289                 {
290                   esigns++;
291                   c = phase1_getc ();
292                 }
293 
294               if (c == '[')
295                 {
296                   /* Long comment.  */
297                   bool right_bracket = false;
298                   bool end = false;
299                   int esigns2 = 0;
300 
301                   lineno = line_number;
302                   comment_start ();
303                   while (!end)
304                     {
305                       c = phase1_getc ();
306 
307                       if (c == EOF)
308                         break;
309 
310                       /* Ignore leading spaces and tabs.  */
311                       if (!(buflen == 0 && (c == ' ' || c == '\t')))
312                         {
313                           comment_add (c);
314 
315                           switch (c)
316                             {
317                             case ']':
318                               if (!right_bracket)
319                                 {
320                                   right_bracket = true;
321                                   esigns2 = 0;
322                                 }
323                               else
324                                 {
325                                   if (esigns2 == esigns)
326                                     {
327                                       comment_line_end (2 + esigns);
328                                       end = true;
329                                     }
330                                 }
331                               break;
332 
333                             case '=':
334                               if (right_bracket)
335                                 esigns2++;
336                               break;
337 
338                             case '\n':
339                               comment_line_end (1);
340                               comment_start ();
341                               lineno = line_number;
342                               /* Intentionally not breaking.  */
343 
344                             default:
345                               right_bracket = false;
346                             }
347                         }
348                     }
349                   last_comment_line = lineno;
350                   return ' ';
351                 }
352               else
353                 {
354                   /* One line (short) comment, starting with '--[=...='.  */
355                   lineno = last_comment_line;
356                   comment_start ();
357                   comment_add ('[');
358                   while (esigns--)
359                     comment_add ('=');
360                   phase1_ungetc (c);
361                   eat_comment_line ();
362                   last_comment_line = lineno;
363                   return '\n';
364                 }
365             }
366           else
367             {
368               /* One line (short) comment.  */
369               lineno = line_number;
370               comment_start ();
371               phase1_ungetc (c);
372               eat_comment_line ();
373               last_comment_line = lineno;
374               return '\n';
375             }
376         }
377       else
378         {
379           /* Minus sign.  */
380           phase1_ungetc (c);
381           return '-';
382         }
383     }
384   else
385     return c;
386 }
387 
388 
389 /* ========================== Reading of tokens.  ========================== */
390 
/* The kinds of tokens produced by phase3_get.  */
enum token_type_ty
{
  token_type_eof,
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_lbracket,          /* [ */
  token_type_rbracket,          /* ] */
  token_type_comma,             /* , */
  token_type_dot,               /* . */
  token_type_doubledot,         /* .. */
  token_type_operator1,         /* + - * / % not # ^ */
  token_type_operator2,         /* < > <= >= ~= == = and or */
  token_type_string,            /* quoted or long-bracket string */
  token_type_number,
  token_type_symbol,            /* identifier other than not, and, or */
  token_type_other              /* ; and ... */
};
408 
typedef enum token_type_ty token_type_ty;

/* A token, as passed between the phase 3, 4 and 5 readers.  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string; /* for token_type_string, token_type_symbol */
  refcounted_string_list_ty *comment;  /* for token_type_string */
  int line_number;      /* value of line_number when reading of the token began */
};
419 
420 /* Free the memory pointed to by a 'struct token_ty'.  */
421 static inline void
free_token(token_ty * tp)422 free_token (token_ty *tp)
423 {
424   if (tp->type == token_type_string || tp->type == token_type_symbol)
425     free (tp->string);
426   if (tp->type == token_type_string)
427     drop_reference (tp->comment);
428 }
429 
/* Our current string.  */
/* Number of bytes of string_buf currently in use.  */
static int string_buf_length;
/* Allocated size of string_buf, in bytes.  */
static int string_buf_alloc;
/* Contents of the string or symbol being read; NUL-terminated only after
   string_end.  */
static char *string_buf;
434 
/* Begin accumulating a new string: discard the current contents of the
   string buffer.  The allocation itself is kept for reuse.  */
static void
string_start ()
{
  string_buf_length = 0;
}
440 
441 static void
string_add(int c)442 string_add (int c)
443 {
444   if (string_buf_length >= string_buf_alloc)
445     {
446       string_buf_alloc = 2 * string_buf_alloc + 10;
447       string_buf = xrealloc (string_buf, string_buf_alloc);
448     }
449 
450   string_buf[string_buf_length++] = c;
451 }
452 
453 static void
string_end()454 string_end ()
455 {
456   if (string_buf_length >= string_buf_alloc)
457     {
458       string_buf_alloc = string_buf_alloc + 1;
459       string_buf = xrealloc (string_buf, string_buf_alloc);
460     }
461 
462   string_buf[string_buf_length] = '\0';
463 }
464 
465 
/* We need 3 pushback tokens for string optimization.
   Tokens pushed back by phase3_unget are returned by phase3_get
   most-recently-pushed first.  */
static int phase3_pushback_length;
static token_ty phase3_pushback[3];
469 
470 
471 static void
phase3_unget(token_ty * tp)472 phase3_unget (token_ty *tp)
473 {
474   if (tp->type != token_type_eof)
475     {
476       if (phase3_pushback_length == SIZEOF (phase3_pushback))
477         abort ();
478       phase3_pushback[phase3_pushback_length++] = *tp;
479     }
480 }
481 
482 static void
phase3_get(token_ty * tp)483 phase3_get (token_ty *tp)
484 {
485   int c;
486   int c2;
487   int c_start;
488 
489   if (phase3_pushback_length)
490     {
491       *tp = phase3_pushback[--phase3_pushback_length];
492       return;
493     }
494 
495   tp->string = NULL;
496 
497   for (;;)
498     {
499       tp->line_number = line_number;
500       c = phase2_getc ();
501 
502       switch (c)
503         {
504         case EOF:
505           tp->type = token_type_eof;
506           return;
507 
508         case '\n':
509           if (last_non_comment_line > last_comment_line)
510             savable_comment_reset ();
511           /* Intentionally not breaking.  */
512         case ' ':
513         case '\t':
514         case '\f':
515           continue;
516 
517         case '+':
518         case '-':
519         case '*':
520         case '/':
521         case '^':
522         case '%':
523         case '#':
524           tp->type = token_type_operator1;
525           return;
526         case '<':
527         case '>':
528         case '=':
529           c2 = phase1_getc ();
530           if (c2 != '=')
531             phase1_ungetc (c2);
532           tp->type = token_type_operator2;
533           return;
534         case '~':
535           c2 = phase1_getc ();
536           if (c2 == '=')
537             {
538               tp->type = token_type_operator2;
539               return;
540             }
541           else
542             phase1_ungetc (c2);
543           continue;
544         case '(':
545           tp->type = token_type_lparen;
546           return;
547         case ')':
548           tp->type = token_type_rparen;
549           return;
550         case ',':
551           tp->type = token_type_comma;
552           return;
553 
554         case ';':
555           tp->type = token_type_other;
556           return;
557 
558           /* There are three operators beginning with a dot.  '.',
559              '..' and '...'.  The most useful for us is the string
560              concatenation operator ('..').  */
561         case '.':
562           c = phase1_getc ();
563           if (c == '.')
564             {
565               c = phase1_getc ();
566               if (c == '.')
567                 {
568                   tp->type = token_type_other;
569                   return;
570                 }
571               else
572                 {
573                   phase1_ungetc (c);
574                   tp->type = token_type_doubledot;
575                   return;
576                 }
577             }
578           else if (c >= '0' && c <= '9')
579             {
580               /* It's a number.  We aren't interested in the actual
581                  numeric value, so ignore the dot and let next
582                  iteration eat the number.  */
583               phase1_ungetc (c);
584               continue;
585             }
586           else
587             {
588               phase1_ungetc (c);
589               tp->type = token_type_dot;
590               return;
591             }
592 
593         case '"':
594         case '\'':
595           c_start = c;
596           string_start ();
597 
598           for (;;)
599             {
600               /* We need unprocessed characters from phase 1.  */
601               c = phase1_getc ();
602 
603               if (c == EOF || c == c_start || c == '\n')
604                 {
605                   /* End of string.  */
606                   string_end ();
607                   tp->string = xstrdup (string_buf);
608                   tp->comment = add_reference (savable_comment);
609                   tp->type = token_type_string;
610                   return;
611                 }
612 
613               /* We got '\', this is probably an escape sequence.  */
614               if (c == '\\')
615                 {
616                   c = phase1_getc ();
617                   switch (c)
618                     {
619                     case 'a':
620                       string_add ('\a');
621                       break;
622                     case 'b':
623                       string_add ('\b');
624                       break;
625                     case 'f':
626                       string_add ('\f');
627                       break;
628                     case 'n':
629                       string_add ('\n');
630                       break;
631                     case 'r':
632                       string_add ('\r');
633                       break;
634                     case 't':
635                       string_add ('\t');
636                       break;
637                     case 'v':
638                       string_add ('\v');
639                       break;
640                     case 'x':
641                       {
642                         int num = 0;
643                         int i = 0;
644 
645                         for (i = 0; i < 2; i++)
646                           {
647                             c = phase1_getc ();
648                             if (c >= '0' && c <= '9')
649                               num += c - '0';
650                             else if (c >= 'a' && c <= 'f')
651                               num += c - 'a' + 10;
652                             else if (c >= 'A' && c <= 'F')
653                               num += c - 'A' + 10;
654                             else
655                               {
656                                 phase1_ungetc (c);
657                                 break;
658                               }
659 
660                             if (i == 0)
661                               num *= 16;
662                           }
663 
664                         if (i == 2)
665                           string_add (num);
666                       }
667 
668                       break;
669                     case 'z':
670                       /* Ignore the following whitespace.  */
671                       do
672                         {
673                           c = phase1_getc ();
674                         }
675                       while (c == ' ' || c == '\n' || c == '\t' || c == '\r'
676                              || c == '\f' || c == '\v');
677 
678                       phase1_ungetc (c);
679 
680                       break;
681                     default:
682                       /* Check if it's a '\ddd' sequence.  */
683                       if (c >= '0' && c <= '9')
684                         {
685                           int num = 0;
686                           int i = 0;
687 
688                           while (c >= '0' && c <= '9' && i < 3)
689                             {
690                               num *= 10;
691                               num += (c - '0');
692                               c = phase1_getc ();
693                               i++;
694                             }
695 
696                           /* The last read character is either a
697                              non-number or another number after our
698                              '\ddd' sequence.  We need to ungetc it.  */
699                           phase1_ungetc (c);
700 
701                           /* The sequence number is too big, this
702                              causes a lexical error.  Ignore it.  */
703                           if (num < 256)
704                             string_add (num);
705                         }
706                       else
707                         string_add (c);
708                     }
709                 }
710               else
711                 string_add (c);
712             }
713           break;
714 
715         case '[':
716           c = phase1_getc ();
717 
718           /* Count the number of equal signs.  */
719           int esigns = 0;
720           while (c == '=')
721             {
722               esigns++;
723               c = phase1_getc ();
724             }
725 
726           if (c != '[')
727             {
728               /* We did not find what we were looking for, ungetc it.  */
729               phase1_ungetc (c);
730               if (esigns == 0)
731                 {
732                   /* Our current character isn't '[' and we got 0 equal
733                      signs, so the first '[' must have been a left
734                      bracket.  */
735                   tp->type = token_type_lbracket;
736                   return;
737                 }
738               else
739                 /* Lexical error, ignore it.  */
740                 continue;
741             }
742 
743           /* Found an opening long bracket.  */
744           string_start ();
745 
746           /* See if it is immediately followed by a newline.  */
747           c = phase1_getc ();
748           if (c != '\n')
749             phase1_ungetc (c);
750 
751           for (;;)
752             {
753               c = phase1_getc ();
754 
755               if (c == EOF)
756                 {
757                   string_end ();
758                   tp->string = xstrdup (string_buf);
759                   tp->comment = add_reference (savable_comment);
760                   tp->type = token_type_string;
761                   return;
762                 }
763               if (c == ']')
764                 {
765                   c = phase1_getc ();
766 
767                   /* Count the number of equal signs.  */
768                   int esigns2 = 0;
769                   while (c == '=')
770                     {
771                       esigns2++;
772                       c = phase1_getc ();
773                     }
774 
775                   if (c == ']' && esigns == esigns2)
776                     {
777                       /* We got ']==...==]', where the number of equal
778                          signs matches the number of equal signs in
779                          the opening bracket.  */
780                       string_end ();
781                       tp->string = xstrdup (string_buf);
782                       tp->comment = add_reference (savable_comment);
783                       tp->type = token_type_string;
784                       return;
785                     }
786                   else
787                     {
788                       /* Otherwise we got either ']==' garbage or
789                          ']==...==]' with a different number of equal
790                          signs.
791 
792                          Add ']' and equal signs to the string, and
793                          ungetc the current character, because the
794                          second ']' might be a part of another closing
795                          long bracket, e.g. '==]===]'.  */
796                       phase1_ungetc (c);
797 
798                       string_add (']');
799                       while (esigns2--)
800                         string_add ('=');
801                     }
802                 }
803               else
804                 string_add (c);
805             }
806           break;
807 
808         case ']':
809           tp->type = token_type_rbracket;
810           return;
811 
812         default:
813           if (c >= '0' && c <= '9')
814             {
815               while (c >= '0' && c <= '9')
816                 c = phase1_getc ();
817 
818               if (c == '.')
819                 {
820                   c = phase1_getc ();
821                   while (c >= '0' && c <= '9')
822                     c = phase1_getc ();
823                 }
824 
825               if (c == 'e' || c == 'E')
826                 {
827                   if (c == '+' || c == '-')
828                     c = phase1_getc ();
829                   while (c >= '0' && c <= '9')
830                     c = phase1_getc ();
831                 }
832 
833               phase1_ungetc (c);
834 
835               tp->type = token_type_number;
836               return;
837             }
838           else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
839                    || c == '_')
840             {
841               string_start ();
842               while ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
843                      || c == '_' || (c >= '0' && c <= '9'))
844                 {
845                   string_add (c);
846                   c = phase1_getc ();
847                 }
848               string_end ();
849               phase1_ungetc (c);
850 
851               if (strcmp (string_buf, "not") == 0)
852                 tp->type = token_type_operator1;
853               else if (strcmp (string_buf, "and") == 0)
854                 tp->type = token_type_operator2;
855               else if (strcmp (string_buf, "or") == 0)
856                 tp->type = token_type_operator2;
857               else
858                 {
859                   tp->string = xstrdup (string_buf);
860                   tp->type = token_type_symbol;
861                 }
862               return;
863             }
864           else
865             tp->type = token_type_other;
866         }
867     }
868 }
869 
/* String and symbol concatenation.  */

/* Type of the token most recently returned by phase4_get.  */
static token_type_ty phase4_last;

/* We need 3 pushback tokens for string and symbol concatenation.
   Tokens pushed back by phase4_unget are returned by phase4_get
   most-recently-pushed first.  */
static int phase4_pushback_length;
static token_ty phase4_pushback[3];
877 
878 static void
phase4_unget(token_ty * tp)879 phase4_unget (token_ty *tp)
880 {
881   if (tp->type != token_type_eof)
882     {
883       if (phase4_pushback_length == SIZEOF (phase4_pushback))
884         abort ();
885       phase4_pushback[phase4_pushback_length++] = *tp;
886     }
887 }
888 
889 static void
phase4_get(token_ty * tp)890 phase4_get (token_ty *tp)
891 {
892   if (phase4_pushback_length)
893     {
894       *tp = phase4_pushback[--phase4_pushback_length];
895       phase4_last = tp->type;
896       return;
897     }
898 
899   phase3_get (tp);
900   if (tp->type == token_type_string
901       && !(phase4_last == token_type_operator1
902            || phase4_last == token_type_dot
903            || phase4_last == token_type_symbol
904            || phase4_last == token_type_doubledot
905            || phase4_last == token_type_rparen))
906     {
907       char *sum = tp->string;
908       size_t sum_len = strlen (sum);
909 
910       for (;;)
911         {
912           token_ty token2;
913 
914           phase3_get (&token2);
915           if (token2.type == token_type_doubledot)
916             {
917               token_ty token3;
918 
919               phase3_get (&token3);
920               if (token3.type == token_type_string)
921                 {
922                   token_ty token_after;
923 
924                   phase3_get (&token_after);
925                   if (token_after.type != token_type_operator1)
926                     {
927                       char *addend = token3.string;
928                       size_t addend_len = strlen (addend);
929 
930                       sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
931                       memcpy (sum + sum_len, addend, addend_len + 1);
932                       sum_len += addend_len;
933 
934                       phase3_unget (&token_after);
935                       free_token (&token3);
936                       free_token (&token2);
937                       continue;
938                     }
939                   phase3_unget (&token_after);
940                 }
941               phase3_unget (&token3);
942             }
943           phase3_unget (&token2);
944           break;
945         }
946       tp->string = sum;
947     }
948   phase4_last = tp->type;
949 }
950 
951 static void
phase5_get(token_ty * tp)952 phase5_get (token_ty *tp)
953 {
954   phase4_get (tp);
955 
956   /* Combine symbol1 . ... . symbolN to a single strings, so that
957      we can recognize function calls like
958      gettext.gettext.  The information present for
959      symbolI.....symbolN has precedence over the information for
960      symbolJ.....symbolN with J > I.  */
961   if (tp->type == token_type_symbol)
962     {
963       char *sum = tp->string;
964       size_t sum_len = strlen (sum);
965 
966       for (;;)
967         {
968           token_ty token2;
969 
970           phase4_get (&token2);
971           if (token2.type == token_type_dot)
972             {
973               token_ty token3;
974 
975               phase4_get (&token3);
976               if (token3.type == token_type_symbol)
977                 {
978                   char *addend = token3.string;
979                   size_t addend_len = strlen (addend);
980 
981                   sum = (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
982                   sum[sum_len] = '.';
983                   memcpy (sum + sum_len + 1, addend, addend_len + 1);
984                   sum_len += 1 + addend_len;
985 
986                   free_token (&token2);
987                   free_token (&token3);
988                   continue;
989                 }
990               phase4_unget (&token3);
991             }
992           phase4_unget (&token2);
993           break;
994         }
995       tp->string = sum;
996     }
997 }
998 
/* Read the next token into *TOK.  This is the entry point of the lexer;
   it simply delegates to the last reading phase, phase5_get.  */
static void
x_lua_lex (token_ty *tok)
{
  phase5_get (tok);
}
1004 
1005 
1006 /* ========================= Extracting strings.  ========================== */
1007 
1008 
1009 /* Context lookup table.  */
1010 static flag_context_list_table_ty *flag_context_list_table;
1011 
1012 
1013 /* The file is broken into tokens.  Scan the token stream, looking for
1014    a keyword, followed by a left paren, followed by a string.  When we
1015    see this sequence, we have something to remember.  We assume we are
1016    looking at a valid Lua program, and leave the complaints about the
1017    grammar to the compiler.
1018 
1019      Normal handling: Look for
1020        keyword ( ... msgid ... )
1021        keyword msgid
1022      Plural handling: Look for
1023        keyword ( ... msgid ... msgid_plural ... )
1024 
1025    We use recursion because the arguments before msgid or between msgid
1026    and msgid_plural can contain subexpressions of the same form.  */
1027 
1028 /* Extract messages until the next balanced closing parenthesis or bracket.
1029    Extracted messages are added to MLP.
1030    DELIM can be either token_type_rparen or token_type_rbracket, or
1031    token_type_eof to accept both.
1032    Return true upon eof, false upon closing parenthesis or bracket.  */
1033 static bool
extract_balanced(message_list_ty * mlp,token_type_ty delim,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1034 extract_balanced (message_list_ty *mlp, token_type_ty delim,
1035                   flag_context_ty outer_context,
1036                   flag_context_list_iterator_ty context_iter,
1037                   struct arglist_parser *argparser)
1038 {
1039   /* Current argument number.  */
1040   int arg = 1;
1041   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1042   int state;
1043   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1044   const struct callshapes *next_shapes = NULL;
1045   /* Context iterator that will be used if the next token is a '('.  */
1046   flag_context_list_iterator_ty next_context_iter =
1047     passthrough_context_list_iterator;
1048   /* Current context.  */
1049   flag_context_ty inner_context =
1050     inherited_context (outer_context,
1051                        flag_context_list_iterator_advance (&context_iter));
1052 
1053   /* Start state is 0.  */
1054   state = 0;
1055 
1056   for (;;)
1057     {
1058       token_ty token;
1059 
1060       x_lua_lex (&token);
1061 
1062       switch (token.type)
1063         {
1064         case token_type_symbol:
1065           {
1066             void *keyword_value;
1067 
1068             if (hash_find_entry (&keywords, token.string, strlen (token.string),
1069                                  &keyword_value)
1070                 == 0)
1071               {
1072                 next_shapes = (const struct callshapes *) keyword_value;
1073                 state = 1;
1074               }
1075             else
1076               state = 0;
1077           }
1078           next_context_iter =
1079             flag_context_list_iterator (
1080               flag_context_list_table_lookup (
1081                 flag_context_list_table,
1082                 token.string, strlen (token.string)));
1083           free (token.string);
1084           continue;
1085 
1086         case token_type_lparen:
1087           if (extract_balanced (mlp, token_type_rparen,
1088                                 inner_context, next_context_iter,
1089                                 arglist_parser_alloc (mlp,
1090                                                       state ? next_shapes : NULL)))
1091             {
1092               arglist_parser_done (argparser, arg);
1093               return true;
1094             }
1095           next_context_iter = null_context_list_iterator;
1096           state = 0;
1097           break;
1098 
1099         case token_type_rparen:
1100           if (delim == token_type_rparen || delim == token_type_eof)
1101             {
1102               arglist_parser_done (argparser, arg);
1103               return false;
1104             }
1105 
1106           next_context_iter = null_context_list_iterator;
1107           state = 0;
1108           continue;
1109 
1110         case token_type_lbracket:
1111           if (extract_balanced (mlp, token_type_rbracket,
1112                                 null_context, null_context_list_iterator,
1113                                 arglist_parser_alloc (mlp, NULL)))
1114             {
1115               arglist_parser_done (argparser, arg);
1116               return true;
1117             }
1118           next_context_iter = null_context_list_iterator;
1119           state = 0;
1120           break;
1121 
1122         case token_type_rbracket:
1123           if (delim == token_type_rbracket || delim == token_type_eof)
1124             {
1125               arglist_parser_done (argparser, arg);
1126               return false;
1127             }
1128 
1129           next_context_iter = null_context_list_iterator;
1130           state = 0;
1131           continue;
1132 
1133         case token_type_comma:
1134           arg++;
1135           inner_context =
1136             inherited_context (outer_context,
1137                                flag_context_list_iterator_advance (
1138                                  &context_iter));
1139           next_context_iter = passthrough_context_list_iterator;
1140           state = 0;
1141           continue;
1142 
1143         case token_type_eof:
1144           arglist_parser_done (argparser, arg);
1145           return true;
1146 
1147         case token_type_string:
1148           {
1149             lex_pos_ty pos;
1150             pos.file_name = logical_file_name;
1151             pos.line_number = token.line_number;
1152 
1153             if (extract_all)
1154               remember_a_message (mlp, NULL, token.string, false, false,
1155                                   inner_context, &pos,
1156                                   NULL, token.comment, false);
1157             else
1158               {
1159                 mixed_string_ty *ms =
1160                   mixed_string_alloc_simple (token.string, lc_string,
1161                                              pos.file_name, pos.line_number);
1162                 free (token.string);
1163                 /* A string immediately after a symbol means a function call.  */
1164                 if (state)
1165                   {
1166                     struct arglist_parser *tmp_argparser;
1167                     tmp_argparser = arglist_parser_alloc (mlp, next_shapes);
1168 
1169                     arglist_parser_remember (tmp_argparser, 1, ms,
1170                                              inner_context,
1171                                              pos.file_name, pos.line_number,
1172                                              token.comment, false);
1173                     arglist_parser_done (tmp_argparser, 1);
1174                   }
1175                 else
1176                   arglist_parser_remember (argparser, arg, ms,
1177                                            inner_context,
1178                                            pos.file_name, pos.line_number,
1179                                            token.comment, false);
1180               }
1181           }
1182           drop_reference (token.comment);
1183           next_context_iter = null_context_list_iterator;
1184           state = 0;
1185           continue;
1186 
1187         case token_type_dot:
1188         case token_type_doubledot:
1189         case token_type_operator1:
1190         case token_type_operator2:
1191         case token_type_number:
1192         case token_type_other:
1193           next_context_iter = null_context_list_iterator;
1194           state = 0;
1195           continue;
1196 
1197         default:
1198           abort ();
1199         }
1200     }
1201 }
1202 
1203 void
extract_lua(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1204 extract_lua (FILE *f,
1205              const char *real_filename, const char *logical_filename,
1206              flag_context_list_table_ty *flag_table,
1207              msgdomain_list_ty *mdlp)
1208 {
1209   message_list_ty *mlp = mdlp->item[0]->messages;
1210 
1211   fp = f;
1212   real_file_name = real_filename;
1213   logical_file_name = xstrdup (logical_filename);
1214   line_number = 1;
1215 
1216   phase1_pushback_length = 0;
1217   first_character = true;
1218 
1219   last_comment_line = -1;
1220   last_non_comment_line = -1;
1221 
1222   phase3_pushback_length = 0;
1223 
1224   phase4_last = token_type_eof;
1225   phase4_pushback_length = 0;
1226 
1227   flag_context_list_table = flag_table;
1228 
1229   init_keywords ();
1230 
1231   /* Eat tokens until eof is seen.  When extract_parenthesized returns
1232      due to an unbalanced closing parenthesis, just restart it.  */
1233   while (!extract_balanced (mlp, token_type_eof,
1234                             null_context, null_context_list_iterator,
1235                             arglist_parser_alloc (mlp, NULL)))
1236     ;
1237 
1238   fp = NULL;
1239   real_file_name = NULL;
1240   logical_file_name = NULL;
1241   line_number = 0;
1242 }
1243