1 /* xgettext Lua backend.
2 Copyright (C) 2012-2013, 2016, 2018-2020 Free Software Foundation, Inc.
3
4 This file was written by Ľubomír Remák <lubomirr@lubomirr.eu>, 2012.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19 #ifdef HAVE_CONFIG_H
20 #include "config.h"
21 #endif
22
23 /* Specification. */
24 #include "x-lua.h"
25
26 #include <errno.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30
31 #include "message.h"
32 #include "rc-str-list.h"
33 #include "xgettext.h"
34 #include "xg-pos.h"
35 #include "xg-mixed-string.h"
36 #include "xg-arglist-context.h"
37 #include "xg-arglist-callshape.h"
38 #include "xg-arglist-parser.h"
39 #include "xg-message.h"
40 #include "error.h"
41 #include "xalloc.h"
42 #include "gettext.h"
43 #include "po-charset.h"
44
45 #define _(s) gettext(s)
46
47 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
48
49 /* The Lua syntax is defined in the Lua manual sections 3.1 and 9,
50 which can be found at
51 https://www.lua.org/manual/5.2/manual.html#3.1
52 https://www.lua.org/manual/5.2/manual.html#9 */
53
54 /* If true extract all strings. */
55 static bool extract_all = false;
56
57 /* A hash table for keywords. */
58 static hash_table keywords;
59 static bool default_keywords = true;
60
61 /* Set extract_all flag (gettext will extract all strings). */
62 void
x_lua_extract_all()63 x_lua_extract_all ()
64 {
65 extract_all = true;
66 }
67
68 /* Adds a keyword. Copied from other lexers. */
69 void
x_lua_keyword(const char * name)70 x_lua_keyword (const char *name)
71 {
72 if (name == NULL)
73 default_keywords = false;
74 else
75 {
76 const char *end;
77 struct callshape shape;
78 const char *colon;
79
80 if (keywords.table == NULL)
81 hash_init (&keywords, 100);
82
83 split_keywordspec (name, &end, &shape);
84
85 /* The characters between name and end should form a valid C identifier.
86 A colon means an invalid parse in split_keywordspec(). */
87 colon = strchr (name, ':');
88 if (colon == NULL || colon >= end)
89 insert_keyword_callshape (&keywords, name, end - name, &shape);
90 }
91 }
92
93 /* Finish initializing the keywords hash table.
94 Called after argument processing, before each file is processed. */
95 static void
init_keywords()96 init_keywords ()
97 {
98 if (default_keywords)
99 {
100 /* When adding new keywords here, also update the documentation in
101 xgettext.texi! */
102 x_lua_keyword ("_");
103 x_lua_keyword ("gettext.gettext");
104 x_lua_keyword ("gettext.dgettext:2");
105 x_lua_keyword ("gettext.dcgettext:2");
106 x_lua_keyword ("gettext.ngettext:1,2");
107 x_lua_keyword ("gettext.dngettext:2,3");
108 x_lua_keyword ("gettext.dcngettext:2,3");
109 default_keywords = false;
110 }
111 }
112
/* Record the built-in flag specifications for Lua by passing each of
   them, in order, to xgettext_record_flag.  */
void
init_flag_table_lua (void)
{
  static const char *const flag_specs[] =
    {
      "_:1:pass-lua-format",
      "gettext.gettext:1:pass-lua-format",
      "gettext.dgettext:2:pass-lua-format",
      "gettext.dcgettext:2:pass-lua-format",
      "gettext.ngettext:1:pass-lua-format",
      "gettext.ngettext:2:pass-lua-format",
      "gettext.dngettext:2:pass-lua-format",
      "gettext.dngettext:3:pass-lua-format",
      "gettext.dcngettext:2:pass-lua-format",
      "gettext.dcngettext:3:pass-lua-format",
      "string.format:1:lua-format"
    };
  size_t i;

  for (i = 0; i < SIZEOF (flag_specs); i++)
    xgettext_record_flag (flag_specs[i]);
}
128
129
130 /* ======================== Reading of characters. ======================== */
131
132 /* The input file stream. */
133 static FILE *fp;
134
135
136 /* 1. line_number handling. */
137
138 static unsigned char phase1_pushback[2];
139 static int phase1_pushback_length;
140
141 static bool first_character;
142
143 static int
phase1_getc()144 phase1_getc ()
145 {
146 int c;
147
148 if (phase1_pushback_length)
149 c = phase1_pushback[--phase1_pushback_length];
150 else
151 {
152 c = getc (fp);
153
154 if (first_character)
155 {
156 first_character = false;
157
158 /* Ignore shebang line. No pushback required in this case. */
159 if (c == '#')
160 {
161 while (c != '\n' && c != EOF)
162 c = getc (fp);
163 if (c == '\n')
164 {
165 line_number++;
166 c = getc (fp);
167 }
168 }
169 }
170
171 if (c == EOF)
172 {
173 if (ferror (fp))
174 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
175 real_file_name);
176 return EOF;
177 }
178 }
179
180 if (c == '\n')
181 line_number++;
182
183 return c;
184 }
185
186 /* Supports 2 characters of pushback. */
187
188 static void
phase1_ungetc(int c)189 phase1_ungetc (int c)
190 {
191 if (c != EOF)
192 {
193 if (c == '\n')
194 --line_number;
195
196 if (phase1_pushback_length == SIZEOF (phase1_pushback))
197 abort ();
198 phase1_pushback[phase1_pushback_length++] = c;
199 }
200 }
201
202
/* These are for tracking whether comments count as immediately before
   keyword.  */
static int last_comment_line;
static int last_non_comment_line;

/* Accumulating comments.  */

/* Growable buffer holding the comment line being collected:
   BUFLEN bytes are in use out of BUFMAX allocated.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;

/* Start collecting a new comment line: forget the buffered characters
   but keep the allocation.  */
static inline void
comment_start (void)
{
  buflen = 0;
}
219
220 static inline void
comment_add(int c)221 comment_add (int c)
222 {
223 if (buflen >= bufmax)
224 {
225 bufmax = 2 * bufmax + 10;
226 buffer = xrealloc (buffer, bufmax);
227 }
228 buffer[buflen++] = c;
229 }
230
231 static inline void
comment_line_end(size_t chars_to_remove)232 comment_line_end (size_t chars_to_remove)
233 {
234 buflen -= chars_to_remove;
235 while (buflen >= 1
236 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
237 --buflen;
238 if (chars_to_remove == 0 && buflen >= bufmax)
239 {
240 bufmax = 2 * bufmax + 10;
241 buffer = xrealloc (buffer, bufmax);
242 }
243 buffer[buflen] = '\0';
244 savable_comment_add (buffer);
245 }
246
247 /* Eats characters until '\n' and adds them to the comment. */
248 static void
eat_comment_line()249 eat_comment_line ()
250 {
251 for (;;)
252 {
253 int c = phase1_getc ();
254 if (c == '\n' || c == EOF)
255 {
256 comment_line_end (0);
257 break;
258 }
259
260 if (!(buflen == 0 && (c == ' ' || c == '\t')))
261 comment_add (c);
262 }
263 }
264
265 static int
phase2_getc()266 phase2_getc ()
267 {
268 int c;
269 int lineno;
270
271 c = phase1_getc ();
272
273 if (c == '-')
274 {
275 c = phase1_getc ();
276
277 if (c == '-')
278 {
279 /* It starts with '--', so it must be either a short or a long
280 comment. */
281 c = phase1_getc ();
282
283 if (c == '[')
284 {
285 c = phase1_getc ();
286
287 int esigns = 0;
288 while (c == '=')
289 {
290 esigns++;
291 c = phase1_getc ();
292 }
293
294 if (c == '[')
295 {
296 /* Long comment. */
297 bool right_bracket = false;
298 bool end = false;
299 int esigns2 = 0;
300
301 lineno = line_number;
302 comment_start ();
303 while (!end)
304 {
305 c = phase1_getc ();
306
307 if (c == EOF)
308 break;
309
310 /* Ignore leading spaces and tabs. */
311 if (!(buflen == 0 && (c == ' ' || c == '\t')))
312 {
313 comment_add (c);
314
315 switch (c)
316 {
317 case ']':
318 if (!right_bracket)
319 {
320 right_bracket = true;
321 esigns2 = 0;
322 }
323 else
324 {
325 if (esigns2 == esigns)
326 {
327 comment_line_end (2 + esigns);
328 end = true;
329 }
330 }
331 break;
332
333 case '=':
334 if (right_bracket)
335 esigns2++;
336 break;
337
338 case '\n':
339 comment_line_end (1);
340 comment_start ();
341 lineno = line_number;
342 /* Intentionally not breaking. */
343
344 default:
345 right_bracket = false;
346 }
347 }
348 }
349 last_comment_line = lineno;
350 return ' ';
351 }
352 else
353 {
354 /* One line (short) comment, starting with '--[=...='. */
355 lineno = last_comment_line;
356 comment_start ();
357 comment_add ('[');
358 while (esigns--)
359 comment_add ('=');
360 phase1_ungetc (c);
361 eat_comment_line ();
362 last_comment_line = lineno;
363 return '\n';
364 }
365 }
366 else
367 {
368 /* One line (short) comment. */
369 lineno = line_number;
370 comment_start ();
371 phase1_ungetc (c);
372 eat_comment_line ();
373 last_comment_line = lineno;
374 return '\n';
375 }
376 }
377 else
378 {
379 /* Minus sign. */
380 phase1_ungetc (c);
381 return '-';
382 }
383 }
384 else
385 return c;
386 }
387
388
389 /* ========================== Reading of tokens. ========================== */
390
/* Kinds of tokens produced by the lexer.  */
typedef enum token_type_ty
{
  token_type_eof,
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_lbracket,          /* [ */
  token_type_rbracket,          /* ] */
  token_type_comma,             /* , */
  token_type_dot,               /* . */
  token_type_doubledot,         /* .. */
  token_type_operator1,         /* + - * / % not # ^ */
  token_type_operator2,         /* < > <= >= ~= == and or */
  token_type_string,
  token_type_number,
  token_type_symbol,
  token_type_other
} token_type_ty;
410
411 typedef struct token_ty token_ty;
412 struct token_ty
413 {
414 token_type_ty type;
415 char *string; /* for token_type_string_literal, token_type_symbol */
416 refcounted_string_list_ty *comment; /* for token_type_string_literal */
417 int line_number;
418 };
419
420 /* Free the memory pointed to by a 'struct token_ty'. */
421 static inline void
free_token(token_ty * tp)422 free_token (token_ty *tp)
423 {
424 if (tp->type == token_type_string || tp->type == token_type_symbol)
425 free (tp->string);
426 if (tp->type == token_type_string)
427 drop_reference (tp->comment);
428 }
429
/* Our current string: STRING_BUF_LENGTH bytes are in use out of
   STRING_BUF_ALLOC allocated.  */
static int string_buf_length;
static int string_buf_alloc;
static char *string_buf;

/* Start accumulating a new string; the allocation is kept.  */
static void
string_start (void)
{
  string_buf_length = 0;
}
440
441 static void
string_add(int c)442 string_add (int c)
443 {
444 if (string_buf_length >= string_buf_alloc)
445 {
446 string_buf_alloc = 2 * string_buf_alloc + 10;
447 string_buf = xrealloc (string_buf, string_buf_alloc);
448 }
449
450 string_buf[string_buf_length++] = c;
451 }
452
453 static void
string_end()454 string_end ()
455 {
456 if (string_buf_length >= string_buf_alloc)
457 {
458 string_buf_alloc = string_buf_alloc + 1;
459 string_buf = xrealloc (string_buf, string_buf_alloc);
460 }
461
462 string_buf[string_buf_length] = '\0';
463 }
464
465
466 /* We need 3 pushback tokens for string optimization. */
467 static int phase3_pushback_length;
468 static token_ty phase3_pushback[3];
469
470
471 static void
phase3_unget(token_ty * tp)472 phase3_unget (token_ty *tp)
473 {
474 if (tp->type != token_type_eof)
475 {
476 if (phase3_pushback_length == SIZEOF (phase3_pushback))
477 abort ();
478 phase3_pushback[phase3_pushback_length++] = *tp;
479 }
480 }
481
482 static void
phase3_get(token_ty * tp)483 phase3_get (token_ty *tp)
484 {
485 int c;
486 int c2;
487 int c_start;
488
489 if (phase3_pushback_length)
490 {
491 *tp = phase3_pushback[--phase3_pushback_length];
492 return;
493 }
494
495 tp->string = NULL;
496
497 for (;;)
498 {
499 tp->line_number = line_number;
500 c = phase2_getc ();
501
502 switch (c)
503 {
504 case EOF:
505 tp->type = token_type_eof;
506 return;
507
508 case '\n':
509 if (last_non_comment_line > last_comment_line)
510 savable_comment_reset ();
511 /* Intentionally not breaking. */
512 case ' ':
513 case '\t':
514 case '\f':
515 continue;
516
517 case '+':
518 case '-':
519 case '*':
520 case '/':
521 case '^':
522 case '%':
523 case '#':
524 tp->type = token_type_operator1;
525 return;
526 case '<':
527 case '>':
528 case '=':
529 c2 = phase1_getc ();
530 if (c2 != '=')
531 phase1_ungetc (c2);
532 tp->type = token_type_operator2;
533 return;
534 case '~':
535 c2 = phase1_getc ();
536 if (c2 == '=')
537 {
538 tp->type = token_type_operator2;
539 return;
540 }
541 else
542 phase1_ungetc (c2);
543 continue;
544 case '(':
545 tp->type = token_type_lparen;
546 return;
547 case ')':
548 tp->type = token_type_rparen;
549 return;
550 case ',':
551 tp->type = token_type_comma;
552 return;
553
554 case ';':
555 tp->type = token_type_other;
556 return;
557
558 /* There are three operators beginning with a dot. '.',
559 '..' and '...'. The most useful for us is the string
560 concatenation operator ('..'). */
561 case '.':
562 c = phase1_getc ();
563 if (c == '.')
564 {
565 c = phase1_getc ();
566 if (c == '.')
567 {
568 tp->type = token_type_other;
569 return;
570 }
571 else
572 {
573 phase1_ungetc (c);
574 tp->type = token_type_doubledot;
575 return;
576 }
577 }
578 else if (c >= '0' && c <= '9')
579 {
580 /* It's a number. We aren't interested in the actual
581 numeric value, so ignore the dot and let next
582 iteration eat the number. */
583 phase1_ungetc (c);
584 continue;
585 }
586 else
587 {
588 phase1_ungetc (c);
589 tp->type = token_type_dot;
590 return;
591 }
592
593 case '"':
594 case '\'':
595 c_start = c;
596 string_start ();
597
598 for (;;)
599 {
600 /* We need unprocessed characters from phase 1. */
601 c = phase1_getc ();
602
603 if (c == EOF || c == c_start || c == '\n')
604 {
605 /* End of string. */
606 string_end ();
607 tp->string = xstrdup (string_buf);
608 tp->comment = add_reference (savable_comment);
609 tp->type = token_type_string;
610 return;
611 }
612
613 /* We got '\', this is probably an escape sequence. */
614 if (c == '\\')
615 {
616 c = phase1_getc ();
617 switch (c)
618 {
619 case 'a':
620 string_add ('\a');
621 break;
622 case 'b':
623 string_add ('\b');
624 break;
625 case 'f':
626 string_add ('\f');
627 break;
628 case 'n':
629 string_add ('\n');
630 break;
631 case 'r':
632 string_add ('\r');
633 break;
634 case 't':
635 string_add ('\t');
636 break;
637 case 'v':
638 string_add ('\v');
639 break;
640 case 'x':
641 {
642 int num = 0;
643 int i = 0;
644
645 for (i = 0; i < 2; i++)
646 {
647 c = phase1_getc ();
648 if (c >= '0' && c <= '9')
649 num += c - '0';
650 else if (c >= 'a' && c <= 'f')
651 num += c - 'a' + 10;
652 else if (c >= 'A' && c <= 'F')
653 num += c - 'A' + 10;
654 else
655 {
656 phase1_ungetc (c);
657 break;
658 }
659
660 if (i == 0)
661 num *= 16;
662 }
663
664 if (i == 2)
665 string_add (num);
666 }
667
668 break;
669 case 'z':
670 /* Ignore the following whitespace. */
671 do
672 {
673 c = phase1_getc ();
674 }
675 while (c == ' ' || c == '\n' || c == '\t' || c == '\r'
676 || c == '\f' || c == '\v');
677
678 phase1_ungetc (c);
679
680 break;
681 default:
682 /* Check if it's a '\ddd' sequence. */
683 if (c >= '0' && c <= '9')
684 {
685 int num = 0;
686 int i = 0;
687
688 while (c >= '0' && c <= '9' && i < 3)
689 {
690 num *= 10;
691 num += (c - '0');
692 c = phase1_getc ();
693 i++;
694 }
695
696 /* The last read character is either a
697 non-number or another number after our
698 '\ddd' sequence. We need to ungetc it. */
699 phase1_ungetc (c);
700
701 /* The sequence number is too big, this
702 causes a lexical error. Ignore it. */
703 if (num < 256)
704 string_add (num);
705 }
706 else
707 string_add (c);
708 }
709 }
710 else
711 string_add (c);
712 }
713 break;
714
715 case '[':
716 c = phase1_getc ();
717
718 /* Count the number of equal signs. */
719 int esigns = 0;
720 while (c == '=')
721 {
722 esigns++;
723 c = phase1_getc ();
724 }
725
726 if (c != '[')
727 {
728 /* We did not find what we were looking for, ungetc it. */
729 phase1_ungetc (c);
730 if (esigns == 0)
731 {
732 /* Our current character isn't '[' and we got 0 equal
733 signs, so the first '[' must have been a left
734 bracket. */
735 tp->type = token_type_lbracket;
736 return;
737 }
738 else
739 /* Lexical error, ignore it. */
740 continue;
741 }
742
743 /* Found an opening long bracket. */
744 string_start ();
745
746 /* See if it is immediately followed by a newline. */
747 c = phase1_getc ();
748 if (c != '\n')
749 phase1_ungetc (c);
750
751 for (;;)
752 {
753 c = phase1_getc ();
754
755 if (c == EOF)
756 {
757 string_end ();
758 tp->string = xstrdup (string_buf);
759 tp->comment = add_reference (savable_comment);
760 tp->type = token_type_string;
761 return;
762 }
763 if (c == ']')
764 {
765 c = phase1_getc ();
766
767 /* Count the number of equal signs. */
768 int esigns2 = 0;
769 while (c == '=')
770 {
771 esigns2++;
772 c = phase1_getc ();
773 }
774
775 if (c == ']' && esigns == esigns2)
776 {
777 /* We got ']==...==]', where the number of equal
778 signs matches the number of equal signs in
779 the opening bracket. */
780 string_end ();
781 tp->string = xstrdup (string_buf);
782 tp->comment = add_reference (savable_comment);
783 tp->type = token_type_string;
784 return;
785 }
786 else
787 {
788 /* Otherwise we got either ']==' garbage or
789 ']==...==]' with a different number of equal
790 signs.
791
792 Add ']' and equal signs to the string, and
793 ungetc the current character, because the
794 second ']' might be a part of another closing
795 long bracket, e.g. '==]===]'. */
796 phase1_ungetc (c);
797
798 string_add (']');
799 while (esigns2--)
800 string_add ('=');
801 }
802 }
803 else
804 string_add (c);
805 }
806 break;
807
808 case ']':
809 tp->type = token_type_rbracket;
810 return;
811
812 default:
813 if (c >= '0' && c <= '9')
814 {
815 while (c >= '0' && c <= '9')
816 c = phase1_getc ();
817
818 if (c == '.')
819 {
820 c = phase1_getc ();
821 while (c >= '0' && c <= '9')
822 c = phase1_getc ();
823 }
824
825 if (c == 'e' || c == 'E')
826 {
827 if (c == '+' || c == '-')
828 c = phase1_getc ();
829 while (c >= '0' && c <= '9')
830 c = phase1_getc ();
831 }
832
833 phase1_ungetc (c);
834
835 tp->type = token_type_number;
836 return;
837 }
838 else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
839 || c == '_')
840 {
841 string_start ();
842 while ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
843 || c == '_' || (c >= '0' && c <= '9'))
844 {
845 string_add (c);
846 c = phase1_getc ();
847 }
848 string_end ();
849 phase1_ungetc (c);
850
851 if (strcmp (string_buf, "not") == 0)
852 tp->type = token_type_operator1;
853 else if (strcmp (string_buf, "and") == 0)
854 tp->type = token_type_operator2;
855 else if (strcmp (string_buf, "or") == 0)
856 tp->type = token_type_operator2;
857 else
858 {
859 tp->string = xstrdup (string_buf);
860 tp->type = token_type_symbol;
861 }
862 return;
863 }
864 else
865 tp->type = token_type_other;
866 }
867 }
868 }
869
870 /* String and symbol concatenation. */
871
872 static token_type_ty phase4_last;
873
874 /* We need 3 pushback tokens for string and symbol concatenation. */
875 static int phase4_pushback_length;
876 static token_ty phase4_pushback[3];
877
878 static void
phase4_unget(token_ty * tp)879 phase4_unget (token_ty *tp)
880 {
881 if (tp->type != token_type_eof)
882 {
883 if (phase4_pushback_length == SIZEOF (phase4_pushback))
884 abort ();
885 phase4_pushback[phase4_pushback_length++] = *tp;
886 }
887 }
888
889 static void
phase4_get(token_ty * tp)890 phase4_get (token_ty *tp)
891 {
892 if (phase4_pushback_length)
893 {
894 *tp = phase4_pushback[--phase4_pushback_length];
895 phase4_last = tp->type;
896 return;
897 }
898
899 phase3_get (tp);
900 if (tp->type == token_type_string
901 && !(phase4_last == token_type_operator1
902 || phase4_last == token_type_dot
903 || phase4_last == token_type_symbol
904 || phase4_last == token_type_doubledot
905 || phase4_last == token_type_rparen))
906 {
907 char *sum = tp->string;
908 size_t sum_len = strlen (sum);
909
910 for (;;)
911 {
912 token_ty token2;
913
914 phase3_get (&token2);
915 if (token2.type == token_type_doubledot)
916 {
917 token_ty token3;
918
919 phase3_get (&token3);
920 if (token3.type == token_type_string)
921 {
922 token_ty token_after;
923
924 phase3_get (&token_after);
925 if (token_after.type != token_type_operator1)
926 {
927 char *addend = token3.string;
928 size_t addend_len = strlen (addend);
929
930 sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
931 memcpy (sum + sum_len, addend, addend_len + 1);
932 sum_len += addend_len;
933
934 phase3_unget (&token_after);
935 free_token (&token3);
936 free_token (&token2);
937 continue;
938 }
939 phase3_unget (&token_after);
940 }
941 phase3_unget (&token3);
942 }
943 phase3_unget (&token2);
944 break;
945 }
946 tp->string = sum;
947 }
948 phase4_last = tp->type;
949 }
950
951 static void
phase5_get(token_ty * tp)952 phase5_get (token_ty *tp)
953 {
954 phase4_get (tp);
955
956 /* Combine symbol1 . ... . symbolN to a single strings, so that
957 we can recognize function calls like
958 gettext.gettext. The information present for
959 symbolI.....symbolN has precedence over the information for
960 symbolJ.....symbolN with J > I. */
961 if (tp->type == token_type_symbol)
962 {
963 char *sum = tp->string;
964 size_t sum_len = strlen (sum);
965
966 for (;;)
967 {
968 token_ty token2;
969
970 phase4_get (&token2);
971 if (token2.type == token_type_dot)
972 {
973 token_ty token3;
974
975 phase4_get (&token3);
976 if (token3.type == token_type_symbol)
977 {
978 char *addend = token3.string;
979 size_t addend_len = strlen (addend);
980
981 sum = (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
982 sum[sum_len] = '.';
983 memcpy (sum + sum_len + 1, addend, addend_len + 1);
984 sum_len += 1 + addend_len;
985
986 free_token (&token2);
987 free_token (&token3);
988 continue;
989 }
990 phase4_unget (&token3);
991 }
992 phase4_unget (&token2);
993 break;
994 }
995 tp->string = sum;
996 }
997 }
998
999 static void
x_lua_lex(token_ty * tok)1000 x_lua_lex (token_ty *tok)
1001 {
1002 phase5_get (tok);
1003 }
1004
1005
1006 /* ========================= Extracting strings. ========================== */
1007
1008
1009 /* Context lookup table. */
1010 static flag_context_list_table_ty *flag_context_list_table;
1011
1012
1013 /* The file is broken into tokens. Scan the token stream, looking for
1014 a keyword, followed by a left paren, followed by a string. When we
1015 see this sequence, we have something to remember. We assume we are
1016 looking at a valid Lua program, and leave the complaints about the
1017 grammar to the compiler.
1018
1019 Normal handling: Look for
1020 keyword ( ... msgid ... )
1021 keyword msgid
1022 Plural handling: Look for
1023 keyword ( ... msgid ... msgid_plural ... )
1024
1025 We use recursion because the arguments before msgid or between msgid
1026 and msgid_plural can contain subexpressions of the same form. */
1027
1028 /* Extract messages until the next balanced closing parenthesis or bracket.
1029 Extracted messages are added to MLP.
1030 DELIM can be either token_type_rparen or token_type_rbracket, or
1031 token_type_eof to accept both.
1032 Return true upon eof, false upon closing parenthesis or bracket. */
1033 static bool
extract_balanced(message_list_ty * mlp,token_type_ty delim,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1034 extract_balanced (message_list_ty *mlp, token_type_ty delim,
1035 flag_context_ty outer_context,
1036 flag_context_list_iterator_ty context_iter,
1037 struct arglist_parser *argparser)
1038 {
1039 /* Current argument number. */
1040 int arg = 1;
1041 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1042 int state;
1043 /* Parameters of the keyword just seen. Defined only in state 1. */
1044 const struct callshapes *next_shapes = NULL;
1045 /* Context iterator that will be used if the next token is a '('. */
1046 flag_context_list_iterator_ty next_context_iter =
1047 passthrough_context_list_iterator;
1048 /* Current context. */
1049 flag_context_ty inner_context =
1050 inherited_context (outer_context,
1051 flag_context_list_iterator_advance (&context_iter));
1052
1053 /* Start state is 0. */
1054 state = 0;
1055
1056 for (;;)
1057 {
1058 token_ty token;
1059
1060 x_lua_lex (&token);
1061
1062 switch (token.type)
1063 {
1064 case token_type_symbol:
1065 {
1066 void *keyword_value;
1067
1068 if (hash_find_entry (&keywords, token.string, strlen (token.string),
1069 &keyword_value)
1070 == 0)
1071 {
1072 next_shapes = (const struct callshapes *) keyword_value;
1073 state = 1;
1074 }
1075 else
1076 state = 0;
1077 }
1078 next_context_iter =
1079 flag_context_list_iterator (
1080 flag_context_list_table_lookup (
1081 flag_context_list_table,
1082 token.string, strlen (token.string)));
1083 free (token.string);
1084 continue;
1085
1086 case token_type_lparen:
1087 if (extract_balanced (mlp, token_type_rparen,
1088 inner_context, next_context_iter,
1089 arglist_parser_alloc (mlp,
1090 state ? next_shapes : NULL)))
1091 {
1092 arglist_parser_done (argparser, arg);
1093 return true;
1094 }
1095 next_context_iter = null_context_list_iterator;
1096 state = 0;
1097 break;
1098
1099 case token_type_rparen:
1100 if (delim == token_type_rparen || delim == token_type_eof)
1101 {
1102 arglist_parser_done (argparser, arg);
1103 return false;
1104 }
1105
1106 next_context_iter = null_context_list_iterator;
1107 state = 0;
1108 continue;
1109
1110 case token_type_lbracket:
1111 if (extract_balanced (mlp, token_type_rbracket,
1112 null_context, null_context_list_iterator,
1113 arglist_parser_alloc (mlp, NULL)))
1114 {
1115 arglist_parser_done (argparser, arg);
1116 return true;
1117 }
1118 next_context_iter = null_context_list_iterator;
1119 state = 0;
1120 break;
1121
1122 case token_type_rbracket:
1123 if (delim == token_type_rbracket || delim == token_type_eof)
1124 {
1125 arglist_parser_done (argparser, arg);
1126 return false;
1127 }
1128
1129 next_context_iter = null_context_list_iterator;
1130 state = 0;
1131 continue;
1132
1133 case token_type_comma:
1134 arg++;
1135 inner_context =
1136 inherited_context (outer_context,
1137 flag_context_list_iterator_advance (
1138 &context_iter));
1139 next_context_iter = passthrough_context_list_iterator;
1140 state = 0;
1141 continue;
1142
1143 case token_type_eof:
1144 arglist_parser_done (argparser, arg);
1145 return true;
1146
1147 case token_type_string:
1148 {
1149 lex_pos_ty pos;
1150 pos.file_name = logical_file_name;
1151 pos.line_number = token.line_number;
1152
1153 if (extract_all)
1154 remember_a_message (mlp, NULL, token.string, false, false,
1155 inner_context, &pos,
1156 NULL, token.comment, false);
1157 else
1158 {
1159 mixed_string_ty *ms =
1160 mixed_string_alloc_simple (token.string, lc_string,
1161 pos.file_name, pos.line_number);
1162 free (token.string);
1163 /* A string immediately after a symbol means a function call. */
1164 if (state)
1165 {
1166 struct arglist_parser *tmp_argparser;
1167 tmp_argparser = arglist_parser_alloc (mlp, next_shapes);
1168
1169 arglist_parser_remember (tmp_argparser, 1, ms,
1170 inner_context,
1171 pos.file_name, pos.line_number,
1172 token.comment, false);
1173 arglist_parser_done (tmp_argparser, 1);
1174 }
1175 else
1176 arglist_parser_remember (argparser, arg, ms,
1177 inner_context,
1178 pos.file_name, pos.line_number,
1179 token.comment, false);
1180 }
1181 }
1182 drop_reference (token.comment);
1183 next_context_iter = null_context_list_iterator;
1184 state = 0;
1185 continue;
1186
1187 case token_type_dot:
1188 case token_type_doubledot:
1189 case token_type_operator1:
1190 case token_type_operator2:
1191 case token_type_number:
1192 case token_type_other:
1193 next_context_iter = null_context_list_iterator;
1194 state = 0;
1195 continue;
1196
1197 default:
1198 abort ();
1199 }
1200 }
1201 }
1202
1203 void
extract_lua(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1204 extract_lua (FILE *f,
1205 const char *real_filename, const char *logical_filename,
1206 flag_context_list_table_ty *flag_table,
1207 msgdomain_list_ty *mdlp)
1208 {
1209 message_list_ty *mlp = mdlp->item[0]->messages;
1210
1211 fp = f;
1212 real_file_name = real_filename;
1213 logical_file_name = xstrdup (logical_filename);
1214 line_number = 1;
1215
1216 phase1_pushback_length = 0;
1217 first_character = true;
1218
1219 last_comment_line = -1;
1220 last_non_comment_line = -1;
1221
1222 phase3_pushback_length = 0;
1223
1224 phase4_last = token_type_eof;
1225 phase4_pushback_length = 0;
1226
1227 flag_context_list_table = flag_table;
1228
1229 init_keywords ();
1230
1231 /* Eat tokens until eof is seen. When extract_parenthesized returns
1232 due to an unbalanced closing parenthesis, just restart it. */
1233 while (!extract_balanced (mlp, token_type_eof,
1234 null_context, null_context_list_iterator,
1235 arglist_parser_alloc (mlp, NULL)))
1236 ;
1237
1238 fp = NULL;
1239 real_file_name = NULL;
1240 logical_file_name = NULL;
1241 line_number = 0;
1242 }
1243