1 /* xgettext JavaScript backend.
2    Copyright (C) 2002-2003, 2005-2009, 2013-2014, 2018-2020 Free Software Foundation, Inc.
3 
4    This file was written by Andreas Stricker <andy@knitter.ch>, 2010
5    It's based on x-python from Bruno Haible.
6 
7    This program is free software: you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11 
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
19 
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23 
24 /* Specification.  */
25 #include "x-javascript.h"
26 
27 #include <assert.h>
28 #include <errno.h>
29 #include <stdbool.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 
34 #include "message.h"
35 #include "rc-str-list.h"
36 #include "xgettext.h"
37 #include "xg-pos.h"
38 #include "xg-encoding.h"
39 #include "xg-mixed-string.h"
40 #include "xg-arglist-context.h"
41 #include "xg-arglist-callshape.h"
42 #include "xg-arglist-parser.h"
43 #include "xg-message.h"
44 #include "error.h"
45 #include "error-progname.h"
46 #include "progname.h"
47 #include "xerror.h"
48 #include "xvasprintf.h"
49 #include "xalloc.h"
50 #include "c-strstr.h"
51 #include "c-ctype.h"
52 #include "po-charset.h"
53 #include "unistr.h"
54 #include "gettext.h"
55 
/* Shorthand for looking up the translation of a user-visible message.  */
#define _(s) gettext(s)

/* Larger of A and B.  Each argument may be evaluated twice, so do not pass
   expressions with side effects.  */
#define max(a,b) ((a) > (b) ? (a) : (b))

/* Number of elements of the array A.  A must be a real array, not a
   pointer.  The argument is parenthesized before indexing so that
   expressions such as SIZEOF (*p) expand as intended.  */
#define SIZEOF(a) (sizeof(a) / sizeof((a)[0]))
61 
62 /* The JavaScript aka ECMA-Script syntax is defined in ECMA-262
63    specification:
64    <https://www.ecma-international.org/publications/standards/Ecma-262.htm>
65 
66    Regarding the XML element support:
67    The earlier standard E4X
68    <https://en.wikipedia.org/wiki/ECMAScript_for_XML>
69    <https://web.archive.org/web/20131104082608/http://www.ecma-international.org/publications/standards/Ecma-357.htm>
70    is no longer widely supported.
71    Instead, nowadays, JSX is widely used.
72    <https://facebook.github.io/jsx/>
73 */
74 
75 /* ====================== Keyword set customization.  ====================== */
76 
/* If true extract all strings.  Set by x_javascript_extract_all.  */
static bool extract_all = false;

/* Hash table of keyword call shapes.  Initialized on first use and filled
   by x_javascript_keyword.  */
static hash_table keywords;
/* True as long as the built-in keywords still have to be registered.
   Cleared by x_javascript_keyword (NULL) and by init_keywords once it has
   registered them.  */
static bool default_keywords = true;
82 
83 
84 void
x_javascript_extract_all()85 x_javascript_extract_all ()
86 {
87   extract_all = true;
88 }
89 
90 
91 void
x_javascript_keyword(const char * name)92 x_javascript_keyword (const char *name)
93 {
94   if (name == NULL)
95     default_keywords = false;
96   else
97     {
98       const char *end;
99       struct callshape shape;
100       const char *colon;
101 
102       if (keywords.table == NULL)
103         hash_init (&keywords, 100);
104 
105       split_keywordspec (name, &end, &shape);
106 
107       /* The characters between name and end should form a valid C identifier.
108          A colon means an invalid parse in split_keywordspec().  */
109       colon = strchr (name, ':');
110       if (colon == NULL || colon >= end)
111         insert_keyword_callshape (&keywords, name, end - name, &shape);
112     }
113 }
114 
115 /* Finish initializing the keywords hash table.
116    Called after argument processing, before each file is processed.  */
117 static void
init_keywords()118 init_keywords ()
119 {
120   if (default_keywords)
121     {
122       /* When adding new keywords here, also update the documentation in
123          xgettext.texi!  */
124       x_javascript_keyword ("gettext");
125       x_javascript_keyword ("dgettext:2");
126       x_javascript_keyword ("dcgettext:2");
127       x_javascript_keyword ("ngettext:1,2");
128       x_javascript_keyword ("dngettext:2,3");
129       x_javascript_keyword ("pgettext:1c,2");
130       x_javascript_keyword ("dpgettext:2c,3");
131       x_javascript_keyword ("_");
132       default_keywords = false;
133     }
134 }
135 
/* Record the "pass-javascript-format" flag for the message arguments of
   the built-in keywords, through xgettext_record_flag.  */
void
init_flag_table_javascript ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-javascript-format",
      "dgettext:2:pass-javascript-format",
      "dcgettext:2:pass-javascript-format",
      "ngettext:1:pass-javascript-format",
      "ngettext:2:pass-javascript-format",
      "dngettext:2:pass-javascript-format",
      "dngettext:3:pass-javascript-format",
      "pgettext:2:pass-javascript-format",
      "dpgettext:3:pass-javascript-format",
      "_:1:pass-javascript-format"
    };
  size_t i;

  for (i = 0; i < sizeof flag_specs / sizeof flag_specs[0]; i++)
    xgettext_record_flag (flag_specs[i]);
}
150 
151 
152 /* ======================== Reading of characters.  ======================== */
153 
/* The input file stream.  Read by phase1_getc.  */
static FILE *fp;


/* 1. line_number handling.  */

/* Maximum used, roughly a safer MB_LEN_MAX.  */
#define MAX_PHASE1_PUSHBACK 16
/* Stack of pushed-back bytes: phase1_ungetc pushes at the end, phase1_getc
   pops from the end.  */
static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
/* Number of bytes currently held in phase1_pushback.  */
static int phase1_pushback_length;
164 
165 /* Read the next single byte from the input file.  */
166 static int
phase1_getc()167 phase1_getc ()
168 {
169   int c;
170 
171   if (phase1_pushback_length)
172     c = phase1_pushback[--phase1_pushback_length];
173   else
174     {
175       c = getc (fp);
176 
177       if (c == EOF)
178         {
179           if (ferror (fp))
180             error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
181                    real_file_name);
182           return EOF;
183         }
184     }
185 
186   if (c == '\n')
187     ++line_number;
188 
189   return c;
190 }
191 
192 /* Supports MAX_PHASE1_PUSHBACK characters of pushback.  */
193 static void
phase1_ungetc(int c)194 phase1_ungetc (int c)
195 {
196   if (c != EOF)
197     {
198       if (c == '\n')
199         --line_number;
200 
201       if (phase1_pushback_length == SIZEOF (phase1_pushback))
202         abort ();
203       phase1_pushback[phase1_pushback_length++] = c;
204     }
205 }
206 
207 
/* Phase 2: Conversion to Unicode.
   For now, we expect JavaScript files to be encoded as UTF-8.  */

/* End-of-file indicator for functions returning an UCS-4 character.  */
#define UEOF -1

/* Lexical context of the text being read.  It is passed to
   non_ascii_error_message by phase2_getc and reset to lc_outside by
   comment_line_end.  */
static lexical_context_ty lexical_context;

/* Maximum used, length of "<![CDATA[" tag minus one.  */
/* Stack of pushed-back UCS-4 characters: phase2_ungetc pushes at the end,
   phase2_getc pops from the end.  */
static int phase2_pushback[8];
/* Number of characters currently held in phase2_pushback.  */
static int phase2_pushback_length;
219 
220 /* Read the next Unicode UCS-4 character from the input file.  */
221 static int
phase2_getc()222 phase2_getc ()
223 {
224   if (phase2_pushback_length)
225     return phase2_pushback[--phase2_pushback_length];
226 
227   if (xgettext_current_source_encoding == po_charset_ascii)
228     {
229       int c = phase1_getc ();
230       if (c == EOF)
231         return UEOF;
232       if (!c_isascii (c))
233         {
234           multiline_error (xstrdup (""),
235                            xasprintf ("%s\n%s\n",
236                                       non_ascii_error_message (lexical_context,
237                                                                real_file_name,
238                                                                line_number),
239                                       _("Please specify the source encoding through --from-code\n")));
240           exit (EXIT_FAILURE);
241         }
242       return c;
243     }
244   else if (xgettext_current_source_encoding != po_charset_utf8)
245     {
246 #if HAVE_ICONV
247       /* Use iconv on an increasing number of bytes.  Read only as many bytes
248          through phase1_getc as needed.  This is needed to give reasonable
249          interactive behaviour when fp is connected to an interactive tty.  */
250       unsigned char buf[MAX_PHASE1_PUSHBACK];
251       size_t bufcount;
252       int c = phase1_getc ();
253       if (c == EOF)
254         return UEOF;
255       buf[0] = (unsigned char) c;
256       bufcount = 1;
257 
258       for (;;)
259         {
260           unsigned char scratchbuf[6];
261           const char *inptr = (const char *) &buf[0];
262           size_t insize = bufcount;
263           char *outptr = (char *) &scratchbuf[0];
264           size_t outsize = sizeof (scratchbuf);
265 
266           size_t res = iconv (xgettext_current_source_iconv,
267                               (ICONV_CONST char **) &inptr, &insize,
268                               &outptr, &outsize);
269           /* We expect that a character has been produced if and only if
270              some input bytes have been consumed.  */
271           if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
272             abort ();
273           if (outsize == sizeof (scratchbuf))
274             {
275               /* No character has been produced.  Must be an error.  */
276               if (res != (size_t)(-1))
277                 abort ();
278 
279               if (errno == EILSEQ)
280                 {
281                   /* An invalid multibyte sequence was encountered.  */
282                   multiline_error (xstrdup (""),
283                                    xasprintf (_("\
284 %s:%d: Invalid multibyte sequence.\n\
285 Please specify the correct source encoding through --from-code\n"),
286                                    real_file_name, line_number));
287                   exit (EXIT_FAILURE);
288                 }
289               else if (errno == EINVAL)
290                 {
291                   /* An incomplete multibyte character.  */
292                   int c;
293 
294                   if (bufcount == MAX_PHASE1_PUSHBACK)
295                     {
296                       /* An overlong incomplete multibyte sequence was
297                          encountered.  */
298                       multiline_error (xstrdup (""),
299                                        xasprintf (_("\
300 %s:%d: Long incomplete multibyte sequence.\n\
301 Please specify the correct source encoding through --from-code\n"),
302                                        real_file_name, line_number));
303                       exit (EXIT_FAILURE);
304                     }
305 
306                   /* Read one more byte and retry iconv.  */
307                   c = phase1_getc ();
308                   if (c == EOF)
309                     {
310                       multiline_error (xstrdup (""),
311                                        xasprintf (_("\
312 %s:%d: Incomplete multibyte sequence at end of file.\n\
313 Please specify the correct source encoding through --from-code\n"),
314                                        real_file_name, line_number));
315                       exit (EXIT_FAILURE);
316                     }
317                   if (c == '\n')
318                     {
319                       multiline_error (xstrdup (""),
320                                        xasprintf (_("\
321 %s:%d: Incomplete multibyte sequence at end of line.\n\
322 Please specify the correct source encoding through --from-code\n"),
323                                        real_file_name, line_number - 1));
324                       exit (EXIT_FAILURE);
325                     }
326                   buf[bufcount++] = (unsigned char) c;
327                 }
328               else
329                 error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
330                        real_file_name, line_number);
331             }
332           else
333             {
334               size_t outbytes = sizeof (scratchbuf) - outsize;
335               size_t bytes = bufcount - insize;
336               ucs4_t uc;
337 
338               /* We expect that one character has been produced.  */
339               if (bytes == 0)
340                 abort ();
341               if (outbytes == 0)
342                 abort ();
343               /* Push back the unused bytes.  */
344               while (insize > 0)
345                 phase1_ungetc (buf[--insize]);
346               /* Convert the character from UTF-8 to UCS-4.  */
347               if (u8_mbtoucr (&uc, scratchbuf, outbytes) < (int) outbytes)
348                 {
349                   /* scratchbuf contains an out-of-range Unicode character
350                      (> 0x10ffff).  */
351                   multiline_error (xstrdup (""),
352                                    xasprintf (_("\
353 %s:%d: Invalid multibyte sequence.\n\
354 Please specify the source encoding through --from-code\n"),
355                                    real_file_name, line_number));
356                   exit (EXIT_FAILURE);
357                 }
358               return uc;
359             }
360         }
361 #else
362       /* If we don't have iconv(), the only supported values for
363          xgettext_global_source_encoding and thus also for
364          xgettext_current_source_encoding are ASCII and UTF-8.  */
365       abort ();
366 #endif
367     }
368   else
369     {
370       /* Read an UTF-8 encoded character.  */
371       unsigned char buf[6];
372       unsigned int count;
373       int c;
374       ucs4_t uc;
375 
376       c = phase1_getc ();
377       if (c == EOF)
378         return UEOF;
379       buf[0] = c;
380       count = 1;
381 
382       if (buf[0] >= 0xc0)
383         {
384           c = phase1_getc ();
385           if (c == EOF)
386             return UEOF;
387           buf[1] = c;
388           count = 2;
389         }
390 
391       if (buf[0] >= 0xe0
392           && ((buf[1] ^ 0x80) < 0x40))
393         {
394           c = phase1_getc ();
395           if (c == EOF)
396             return UEOF;
397           buf[2] = c;
398           count = 3;
399         }
400 
401       if (buf[0] >= 0xf0
402           && ((buf[1] ^ 0x80) < 0x40)
403           && ((buf[2] ^ 0x80) < 0x40))
404         {
405           c = phase1_getc ();
406           if (c == EOF)
407             return UEOF;
408           buf[3] = c;
409           count = 4;
410         }
411 
412       if (buf[0] >= 0xf8
413           && ((buf[1] ^ 0x80) < 0x40)
414           && ((buf[2] ^ 0x80) < 0x40)
415           && ((buf[3] ^ 0x80) < 0x40))
416         {
417           c = phase1_getc ();
418           if (c == EOF)
419             return UEOF;
420           buf[4] = c;
421           count = 5;
422         }
423 
424       if (buf[0] >= 0xfc
425           && ((buf[1] ^ 0x80) < 0x40)
426           && ((buf[2] ^ 0x80) < 0x40)
427           && ((buf[3] ^ 0x80) < 0x40)
428           && ((buf[4] ^ 0x80) < 0x40))
429         {
430           c = phase1_getc ();
431           if (c == EOF)
432             return UEOF;
433           buf[5] = c;
434           count = 6;
435         }
436 
437       u8_mbtouc (&uc, buf, count);
438       return uc;
439     }
440 }
441 
442 /* Supports max (9, UNINAME_MAX + 3) pushback characters.  */
443 static void
phase2_ungetc(int c)444 phase2_ungetc (int c)
445 {
446   if (c != UEOF)
447     {
448       if (phase2_pushback_length == SIZEOF (phase2_pushback))
449         abort ();
450       phase2_pushback[phase2_pushback_length++] = c;
451     }
452 }
453 
454 
455 /* ========================= Accumulating strings.  ======================== */
456 
457 /* See xg-mixed-string.h for the API.  */
458 
459 
460 /* ======================== Accumulating comments.  ======================== */
461 
462 
/* Accumulating a single comment line.  */

/* Buffer holding the comment line being accumulated.  It is initialized by
   comment_start, filled by comment_add and consumed by comment_line_end.  */
static struct mixed_string_buffer comment_buffer;
466 
467 static inline void
comment_start()468 comment_start ()
469 {
470   mixed_string_buffer_init (&comment_buffer, lc_comment,
471                             logical_file_name, line_number);
472 }
473 
474 static inline bool
comment_at_start()475 comment_at_start ()
476 {
477   return mixed_string_buffer_is_empty (&comment_buffer);
478 }
479 
480 static inline void
comment_add(int c)481 comment_add (int c)
482 {
483   mixed_string_buffer_append_unicode (&comment_buffer, c);
484 }
485 
486 static inline const char *
comment_line_end(size_t chars_to_remove)487 comment_line_end (size_t chars_to_remove)
488 {
489   char *buffer =
490     mixed_string_contents_free1 (mixed_string_buffer_result (&comment_buffer));
491   size_t buflen = strlen (buffer) - chars_to_remove;
492 
493   while (buflen >= 1
494          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
495     --buflen;
496   buffer[buflen] = '\0';
497   savable_comment_add (buffer);
498   lexical_context = lc_outside;
499   return buffer;
500 }
501 
502 
/* These are for tracking whether comments count as immediately before
   keyword.  */
/* Line on which the most recent comment started; set by phase3_getc.  */
static int last_comment_line;
/* NOTE(review): presumably the line of the most recent non-comment token;
   it is not assigned in this part of the file -- confirm against the
   tokenizer.  */
static int last_non_comment_line;
507 
508 
509 /* ======================== Recognizing comments.  ======================== */
510 
511 
/* Canonicalized encoding name for the current input file.
   NOTE(review): not referenced in this part of the file; phase2_getc uses
   xgettext_current_source_encoding instead -- confirm whether this is still
   needed.  */
static const char *xgettext_current_file_source_encoding;

#if HAVE_ICONV
/* Converter from xgettext_current_file_source_encoding to UTF-8 (except from
   ASCII or UTF-8, when this conversion is a no-op).  */
static iconv_t xgettext_current_file_source_iconv;
#endif

/* Tracking whether the current line is a continuation line or contains a
   non-blank character.  Maintained by phase3_getc: set after a
   backslash-newline or a character other than space, tab and form feed;
   cleared at a newline and after a comment.  */
static bool continuation_or_nonblank_line;
524 
525 
/* Phase 3: Outside strings, replace backslash-newline with nothing and a
   comment with nothing.
   Comment text is collected line by line and handed to savable_comment_add
   through comment_line_end.  A "//" comment is consumed up to its
   terminating newline (or UEOF), and that terminator is what gets
   returned.  A slash-star comment is consumed entirely and reading
   continues after it.  Returns the next remaining character, or UEOF.  */

static int
phase3_getc ()
{
  int c;

  for (;;)
    {
      c = phase2_getc ();
      if (c == '\\')
        {
          c = phase2_getc ();
          if (c != '\n')
            {
              phase2_ungetc (c);
              /* This shouldn't happen usually, because "A backslash is
                 illegal elsewhere on a line outside a string literal."  */
              return '\\';
            }
          /* Eat backslash-newline.  */
          continuation_or_nonblank_line = true;
        }
      else if (c == '/')
        {
          c = phase2_getc ();
          if (c == '/')
            {
              /* C++ style comment.  */
              last_comment_line = line_number;
              comment_start ();
              for (;;)
                {
                  c = phase2_getc ();
                  if (c == UEOF || c == '\n')
                    {
                      /* The terminator was not added to the buffer, so
                         there is nothing to remove.  */
                      comment_line_end (0);
                      break;
                    }
                  /* We skip all leading white space, but not EOLs.  */
                  if (!(comment_at_start () && (c == ' ' || c == '\t')))
                    comment_add (c);
                }
              continuation_or_nonblank_line = false;
              /* Here c is the newline or UEOF that ended the comment.  */
              return c;
            }
          else if (c == '*')
            {
              /* C style comment.  */
              bool last_was_star = false;
              last_comment_line = line_number;
              comment_start ();
              for (;;)
                {
                  c = phase2_getc ();
                  /* An unterminated comment ends at end of input; the
                     partial last line is not passed to comment_line_end.  */
                  if (c == UEOF)
                    break;
                  /* We skip all leading white space, but not EOLs.  */
                  if (!(comment_at_start () && (c == ' ' || c == '\t')))
                    comment_add (c);
                  /* In this switch, 'continue' proceeds with the next
                     comment character, whereas 'break' leaves only the
                     switch and thereby reaches the 'break' below that ends
                     the comment loop.  */
                  switch (c)
                    {
                    case '\n':
                      /* Remove the newline that was just added.  */
                      comment_line_end (1);
                      comment_start ();
                      last_was_star = false;
                      continue;

                    case '*':
                      last_was_star = true;
                      continue;
                    case '/':
                      if (last_was_star)
                        {
                          /* Remove the star and slash that were just
                             added.  */
                          comment_line_end (2);
                          break;
                        }
                      /* FALLTHROUGH */

                    default:
                      last_was_star = false;
                      continue;
                    }
                  break;
                }
              continuation_or_nonblank_line = false;
            }
          else
            {
              /* A lone slash: not a comment.  */
              phase2_ungetc (c);
              return '/';
            }
        }
      else
        {
          if (c == '\n')
            continuation_or_nonblank_line = false;
          else if (!(c == ' ' || c == '\t' || c == '\f'))
            continuation_or_nonblank_line = true;
          return c;
        }
    }
}
630 
/* Push back a character obtained from phase3_getc.
   Supports only one pushback character.  The character is simply handed to
   phase2_ungetc.  */
static void
phase3_ungetc (int c)
{
  phase2_ungetc (c);
}
637 
638 
639 /* ========================= Accumulating strings.  ======================== */
640 
/* Return value of phase7_getuc when EOF is reached.  */
#define P7_EOF (-1)
/* Return value of phase7_getuc when the closing quote character is reached,
   or when a newline ends an unterminated '...' or "..." string.  */
#define P7_STRING_END (-2)
/* Return value of phase7_getuc when a template literal is interrupted by
   the start of an embedded expression.  */
#define P7_TEMPLATE_START_OF_EXPRESSION (-3) /* ${ */

/* Convert an UTF-16 or UTF-32 code point to a return value that can be
   distinguished from a single-byte return value.  */
#define UNICODE(code) (0x100 + (code))

/* Test a return value of phase7_getuc whether it designates an UTF-16 or
   UTF-32 code point.  */
#define IS_UNICODE(p7_result) ((p7_result) >= 0x100)

/* Extract the UTF-16 or UTF-32 code of a return value that satisfies
   IS_UNICODE.  */
#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
657 
658 
659 /* ========================== Reading of tokens.  ========================== */
660 
661 
/* The kinds of tokens distinguished by the tokenizer.  is_after_expression
   classifies every value of this enumeration, so it must be extended
   whenever a value is added here.  */
enum token_type_ty
{
  token_type_eof,
  token_type_start,
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_lbrace,            /* { */
  token_type_rbrace,            /* } */
  token_type_comma,             /* , */
  token_type_dot,               /* . */
  token_type_lbracket,          /* [ */
  token_type_rbracket,          /* ] */
  token_type_plus,              /* + */
  token_type_regexp,            /* /.../ */
  token_type_operator,          /* - * / % . < > = ~ ! | & ? : ^ */
  token_type_equal,             /* = */
  token_type_string,            /* "abc", 'abc' */
  token_type_template,          /* `abc` */
  token_type_ltemplate,         /* left part of template: `abc${ */
  token_type_mtemplate,         /* middle part of template: }abc${ */
  token_type_rtemplate,         /* right part of template: }abc` */
  token_type_xml_tag,           /* < or </ */
  token_type_xml_element_start, /* last token of < ... > */
  token_type_xml_element_end,   /* last token of </ ... > */
  token_type_xml_empty_element, /* last token of < ... /> */
  token_type_keyword,           /* return, else */
  token_type_symbol,            /* symbol, number */
  token_type_other              /* misc. operator */
};
typedef enum token_type_ty token_type_ty;
692 
/* A token.  Which of the pointer members are meaningful depends on TYPE, as
   noted per member; free_token releases exactly those.  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string;                  /* for token_type_symbol, token_type_keyword */
  mixed_string_ty *mixed_string;        /* for token_type_string, token_type_template */
  refcounted_string_list_ty *comment;   /* for token_type_string, token_type_template */
  int line_number;
};
702 
703 
704 /* Free the memory pointed to by a 'struct token_ty'.  */
705 static inline void
free_token(token_ty * tp)706 free_token (token_ty *tp)
707 {
708   if (tp->type == token_type_symbol || tp->type == token_type_keyword)
709     free (tp->string);
710   if (tp->type == token_type_string || tp->type == token_type_template)
711     {
712       mixed_string_free (tp->mixed_string);
713       drop_reference (tp->comment);
714     }
715 }
716 
717 
718 /* JavaScript provides strings with either double or single quotes:
719      "abc" or 'abc' or `abc`
720    Both may contain special sequences after a backslash:
721      \', \", \\, \b, \f, \n, \r, \t, \v
722    Special characters can be entered using hexadecimal escape
723    sequences or deprecated octal escape sequences:
724      \xXX, \OOO
725    Any unicode point can be entered using Unicode escape sequences:
726      \uNNNN
727    If a sequence after a backslash is not a legitimate character
728    escape sequence, the character value is the sequence itself without
729    a backslash.  For example, \xxx is treated as xxx.  */
730 
731 static int
phase7_getuc(int quote_char)732 phase7_getuc (int quote_char)
733 {
734   int c;
735 
736   for (;;)
737     {
738       /* Use phase 2, because phase 3 elides comments.  */
739       c = phase2_getc ();
740 
741       if (c == UEOF)
742         return P7_EOF;
743 
744       if (c == quote_char)
745         return P7_STRING_END;
746 
747       if (c == '$' && quote_char == '`')
748         {
749           int c1 = phase2_getc ();
750 
751           if (c1 == '{')
752             return P7_TEMPLATE_START_OF_EXPRESSION;
753           phase2_ungetc (c1);
754         }
755 
756       if (c == '\n')
757         {
758           if (quote_char == '`')
759             return UNICODE ('\n');
760           else
761             {
762               phase2_ungetc (c);
763               error_with_progname = false;
764               error (0, 0, _("%s:%d: warning: unterminated string"),
765                      logical_file_name, line_number);
766               error_with_progname = true;
767               return P7_STRING_END;
768             }
769         }
770 
771       if (c == '\r' && quote_char == '`')
772         {
773           /* Line terminators inside template literals are normalized to \n,
774              says <http://exploringjs.com/es6/ch_template-literals.html>.  */
775           int c1 = phase2_getc ();
776 
777           if (c1 == '\n')
778             return UNICODE ('\n');
779           phase2_ungetc (c1);
780         }
781 
782       if (c != '\\')
783         return UNICODE (c);
784 
785       /* Dispatch according to the character following the backslash.  */
786       c = phase2_getc ();
787       if (c == UEOF)
788         return P7_EOF;
789 
790       switch (c)
791         {
792         case '\n':
793           continue;
794         case 'b':
795           return UNICODE ('\b');
796         case 'f':
797           return UNICODE ('\f');
798         case 'n':
799           return UNICODE ('\n');
800         case 'r':
801           return UNICODE ('\r');
802         case 't':
803           return UNICODE ('\t');
804         case 'v':
805           return UNICODE ('\v');
806         case '0': case '1': case '2': case '3': case '4':
807         case '5': case '6': case '7':
808           {
809             int n = c - '0';
810 
811             c = phase2_getc ();
812             if (c != UEOF)
813               {
814                 if (c >= '0' && c <= '7')
815                   {
816                     n = (n << 3) + (c - '0');
817                     c = phase2_getc ();
818                     if (c != UEOF)
819                       {
820                         if (c >= '0' && c <= '7')
821                           n = (n << 3) + (c - '0');
822                         else
823                           phase2_ungetc (c);
824                       }
825                   }
826                 else
827                   phase2_ungetc (c);
828               }
829             return UNICODE (n);
830           }
831         case 'x':
832           {
833             int c1 = phase2_getc ();
834             int n1;
835 
836             if (c1 >= '0' && c1 <= '9')
837               n1 = c1 - '0';
838             else if (c1 >= 'A' && c1 <= 'F')
839               n1 = c1 - 'A' + 10;
840             else if (c1 >= 'a' && c1 <= 'f')
841               n1 = c1 - 'a' + 10;
842             else
843               n1 = -1;
844 
845             if (n1 >= 0)
846               {
847                 int c2 = phase2_getc ();
848                 int n2;
849 
850                 if (c2 >= '0' && c2 <= '9')
851                   n2 = c2 - '0';
852                 else if (c2 >= 'A' && c2 <= 'F')
853                   n2 = c2 - 'A' + 10;
854                 else if (c2 >= 'a' && c2 <= 'f')
855                   n2 = c2 - 'a' + 10;
856                 else
857                   n2 = -1;
858 
859                 if (n2 >= 0)
860                   {
861                     int n = (n1 << 4) + n2;
862                     return UNICODE (n);
863                   }
864 
865                 phase2_ungetc (c2);
866               }
867             phase2_ungetc (c1);
868             return UNICODE (c);
869           }
870         case 'u':
871           {
872             unsigned char buf[4];
873             unsigned int n = 0;
874             int i;
875 
876             for (i = 0; i < 4; i++)
877               {
878                 int c1 = phase2_getc ();
879 
880                 if (c1 >= '0' && c1 <= '9')
881                   n = (n << 4) + (c1 - '0');
882                 else if (c1 >= 'A' && c1 <= 'F')
883                   n = (n << 4) + (c1 - 'A' + 10);
884                 else if (c1 >= 'a' && c1 <= 'f')
885                   n = (n << 4) + (c1 - 'a' + 10);
886                 else
887                   {
888                     phase2_ungetc (c1);
889                     while (--i >= 0)
890                       phase2_ungetc (buf[i]);
891                     return UNICODE (c);
892                   }
893 
894                 buf[i] = c1;
895               }
896             return UNICODE (n);
897           }
898         default:
899           return UNICODE (c);
900         }
901     }
902 }
903 
904 
/* Combine characters into tokens.  Discard whitespace except newlines at
   the end of logical lines.  */

/* Pushed-back tokens, with room for two, and the number currently held.  */
static token_ty phase5_pushback[2];
static int phase5_pushback_length;

/* Type of the previously read token.  is_after_expression consults it to
   tell operators apart from XML markup and regular expression literals.  */
static token_type_ty last_token_type;
912 
913 /* Returns true if last_token_type indicates that we have just seen the
914    possibly last token of an expression.  In this case, '<', '>', and '/'
915    need to be interpreted as operators, rather than as XML markup or start
916    of a regular expression.  */
917 static bool
is_after_expression(void)918 is_after_expression (void)
919 {
920   switch (last_token_type)
921     {
922     case token_type_rparen:
923     case token_type_rbrace:
924     case token_type_rbracket:
925     case token_type_regexp:
926     case token_type_string:
927     case token_type_template:
928     case token_type_rtemplate:
929     case token_type_xml_element_end:
930     case token_type_xml_empty_element:
931     case token_type_symbol:
932       return true;
933 
934     case token_type_eof:
935     case token_type_start:
936     case token_type_lparen:
937     case token_type_lbrace:
938     case token_type_comma:
939     case token_type_dot:
940     case token_type_lbracket:
941     case token_type_plus:
942     case token_type_operator:
943     case token_type_equal:
944     case token_type_ltemplate:
945     case token_type_mtemplate:
946     case token_type_xml_tag:
947     case token_type_xml_element_start:
948     case token_type_keyword:
949     case token_type_other:
950       return false;
951 
952     default:
953       abort ();
954     }
955 }
956 
957 static void
phase5_scan_regexp(void)958 phase5_scan_regexp (void)
959 {
960   int c;
961 
962   /* Scan for end of RegExp literal ('/').  */
963   for (;;)
964     {
965       /* Must use phase2 as there can't be comments.  */
966       c = phase2_getc ();
967       if (c == '/')
968         break;
969       if (c == '\\')
970         {
971           c = phase2_getc ();
972           if (c != UEOF)
973             continue;
974         }
975       if (c == UEOF)
976         {
977           error_with_progname = false;
978           error (0, 0,
979                  _("%s:%d: warning: RegExp literal terminated too early"),
980                  logical_file_name, line_number);
981           error_with_progname = true;
982           return;
983         }
984     }
985 
986   /* Scan for modifier flags (ECMA-262 5th section 15.10.4.1).  */
987   c = phase2_getc ();
988   if (!(c == 'g' || c == 'i' || c == 'm'))
989     phase2_ungetc (c);
990 }
991 
/* Number of open template literals `...${
   Incremented by phase5_get when the left part of a template literal is
   scanned, decremented when its right part is scanned.  */
static int template_literal_depth;

/* Number of open '{' tokens, at each template literal level.
   The "current" element is brace_depths[template_literal_depth].
   The array is grown on demand by new_brace_depth_level.  */
static int *brace_depths;
/* Number of allocated elements in brace_depths.  */
static size_t brace_depths_alloc;
1000 
1001 /* Adds a new brace_depths level after template_literal_depth was
1002    incremented.  */
1003 static void
new_brace_depth_level(void)1004 new_brace_depth_level (void)
1005 {
1006   if (template_literal_depth == brace_depths_alloc)
1007     {
1008       brace_depths_alloc = 2 * brace_depths_alloc + 1;
1009       /* Now template_literal_depth < brace_depths_alloc.  */
1010       brace_depths =
1011         (int *) xrealloc (brace_depths, brace_depths_alloc * sizeof (int));
1012     }
1013   brace_depths[template_literal_depth] = 0;
1014 }
1015 
/* Number of open XML elements.  */
static int xml_element_depth;
/* True while inside a JavaScript expression embedded in XML: phase5_get
   sets it at a '{' seen inside an XML element and clears it at the
   next '}'.  */
static bool inside_embedded_js_in_xml;
1019 
1020 static bool
phase5_scan_xml_markup(token_ty * tp)1021 phase5_scan_xml_markup (token_ty *tp)
1022 {
1023   struct
1024   {
1025     const char *start;
1026     const char *end;
1027   } markers[] =
1028       {
1029         { "!--", "--" },
1030         { "![CDATA[", "]]" },
1031         { "?", "?" }
1032       };
1033   int i;
1034 
1035   for (i = 0; i < SIZEOF (markers); i++)
1036     {
1037       const char *start = markers[i].start;
1038       const char *end = markers[i].end;
1039       int j;
1040 
1041       /* Look for a start marker.  */
1042       for (j = 0; start[j] != '\0'; j++)
1043         {
1044           int c;
1045 
1046           assert (phase2_pushback_length + j < SIZEOF (phase2_pushback));
1047           c = phase2_getc ();
1048           if (c == UEOF)
1049             goto eof;
1050           if (c != start[j])
1051             {
1052               int k = j;
1053 
1054               phase2_ungetc (c);
1055               k--;
1056 
1057               for (; k >= 0; k--)
1058                 phase2_ungetc (start[k]);
1059               break;
1060             }
1061         }
1062 
1063       if (start[j] == '\0')
1064         /* Skip until the end marker.  */
1065         for (;;)
1066           {
1067             int c;
1068 
1069             for (j = 0; end[j] != '\0'; j++)
1070               {
1071                 assert (phase2_pushback_length + 1 < SIZEOF (phase2_pushback));
1072                 c = phase2_getc ();
1073                 if (c == UEOF)
1074                   goto eof;
1075                 if (c != end[j])
1076                   {
1077                     /* Don't push the first character back so the next
1078                        iteration start from the second character.  */
1079                     if (j > 0)
1080                       {
1081                         int k = j;
1082 
1083                         phase2_ungetc (c);
1084                         k--;
1085 
1086                         for (; k > 0; k--)
1087                           phase2_ungetc (end[k]);
1088                       }
1089                     break;
1090                   }
1091               }
1092 
1093             if (end[j] == '\0')
1094               {
1095                 c = phase2_getc ();
1096                 if (c == UEOF)
1097                   goto eof;
1098                 if (c != '>')
1099                   {
1100                     error_with_progname = false;
1101                     error (0, 0,
1102                            _("%s:%d: warning: %s is not allowed"),
1103                            logical_file_name, line_number,
1104                            end);
1105                     error_with_progname = true;
1106                     return false;
1107                   }
1108                 return true;
1109               }
1110           }
1111     }
1112   return false;
1113 
1114  eof:
1115   error_with_progname = false;
1116   error (0, 0,
1117          _("%s:%d: warning: unterminated XML markup"),
1118          logical_file_name, line_number);
1119   error_with_progname = true;
1120   return false;
1121 }
1122 
/* Reads the next token into *TP.
   A token that was handed back with phase5_unget is returned first.
   Otherwise characters are read, whitespace is skipped, and one token is
   assembled:
   - tp->line_number is the value of line_number before the first character
     of the token was read,
   - tp->string is a freshly allocated copy of the text, for symbol and
     keyword tokens,
   - tp->mixed_string and tp->comment are set for string and template
     tokens.
   last_token_type is set to the type of the returned token.  */
static void
phase5_get (token_ty *tp)
{
  int c;

  /* Serve pending pushed-back tokens first, most recent one first.  */
  if (phase5_pushback_length)
    {
      *tp = phase5_pushback[--phase5_pushback_length];
      last_token_type = tp->type;
      return;
    }

  for (;;)
    {
      tp->line_number = line_number;
      c = phase3_getc ();

      switch (c)
        {
        case UEOF:
          tp->type = last_token_type = token_type_eof;
          return;

        case '\n':
          if (last_non_comment_line > last_comment_line)
            savable_comment_reset ();
          /* FALLTHROUGH */
        case ' ':
        case '\t':
        case '\f':
          /* Ignore whitespace and comments.  */
          continue;
        }

      last_non_comment_line = tp->line_number;

      switch (c)
        {
        case '.':
          {
            /* A '.' that is followed by a digit starts a number and is
               scanned like a symbol; any other '.' is a dot token.  */
            int c1 = phase3_getc ();
            phase3_ungetc (c1);
            if (!(c1 >= '0' && c1 <= '9'))
              {

                tp->type = last_token_type = token_type_dot;
                return;
              }
          }
          /* FALLTHROUGH */
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
        case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
        case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
        case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
        case 'Y': case 'Z':
        case '_':
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
        case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
        case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
        case 's': case 't': case 'u': case 'v': case 'w': case 'x':
        case 'y': case 'z':
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
          /* Symbol, or part of a number.  */
          {
            /* Scratch buffer, kept across calls to avoid reallocation.  */
            static char *buffer;
            static int bufmax;
            int bufpos;

            bufpos = 0;
            for (;;)
              {
                if (bufpos >= bufmax)
                  {
                    bufmax = 2 * bufmax + 10;
                    buffer = xrealloc (buffer, bufmax);
                  }
                buffer[bufpos++] = c;
                c = phase3_getc ();
                switch (c)
                  {
                  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
                  case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
                  case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
                  case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
                  case 'Y': case 'Z':
                  case '_':
                  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
                  case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
                  case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
                  case 's': case 't': case 'u': case 'v': case 'w': case 'x':
                  case 'y': case 'z':
                  case '0': case '1': case '2': case '3': case '4':
                  case '5': case '6': case '7': case '8': case '9':
                    continue;
                  default:
                    phase3_ungetc (c);
                    break;
                  }
                break;
              }
            /* Make room for the terminating NUL.  */
            if (bufpos >= bufmax)
              {
                bufmax = 2 * bufmax + 10;
                buffer = xrealloc (buffer, bufmax);
              }
            buffer[bufpos] = '\0';
            tp->string = xstrdup (buffer);
            /* Only "return" and "else" are distinguished as keywords; they
               matter to is_after_expression.  */
            if (strcmp (buffer, "return") == 0
                || strcmp (buffer, "else") == 0)
              tp->type = last_token_type = token_type_keyword;
            else
              tp->type = last_token_type = token_type_symbol;
            return;
          }

        case '"': case '\'':
          /* Strings.  */
          {
            int quote_char = c;
            lexical_context_ty saved_lexical_context = lexical_context;
            struct mixed_string_buffer msb;

            lexical_context = lc_string;
            /* Start accumulating the string.  */
            mixed_string_buffer_init (&msb, lexical_context,
                                      logical_file_name, line_number);
            for (;;)
              {
                int uc = phase7_getuc (quote_char);

                /* Keep line_number in sync.  */
                msb.line_number = line_number;

                if (uc == P7_EOF || uc == P7_STRING_END)
                  break;

                if (IS_UNICODE (uc))
                  {
                    assert (UNICODE_VALUE (uc) >= 0
                            && UNICODE_VALUE (uc) < 0x110000);
                    mixed_string_buffer_append_unicode (&msb,
                                                        UNICODE_VALUE (uc));
                  }
                else
                  mixed_string_buffer_append_char (&msb, uc);
              }
            tp->mixed_string = mixed_string_buffer_result (&msb);
            tp->comment = add_reference (savable_comment);
            lexical_context = saved_lexical_context;
            tp->type = last_token_type = token_type_string;
            return;
          }

        case '`':
          /* Template literals.  */
          {
            struct mixed_string_buffer msb;

            lexical_context = lc_string;
            /* Start accumulating the string.  */
            mixed_string_buffer_init (&msb, lexical_context,
                                      logical_file_name, line_number);
            for (;;)
              {
                int uc = phase7_getuc ('`');

                /* Keep line_number in sync.  */
                msb.line_number = line_number;

                if (uc == P7_EOF || uc == P7_STRING_END)
                  {
                    /* A template literal without ${...} parts: it is
                       returned like a string.  */
                    tp->mixed_string = mixed_string_buffer_result (&msb);
                    tp->comment = add_reference (savable_comment);
                    tp->type = last_token_type = token_type_template;
                    break;
                  }

                if (uc == P7_TEMPLATE_START_OF_EXPRESSION)
                  {
                    /* Left part of a template literal with expressions:
                       its text is discarded, and a new brace counting
                       level is opened for the embedded expression.  */
                    mixed_string_buffer_destroy (&msb);
                    tp->type = last_token_type = token_type_ltemplate;
                    template_literal_depth++;
                    new_brace_depth_level ();
                    break;
                  }

                if (IS_UNICODE (uc))
                  {
                    assert (UNICODE_VALUE (uc) >= 0
                            && UNICODE_VALUE (uc) < 0x110000);
                    mixed_string_buffer_append_unicode (&msb,
                                                        UNICODE_VALUE (uc));
                  }
                else
                  mixed_string_buffer_append_char (&msb, uc);
              }
            /* Unlike for quoted strings, the previous lexical context is
               not restored here.  */
            lexical_context = lc_outside;
            return;
          }

        case '+':
          tp->type = last_token_type = token_type_plus;
          return;

        /* Identify operators. The multiple character ones are simply ignored
         * as they are recognized here and are otherwise not relevant. */
        case '-': case '*': /* '+' and '/' are not listed here! */
        case '%':
        case '~': case '!': case '|': case '&': case '^':
        case '?': case ':':
          tp->type = last_token_type = token_type_operator;
          return;

        case '=':
          tp->type = last_token_type = token_type_equal;
          return;

        case '<':
          {
            /* We assume:
               - XMLMarkup and XMLElement are not allowed after an expression,
               - embedded JavaScript expressions in XML do not recurse.
             */
            if (xml_element_depth > 0
                || (!inside_embedded_js_in_xml
                    && ! is_after_expression ()))
              {
                /* Comments, PI, or CDATA.  */
                if (phase5_scan_xml_markup (tp))
                  /* NOTE: nothing on this path assigns tp->type in this
                     function.  Unless phase5_scan_xml_markup has filled in
                     *tp, the caller receives a token of indeterminate
                     type.  */
                  return;
                c = phase2_getc ();

                if (c == '/')
                  {
                    /* Closing tag.  */
                    lexical_context = lc_xml_close_tag;
                  }
                else
                  {
                    /* Opening element.  */
                    phase2_ungetc (c);
                    lexical_context = lc_xml_open_tag;
                    xml_element_depth++;
                  }
                tp->type = last_token_type = token_type_xml_tag;
              }
            else
              tp->type = last_token_type = token_type_operator;
          }
          return;

        case '>':
          /* Inside an XML element, '>' ends the tag that is being scanned;
             elsewhere it is an operator.  */
          if (xml_element_depth > 0 && !inside_embedded_js_in_xml)
            {
              switch (lexical_context)
                {
                case lc_xml_open_tag:
                  lexical_context = lc_xml_content;
                  tp->type = last_token_type = token_type_xml_element_start;
                  return;

                case lc_xml_close_tag:
                  if (--xml_element_depth > 0)
                    lexical_context = lc_xml_content;
                  else
                    lexical_context = lc_outside;
                  tp->type = last_token_type = token_type_xml_element_end;
                  return;

                default:
                  break;
                }
            }
          tp->type = last_token_type = token_type_operator;
          return;

        case '/':
          if (xml_element_depth > 0 && !inside_embedded_js_in_xml)
            {
              /* If it appears in an opening tag of an XML element, it's
                 part of '/>'.  */
              if (lexical_context == lc_xml_open_tag)
                {
                  c = phase2_getc ();
                  if (c == '>')
                    {
                      if (--xml_element_depth > 0)
                        lexical_context = lc_xml_content;
                      else
                        lexical_context = lc_outside;
                      tp->type = last_token_type = token_type_xml_empty_element;
                      return;
                    }
                  else
                    phase2_ungetc (c);
                }
            }

          /* Either a division operator or the start of a regular expression
             literal.  If the '/' token is spotted after an expression, it's a
             division; otherwise it's a regular expression.  */
          if (is_after_expression ())
            tp->type = last_token_type = token_type_operator;
          else
            {
              phase5_scan_regexp ();
              tp->type = last_token_type = token_type_regexp;
            }
          return;

        case '{':
          /* Inside an XML element, the first '{' starts an embedded
             JavaScript expression; any other '{' is counted at the current
             template literal level.  */
          if (xml_element_depth > 0 && !inside_embedded_js_in_xml)
            inside_embedded_js_in_xml = true;
          else
            brace_depths[template_literal_depth]++;
          tp->type = last_token_type = token_type_lbrace;
          return;

        case '}':
          if (xml_element_depth > 0 && inside_embedded_js_in_xml)
            inside_embedded_js_in_xml = false;
          else if (brace_depths[template_literal_depth] > 0)
            brace_depths[template_literal_depth]--;
          else if (template_literal_depth > 0)
            {
              /* This '}' has no matching '{' at the current template
                 literal level, so it ends a ${...} expression.  */
              /* Middle or right part of template literal.  */
              for (;;)
                {
                  int uc = phase7_getuc ('`');

                  if (uc == P7_EOF || uc == P7_STRING_END)
                    {
                      tp->type = last_token_type = token_type_rtemplate;
                      template_literal_depth--;
                      break;
                    }

                  if (uc == P7_TEMPLATE_START_OF_EXPRESSION)
                    {
                      tp->type = last_token_type = token_type_mtemplate;
                      break;
                    }
                }
              return;
            }
          tp->type = last_token_type = token_type_rbrace;
          return;

        case '(':
          tp->type = last_token_type = token_type_lparen;
          return;

        case ')':
          tp->type = last_token_type = token_type_rparen;
          return;

        case ',':
          tp->type = last_token_type = token_type_comma;
          return;

        case '[':
          tp->type = last_token_type = token_type_lbracket;
          return;

        case ']':
          tp->type = last_token_type = token_type_rbracket;
          return;

        default:
          /* We could carefully recognize each of the 2 and 3 character
             operators, but it is not necessary, as we only need to recognize
             gettext invocations.  Don't bother.  */
          tp->type = last_token_type = token_type_other;
          return;
        }
    }
}
1502 
1503 /* Supports only one pushback token.  */
1504 static void
phase5_unget(token_ty * tp)1505 phase5_unget (token_ty *tp)
1506 {
1507   if (tp->type != token_type_eof)
1508     {
1509       if (phase5_pushback_length == SIZEOF (phase5_pushback))
1510         abort ();
1511       phase5_pushback[phase5_pushback_length++] = *tp;
1512     }
1513 }
1514 
1515 
1516 /* String concatenation with '+'.
1517    Handling of tagged template literals.  */
1518 
1519 static void
x_javascript_lex(token_ty * tp)1520 x_javascript_lex (token_ty *tp)
1521 {
1522   phase5_get (tp);
1523   if (tp->type == token_type_string || tp->type == token_type_template)
1524     {
1525       mixed_string_ty *sum = tp->mixed_string;
1526 
1527       for (;;)
1528         {
1529           token_ty token2;
1530 
1531           phase5_get (&token2);
1532           if (token2.type == token_type_plus)
1533             {
1534               token_ty token3;
1535 
1536               phase5_get (&token3);
1537               if (token3.type == token_type_string
1538                   || token3.type == token_type_template)
1539                 {
1540                   sum = mixed_string_concat_free1 (sum, token3.mixed_string);
1541 
1542                   free_token (&token3);
1543                   free_token (&token2);
1544                   continue;
1545                 }
1546               phase5_unget (&token3);
1547             }
1548           phase5_unget (&token2);
1549           break;
1550         }
1551       tp->mixed_string = sum;
1552     }
1553   else if (tp->type == token_type_symbol)
1554     {
1555       token_ty token2;
1556 
1557       phase5_get (&token2);
1558       if (token2.type == token_type_template)
1559         {
1560           /* The value of
1561                tag `abc`
1562              is the value of the function call
1563                tag (["abc"])
1564              We don't know anything about this value.  Therefore, don't
1565              let the extractor see this template literal.  */
1566           free_token (&token2);
1567         }
1568       else
1569         phase5_unget (&token2);
1570     }
1571 }
1572 
1573 
1574 /* ========================= Extracting strings.  ========================== */
1575 
1576 
/* Context lookup table.  Set by extract_javascript from its flag_table
   argument; extract_balanced looks up every symbol in it by name.  */
static flag_context_list_table_ty *flag_context_list_table;
1579 
1580 
1581 /* The file is broken into tokens.  Scan the token stream, looking for
1582    a keyword, followed by a left paren, followed by a string.  When we
1583    see this sequence, we have something to remember.  We assume we are
1584    looking at a valid JavaScript program, and leave the complaints about
1585    the grammar to the compiler.
1586 
1587      Normal handling: Look for
1588        keyword ( ... msgid ... )
1589      Plural handling: Look for
1590        keyword ( ... msgid ... msgid_plural ... )
1591 
1592    We use recursion because the arguments before msgid or between msgid
1593    and msgid_plural can contain subexpressions of the same form.  */
1594 
1595 
1596 /* Extract messages until the next balanced closing parenthesis or bracket.
1597    Extracted messages are added to MLP.
1598    DELIM can be either token_type_rparen or token_type_rbracket, or
1599    token_type_eof to accept both.
1600    Return true upon eof, false upon closing parenthesis or bracket.  */
1601 static bool
extract_balanced(message_list_ty * mlp,token_type_ty delim,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1602 extract_balanced (message_list_ty *mlp,
1603                   token_type_ty delim,
1604                   flag_context_ty outer_context,
1605                   flag_context_list_iterator_ty context_iter,
1606                   struct arglist_parser *argparser)
1607 {
1608   /* Current argument number.  */
1609   int arg = 1;
1610   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1611   int state;
1612   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1613   const struct callshapes *next_shapes = NULL;
1614   /* Context iterator that will be used if the next token is a '('.  */
1615   flag_context_list_iterator_ty next_context_iter =
1616     passthrough_context_list_iterator;
1617   /* Current context.  */
1618   flag_context_ty inner_context =
1619     inherited_context (outer_context,
1620                        flag_context_list_iterator_advance (&context_iter));
1621 
1622   /* Start state is 0.  */
1623   state = 0;
1624 
1625   for (;;)
1626     {
1627       token_ty token;
1628 
1629       x_javascript_lex (&token);
1630       switch (token.type)
1631         {
1632         case token_type_symbol:
1633           {
1634             void *keyword_value;
1635 
1636             if (hash_find_entry (&keywords, token.string, strlen (token.string),
1637                                  &keyword_value)
1638                 == 0)
1639               {
1640                 next_shapes = (const struct callshapes *) keyword_value;
1641                 state = 1;
1642               }
1643             else
1644               state = 0;
1645           }
1646           next_context_iter =
1647             flag_context_list_iterator (
1648               flag_context_list_table_lookup (
1649                 flag_context_list_table,
1650                 token.string, strlen (token.string)));
1651           free (token.string);
1652           continue;
1653 
1654         case token_type_lparen:
1655           if (extract_balanced (mlp, token_type_rparen,
1656                                 inner_context, next_context_iter,
1657                                 arglist_parser_alloc (mlp,
1658                                                       state ? next_shapes : NULL)))
1659             {
1660               arglist_parser_done (argparser, arg);
1661               return true;
1662             }
1663           next_context_iter = null_context_list_iterator;
1664           state = 0;
1665           continue;
1666 
1667         case token_type_rparen:
1668           if (delim == token_type_rparen || delim == token_type_eof)
1669             {
1670               arglist_parser_done (argparser, arg);
1671               return false;
1672             }
1673           next_context_iter = null_context_list_iterator;
1674           state = 0;
1675           continue;
1676 
1677         case token_type_comma:
1678           arg++;
1679           inner_context =
1680             inherited_context (outer_context,
1681                                flag_context_list_iterator_advance (
1682                                  &context_iter));
1683           next_context_iter = passthrough_context_list_iterator;
1684           state = 0;
1685           continue;
1686 
1687         case token_type_lbracket:
1688           if (extract_balanced (mlp, token_type_rbracket,
1689                                 null_context, null_context_list_iterator,
1690                                 arglist_parser_alloc (mlp, NULL)))
1691             {
1692               arglist_parser_done (argparser, arg);
1693               return true;
1694             }
1695           next_context_iter = null_context_list_iterator;
1696           state = 0;
1697           continue;
1698 
1699         case token_type_rbracket:
1700           if (delim == token_type_rbracket || delim == token_type_eof)
1701             {
1702               arglist_parser_done (argparser, arg);
1703               return false;
1704             }
1705           next_context_iter = null_context_list_iterator;
1706           state = 0;
1707           continue;
1708 
1709         case token_type_lbrace:
1710           if (extract_balanced (mlp, token_type_rbrace,
1711                                 null_context, null_context_list_iterator,
1712                                 arglist_parser_alloc (mlp, NULL)))
1713             {
1714               arglist_parser_done (argparser, arg);
1715               return true;
1716             }
1717           next_context_iter = null_context_list_iterator;
1718           state = 0;
1719           continue;
1720 
1721         case token_type_rbrace:
1722           if (delim == token_type_rbrace || delim == token_type_eof)
1723             {
1724               arglist_parser_done (argparser, arg);
1725               return false;
1726             }
1727           next_context_iter = null_context_list_iterator;
1728           state = 0;
1729           continue;
1730 
1731         case token_type_string:
1732         case token_type_template:
1733           {
1734             lex_pos_ty pos;
1735 
1736             pos.file_name = logical_file_name;
1737             pos.line_number = token.line_number;
1738 
1739             if (extract_all)
1740               {
1741                 char *string = mixed_string_contents (token.mixed_string);
1742                 mixed_string_free (token.mixed_string);
1743                 remember_a_message (mlp, NULL, string, true, false,
1744                                     inner_context, &pos,
1745                                     NULL, token.comment, true);
1746               }
1747             else
1748               arglist_parser_remember (argparser, arg, token.mixed_string,
1749                                        inner_context,
1750                                        pos.file_name, pos.line_number,
1751                                        token.comment, true);
1752           }
1753           drop_reference (token.comment);
1754           next_context_iter = null_context_list_iterator;
1755           state = 0;
1756           continue;
1757 
1758         case token_type_xml_element_start:
1759           if (extract_balanced (mlp, token_type_xml_element_end,
1760                                 null_context, null_context_list_iterator,
1761                                 arglist_parser_alloc (mlp, NULL)))
1762             {
1763               arglist_parser_done (argparser, arg);
1764               return true;
1765             }
1766           next_context_iter = null_context_list_iterator;
1767           state = 0;
1768           continue;
1769 
1770         case token_type_xml_element_end:
1771           if (delim == token_type_xml_element_end || delim == token_type_eof)
1772             {
1773               arglist_parser_done (argparser, arg);
1774               return false;
1775             }
1776           next_context_iter = null_context_list_iterator;
1777           state = 0;
1778           continue;
1779 
1780         case token_type_eof:
1781           arglist_parser_done (argparser, arg);
1782           return true;
1783 
1784         case token_type_ltemplate:
1785         case token_type_mtemplate:
1786         case token_type_rtemplate:
1787         case token_type_keyword:
1788         case token_type_start:
1789         case token_type_dot:
1790         case token_type_plus:
1791         case token_type_regexp:
1792         case token_type_operator:
1793         case token_type_equal:
1794         case token_type_xml_tag:
1795         case token_type_xml_empty_element:
1796         case token_type_other:
1797           next_context_iter = null_context_list_iterator;
1798           state = 0;
1799           continue;
1800 
1801         default:
1802           abort ();
1803         }
1804     }
1805 }
1806 
1807 
1808 void
extract_javascript(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1809 extract_javascript (FILE *f,
1810                 const char *real_filename, const char *logical_filename,
1811                 flag_context_list_table_ty *flag_table,
1812                 msgdomain_list_ty *mdlp)
1813 {
1814   message_list_ty *mlp = mdlp->item[0]->messages;
1815 
1816   fp = f;
1817   real_file_name = real_filename;
1818   logical_file_name = xstrdup (logical_filename);
1819   line_number = 1;
1820 
1821   phase1_pushback_length = 0;
1822 
1823   lexical_context = lc_outside;
1824 
1825   phase2_pushback_length = 0;
1826 
1827   last_comment_line = -1;
1828   last_non_comment_line = -1;
1829 
1830   xgettext_current_file_source_encoding =
1831     (xgettext_global_source_encoding != NULL ? xgettext_global_source_encoding :
1832      po_charset_ascii);
1833 #if HAVE_ICONV
1834   xgettext_current_file_source_iconv = xgettext_global_source_iconv;
1835 #endif
1836 
1837   xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1838 #if HAVE_ICONV
1839   xgettext_current_source_iconv = xgettext_current_file_source_iconv;
1840 #endif
1841 
1842   continuation_or_nonblank_line = false;
1843 
1844   phase5_pushback_length = 0;
1845   last_token_type = token_type_start;
1846 
1847   template_literal_depth = 0;
1848   new_brace_depth_level ();
1849   xml_element_depth = 0;
1850   inside_embedded_js_in_xml = false;
1851 
1852   flag_context_list_table = flag_table;
1853 
1854   init_keywords ();
1855 
1856   /* Eat tokens until eof is seen.  When extract_balanced returns
1857      due to an unbalanced closing parenthesis, just restart it.  */
1858   while (!extract_balanced (mlp, token_type_eof,
1859                             null_context, null_context_list_iterator,
1860                             arglist_parser_alloc (mlp, NULL)))
1861     ;
1862 
1863   fp = NULL;
1864   real_file_name = NULL;
1865   logical_file_name = NULL;
1866   line_number = 0;
1867 }
1868