1 /* xgettext sh backend.
2    Copyright (C) 2003, 2005-2009, 2014, 2018-2020 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2003.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17 
18 #ifdef HAVE_CONFIG_H
19 # include "config.h"
20 #endif
21 
22 /* Specification.  */
23 #include "x-sh.h"
24 
25 #include <errno.h>
26 #include <limits.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 
32 #include "message.h"
33 #include "xgettext.h"
34 #include "xg-pos.h"
35 #include "xg-mixed-string.h"
36 #include "xg-arglist-context.h"
37 #include "xg-arglist-callshape.h"
38 #include "xg-arglist-parser.h"
39 #include "xg-message.h"
40 #include "error.h"
41 #include "error-progname.h"
42 #include "xalloc.h"
43 #include "mem-hash-map.h"
44 #include "../../gettext-runtime/src/escapes.h"
45 #include "gettext.h"
46 
47 #define _(s) gettext(s)
48 
/* Number of elements of the array A.  A must be a true array, not a pointer.
   The argument is parenthesized so that any array-valued expression can be
   passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
50 
51 
52 /* The sh syntax is defined in POSIX:2001, see
53      http://www.opengroup.org/onlinepubs/007904975/utilities/xcu_chap02.html
54    Summary of sh syntax:
55    - Input is broken into words, which are then subject to
56      - tilde expansion ~...
57      - command substitution `...`
58      - variable substitution $var
59      - arithmetic substitution $((...))
60      - field splitting at whitespace (IFS)
61      - wildcard pattern expansion *?
62      - quote removal
63    - Strings are enclosed in "..."; command substitution, variable
64      substitution and arithmetic substitution are performed here as well.
65    - '...' is a string without substitutions.
66    - The list of resulting words is split into commands by semicolon and
67      newline.
68    - '#' at the beginning of a word introduces a comment until end of line.
69    The parser is implemented in bash-2.05b/parse.y.  */
70 
71 
72 /* ====================== Keyword set customization.  ====================== */
73 
/* If true extract all strings.  Set by x_sh_extract_all ().  */
static bool extract_all = false;

/* Table of the keywords registered through x_sh_keyword (), together with
   their call shapes.  It is initialized lazily, by the first call of
   x_sh_keyword () with a non-NULL name.  */
static hash_table keywords;
/* True as long as the built-in default keywords still have to be registered
   by init_keywords ().  Cleared by x_sh_keyword (NULL) and by
   init_keywords () itself.  */
static bool default_keywords = true;
79 
80 
81 void
x_sh_extract_all()82 x_sh_extract_all ()
83 {
84   extract_all = true;
85 }
86 
87 
88 void
x_sh_keyword(const char * name)89 x_sh_keyword (const char *name)
90 {
91   if (name == NULL)
92     default_keywords = false;
93   else
94     {
95       const char *end;
96       struct callshape shape;
97       const char *colon;
98 
99       if (keywords.table == NULL)
100         hash_init (&keywords, 100);
101 
102       split_keywordspec (name, &end, &shape);
103 
104       /* The characters between name and end should form a valid C identifier.
105          A colon means an invalid parse in split_keywordspec().  */
106       colon = strchr (name, ':');
107       if (colon == NULL || colon >= end)
108         insert_keyword_callshape (&keywords, name, end - name, &shape);
109     }
110 }
111 
112 /* Finish initializing the keywords hash table.
113    Called after argument processing, before each file is processed.  */
114 static void
init_keywords()115 init_keywords ()
116 {
117   if (default_keywords)
118     {
119       /* When adding new keywords here, also update the documentation in
120          xgettext.texi!  */
121       x_sh_keyword ("gettext");
122       x_sh_keyword ("ngettext:1,2");
123       /* Note: There is also special handling for 'gettext' and 'ngettext'
124          in read_command, below.  */
125       x_sh_keyword ("eval_gettext");
126       x_sh_keyword ("eval_ngettext:1,2");
127       x_sh_keyword ("eval_pgettext:1c,2");
128       x_sh_keyword ("eval_npgettext:1c,2,3");
129       default_keywords = false;
130     }
131 }
132 
/* Record the format string flags of the built-in keywords through
   xgettext_record_flag (): pass-sh-format for the listed arguments of
   gettext and ngettext, sh-format for the listed arguments of the eval_*
   functions.  */
void
init_flag_table_sh ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-sh-format",
      "ngettext:1:pass-sh-format",
      "ngettext:2:pass-sh-format",
      "eval_gettext:1:sh-format",
      "eval_ngettext:1:sh-format",
      "eval_ngettext:2:sh-format",
      "eval_pgettext:2:sh-format",
      "eval_npgettext:2:sh-format",
      "eval_npgettext:3:sh-format"
    };
  size_t i;

  for (i = 0; i < sizeof flag_specs / sizeof flag_specs[0]; i++)
    xgettext_record_flag (flag_specs[i]);
}
146 
147 
148 /* ======================== Reading of characters.  ======================== */
149 
/* The input file stream.  It is read by do_getc (), with pushback through
   do_ungetc ().  */
static FILE *fp;
152 
153 
154 /* Fetch the next character from the input file.  */
155 static int
do_getc()156 do_getc ()
157 {
158   int c = getc (fp);
159 
160   if (c == EOF)
161     {
162       if (ferror (fp))
163         error (EXIT_FAILURE, errno,
164                _("error while reading \"%s\""), real_file_name);
165     }
166   else if (c == '\n')
167    line_number++;
168 
169   return c;
170 }
171 
172 /* Put back the last fetched character, not EOF.  */
173 static void
do_ungetc(int c)174 do_ungetc (int c)
175 {
176   if (c == '\n')
177     line_number--;
178   ungetc (c, fp);
179 }
180 
181 
182 /* Remove backslash followed by newline from the input stream.  */
183 
184 static int phase1_pushback[1];
185 static int phase1_pushback_length;
186 
187 static int
phase1_getc()188 phase1_getc ()
189 {
190   int c;
191 
192   if (phase1_pushback_length)
193     {
194       c = phase1_pushback[--phase1_pushback_length];
195       if (c == '\n')
196         ++line_number;
197       return c;
198     }
199   for (;;)
200     {
201       c = do_getc ();
202       if (c != '\\')
203         return c;
204       c = do_getc ();
205       if (c != '\n')
206         {
207           if (c != EOF)
208             do_ungetc (c);
209           return '\\';
210         }
211     }
212 }
213 
214 /* Supports only one pushback character.  */
215 static void
phase1_ungetc(int c)216 phase1_ungetc (int c)
217 {
218   switch (c)
219     {
220     case EOF:
221       break;
222 
223     case '\n':
224       --line_number;
225       /* FALLTHROUGH */
226 
227     default:
228       if (phase1_pushback_length == SIZEOF (phase1_pushback))
229         abort ();
230       phase1_pushback[phase1_pushback_length++] = c;
231       break;
232     }
233 }
234 
235 
236 /* ========================== Reading of tokens.  ========================== */
237 
238 
/* A token consists of a sequence of characters, stored in a heap-allocated
   array that is set up by init_token () and enlarged by grow_token ().  */
struct token
{
  int allocated;                /* number of chars allocated for 'chars' */
  int charcount;                /* number of chars in use in 'chars' */
  char *chars;                  /* the token's constituents */
};
246 
247 /* Initialize a 'struct token'.  */
248 static inline void
init_token(struct token * tp)249 init_token (struct token *tp)
250 {
251   tp->allocated = 10;
252   tp->chars = XNMALLOC (tp->allocated, char);
253   tp->charcount = 0;
254 }
255 
256 /* Free the memory pointed to by a 'struct token'.  */
257 static inline void
free_token(struct token * tp)258 free_token (struct token *tp)
259 {
260   free (tp->chars);
261 }
262 
263 /* Ensure there is enough room in the token for one more character.  */
264 static inline void
grow_token(struct token * tp)265 grow_token (struct token *tp)
266 {
267   if (tp->charcount == tp->allocated)
268     {
269       tp->allocated *= 2;
270       tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
271     }
272 }
273 
274 /* Convert a struct token * to a char*.  */
275 static char *
string_of_token(const struct token * tp)276 string_of_token (const struct token *tp)
277 {
278   char *str;
279   int n;
280 
281   n = tp->charcount;
282   str = XNMALLOC (n + 1, char);
283   memcpy (str, tp->chars, n);
284   str[n] = '\0';
285   return str;
286 }
287 
288 
289 /* ========================= Accumulating messages ========================= */
290 
291 
/* Message list of this backend.  NOTE(review): presumably the list that
   receives the extracted messages; it is neither set nor read in the part
   of the file preceding read_word () - confirm against the extraction
   entry point.  */
static message_list_ty *mlp;
293 
294 
295 /* ========================= Accumulating comments ========================= */
296 
297 
/* Growable buffer that collects the text of the comment line being read.
   bufmax is its allocated size, buflen the number of bytes in use.  The
   buffer is enlarged by comment_add () and comment_line_end ().  */
static char *buffer;
static size_t bufmax;
static size_t buflen;
301 
302 static inline void
comment_start()303 comment_start ()
304 {
305   buflen = 0;
306 }
307 
308 static inline void
comment_add(int c)309 comment_add (int c)
310 {
311   if (buflen >= bufmax)
312     {
313       bufmax = 2 * bufmax + 10;
314       buffer = xrealloc (buffer, bufmax);
315     }
316   buffer[buflen++] = c;
317 }
318 
319 static inline void
comment_line_end()320 comment_line_end ()
321 {
322   while (buflen >= 1
323          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
324     --buflen;
325   if (buflen >= bufmax)
326     {
327       bufmax = 2 * bufmax + 10;
328       buffer = xrealloc (buffer, bufmax);
329     }
330   buffer[buflen] = '\0';
331   savable_comment_add (buffer);
332 }
333 
334 
/* These are for tracking whether comments count as immediately before
   keyword.
   read_word () sets last_comment_line to the line on which a '#' comment
   starts, and last_non_comment_line to the line of certain non-comment
   words.  At a newline, read_word () calls savable_comment_reset () if
   last_non_comment_line > last_comment_line.  */
static int last_comment_line;
static int last_non_comment_line;
339 
340 
341 /* ========================= Debackslashification ========================== */
342 
343 /* This state tracks the effect of backquotes, double-quotes and single-quotes
344    on the parsing of backslashes.  We make a single pass through the input
345    file, keeping the state up to date.  This is much faster than accumulating
346    strings and processing them with explicit debackslashification, like the
347    shell does it.  */
348 
/* The number of nested `...` or "`...`" constructs.  Assumed to be <= 32.
   Incremented by saw_opening_backquote (), decremented by
   saw_closing_backquote ().  Note that phase2_getc () shifts 1 to the left
   by nested_backquotes + 1 bits.  */
static unsigned int nested_backquotes;

/* A bit mask indicating which of the currently open `...` or "`...`"
   constructs is with double-quotes: "`...`".
   A bit value of 1 stands for "`...`", a bit value of 0 stands for `...`.
   Bit position 0 designates the outermost backquotes nesting,
   bit position 1 the second-outermost backquotes nesting,
   ...
   bit position (nested_backquotes-1) the innermost backquotes nesting.  */
static unsigned int open_doublequotes_mask;

/* A bit indicating whether a double-quote is currently open inside the
   innermost backquotes nesting.  */
static bool open_doublequote;

/* A bit indicating whether a single-quote is currently open inside the
   innermost backquotes nesting.  */
static bool open_singlequote;

/* The expected terminator of the currently open single-quote.
   Usually '\'', but can be '"' for i18n-quotes.
   saw_opening_singlequote () sets it to '\''.  */
static char open_singlequote_terminator;
372 
373 
374 /* Functions to update the state.  */
375 
376 static inline void
saw_opening_backquote()377 saw_opening_backquote ()
378 {
379   if (open_singlequote)
380     abort ();
381   if (open_doublequote)
382     open_doublequotes_mask |= (unsigned int) 1 << nested_backquotes;
383   nested_backquotes++;
384   open_doublequote = false;
385 }
386 
387 static inline void
saw_closing_backquote()388 saw_closing_backquote ()
389 {
390   nested_backquotes--;
391   open_doublequote = (open_doublequotes_mask >> nested_backquotes) & 1;
392   open_doublequotes_mask &= ((unsigned int) 1 << nested_backquotes) - 1;
393   open_singlequote = false; /* just for safety */
394 }
395 
396 static inline void
saw_opening_doublequote()397 saw_opening_doublequote ()
398 {
399   if (open_singlequote || open_doublequote)
400     abort ();
401   open_doublequote = true;
402 }
403 
404 static inline void
saw_closing_doublequote()405 saw_closing_doublequote ()
406 {
407   if (open_singlequote || !open_doublequote)
408     abort ();
409   open_doublequote = false;
410 }
411 
412 static inline void
saw_opening_singlequote()413 saw_opening_singlequote ()
414 {
415   if (open_doublequote || open_singlequote)
416     abort ();
417   open_singlequote = true;
418   open_singlequote_terminator = '\'';
419 }
420 
421 static inline void
saw_closing_singlequote()422 saw_closing_singlequote ()
423 {
424   if (open_doublequote || !open_singlequote)
425     abort ();
426   open_singlequote = false;
427 }
428 
429 
430 /* ========================== Reading of commands ========================== */
431 
/* We are only interested in constant strings.  Other words need not be
   represented precisely.  */
enum word_type
{
  t_string,     /* constant string */
  t_assignment, /* variable assignment */
  t_other,      /* other string */
  t_separator,  /* command separator: semicolon or newline */
  t_redirect,   /* redirection: one of < > >| << <<- >> <> <& >& */
  t_backquote,  /* closing '`' pseudo word */
  t_paren,      /* closing ')' pseudo word */
  t_eof         /* EOF marker */
};
445 
/* A word, as filled in by read_word ().  The fields other than 'type' are
   documented as meaningful for t_string words only.  */
struct word
{
  enum word_type type;
  struct token *token;          /* for t_string */
  int line_number_at_start;     /* for t_string */
};
452 
453 /* Free the memory pointed to by a 'struct word'.  */
454 static inline void
free_word(struct word * wp)455 free_word (struct word *wp)
456 {
457   if (wp->type == t_string)
458     {
459       free_token (wp->token);
460       free (wp->token);
461     }
462 }
463 
464 /* Convert a t_string token to a char*.  */
465 static char *
string_of_word(const struct word * wp)466 string_of_word (const struct word *wp)
467 {
468   char *str;
469   int n;
470 
471   if (!(wp->type == t_string))
472     abort ();
473   n = wp->token->charcount;
474   str = XNMALLOC (n + 1, char);
475   memcpy (str, wp->token->chars, n);
476   str[n] = '\0';
477   return str;
478 }
479 
480 /* Convert a t_string token to a char*, ignoring the first OFFSET bytes.  */
481 static char *
substring_of_word(const struct word * wp,size_t offset)482 substring_of_word (const struct word *wp, size_t offset)
483 {
484   char *str;
485   int n;
486 
487   if (!(wp->type == t_string))
488     abort ();
489   n = wp->token->charcount;
490   if (!(offset <= n))
491     abort ();
492   str = XNMALLOC (n - offset + 1, char);
493   memcpy (str, wp->token->chars + offset, n - offset);
494   str[n - offset] = '\0';
495   return str;
496 }
497 
498 
/* Whitespace recognition.
   Return true if C is an unquoted space, tab or newline.  */

static inline bool
is_whitespace (int c)
{
  switch (c)
    {
    case ' ':
    case '\t':
    case '\n':
      return true;

    default:
      return false;
    }
}
506 
/* Operator character recognition.
   Return true if C is one of the unquoted characters | & ; < > ( ).  */

static inline bool
is_operator_start (int c)
{
  switch (c)
    {
    case '|':
    case '&':
    case ';':
    case '<':
    case '>':
    case '(':
    case ')':
      return true;

    default:
      return false;
    }
}
515 
516 
/* Denotation of a quoted character.
   The distinction between quoted and unquoted character is important only for
   the special, whitespace and operator characters; it is irrelevant for
   alphanumeric characters, '\\' and many others.
   QUOTED must only be applied to values in the 'unsigned char' range; in
   particular, with the usual EOF == -1, QUOTED (EOF) would coincide with
   the unquoted character value UCHAR_MAX.  */
#define QUOTED(c) (UCHAR_MAX + 1 + (c))
/* Values in the 'unsigned char' range are implicitly unquoted.  Among these,
   the following are important:
     '"'         opening or closing double quote
     '\''        opening or closing single quote
     '$'         the unknown result of a dollar expansion
     '`'         does not occur - replaced with OPENING_BACKQUOTE or
                 CLOSING_BACKQUOTE
 */
#define OPENING_BACKQUOTE (2 * (UCHAR_MAX + 1) + '`')
#define CLOSING_BACKQUOTE (3 * (UCHAR_MAX + 1) + '`')
532 
533 /* 2 characters of pushback are supported.
534    2 characters of pushback occur only when the first is an 'x'; in all
535    other cases only one character of pushback is needed.  */
536 static int phase2_pushback[2];
537 static int phase2_pushback_length;
538 
539 /* Return the next character, with backslashes removed.
540    The result is QUOTED(c) for some unsigned char c, if the next character
541    is escaped sufficiently often to make it a regular constituent character,
542    or simply an 'unsigned char' if it has its special meaning (of special,
543    whitespace or operator charcter), or OPENING_BACKQUOTE, CLOSING_BACKQUOTE,
544    EOF.
545    It's the caller's responsibility to update the state.  */
546 static int
phase2_getc()547 phase2_getc ()
548 {
549   int c;
550 
551   if (phase2_pushback_length)
552     {
553       c = phase2_pushback[--phase2_pushback_length];
554       if (c == '\n')
555         ++line_number;
556       return c;
557     }
558 
559   c = phase1_getc ();
560   if (c == EOF)
561     return c;
562   if (c == '\'')
563     return ((open_doublequote
564              || (open_singlequote && open_singlequote_terminator != c))
565             ? QUOTED (c)
566             : c);
567   if (open_singlequote)
568     {
569       if (c == open_singlequote_terminator)
570         return c;
571     }
572   else
573     {
574       if (c == '"' || c == '$')
575         return c;
576       if (c == '`')
577         return (nested_backquotes > 0 ? CLOSING_BACKQUOTE : OPENING_BACKQUOTE);
578     }
579   if (c == '\\')
580     {
581       /* Number of debackslashification passes that are active at the
582          current point.  */
583       unsigned int debackslashify =
584         nested_backquotes + (open_singlequote ? 0 : 1);
585       /* Normal number of backslashes that yield a single backslash in the
586          final output.  */
587       unsigned int expected_count =
588         (unsigned int) 1 << debackslashify;
589       /* Number of backslashes found.  */
590       unsigned int count;
591 
592       for (count = 1; count < expected_count; count++)
593         {
594           c = phase1_getc ();
595           if (c != '\\')
596             break;
597         }
598       if (count == expected_count)
599         return '\\';
600 
601       /* The count of backslashes is > 0 and < expected_count, therefore the
602          result depends on c, the first character after the backslashes.
603          Note: The formulas below don't necessarily have a logic; they were
604          empirically determined such that 1. the xgettext-sh-1 test succeeds,
605          2. the behaviour for count == 0 would correspond to the one without
606          any baskslash.  */
607       if (c == '\'')
608         {
609           if (!open_singlequote && count > (expected_count >> 1))
610             {
611               phase1_ungetc (c);
612               return '\\';
613             }
614           else
615             return ((open_doublequote
616                      || (open_singlequote
617                          ? open_singlequote_terminator != c
618                          : count == (expected_count >> 1)))
619                     ? QUOTED (c)
620                     : c);
621         }
622       else if (c == '"')
623         {
624           /* Each debackslashification pass converts \\ to \ and \" to ";
625              passes corresponding to `...` drop a lone " whereas passes
626              corresponding to "`...`" leave it alone.  Therefore, the
627              minimum number of backslashes needed to get one double-quote
628              in the end is  open_doublequotes_mask + 1.  */
629           if (open_singlequote)
630             {
631               if (count > open_doublequotes_mask)
632                 {
633                   phase1_ungetc (c);
634                   return '\\';
635                 }
636               else
637                 return (open_singlequote_terminator != c ? QUOTED (c) : c);
638             }
639           else
640             {
641               if (count > open_doublequotes_mask)
642                 return QUOTED (c);
643               else
644                 /* Some of the count values <= open_doublequotes_mask are
645                    actually invalid here, but we assume a syntactically
646                    correct input file anyway.  */
647                 return c;
648             }
649         }
650       else if (c == '`')
651         {
652           /* FIXME: This code looks fishy.  */
653           if (count == expected_count - 1)
654             return c;
655           else
656             /* Some of the count values < expected_count - 1 are
657                actually invalid here, but we assume a syntactically
658                correct input file anyway.  */
659             if (nested_backquotes > 0 && !open_singlequote
660                 && count >= (expected_count >> 2))
661               return OPENING_BACKQUOTE;
662             else
663               return CLOSING_BACKQUOTE;
664         }
665       else if (c == '$')
666         {
667           if (open_singlequote)
668             return QUOTED (c);
669           if (count >= (expected_count >> 1))
670             return QUOTED (c);
671           else
672             return c;
673         }
674       else
675         {
676           /* When not followed by a quoting character or backslash or dollar,
677              a backslash survives a debackslashification pass unmodified.
678              Therefore each debackslashification pass performs a
679                count := (count + 1) >> 1
680              operation.  Therefore the minimum number of backslashes needed
681              to get one backslash in the end is  (expected_count >> 1) + 1.  */
682           if (open_doublequote || open_singlequote)
683             {
684               if (count > 0)
685                 {
686                   phase1_ungetc (c);
687                   return '\\';
688                 }
689               else
690                 return QUOTED (c);
691             }
692           else
693             {
694               if (count > (expected_count >> 1))
695                 {
696                   phase1_ungetc (c);
697                   return '\\';
698                 }
699               else if (count > 0)
700                 return QUOTED (c);
701               else
702                 return c;
703             }
704         }
705     }
706 
707   return (open_singlequote || open_doublequote ? QUOTED (c) : c);
708 }
709 
710 /* Supports 2 characters of pushback.  */
711 static void
phase2_ungetc(int c)712 phase2_ungetc (int c)
713 {
714   switch (c)
715     {
716     case EOF:
717       break;
718 
719     case '\n':
720       --line_number;
721       /* FALLTHROUGH */
722 
723     default:
724       if (phase2_pushback_length == SIZEOF (phase2_pushback))
725         abort ();
726       phase2_pushback[phase2_pushback_length++] = c;
727       break;
728     }
729 }
730 
731 
/* Context lookup table.  */
static flag_context_list_table_ty *flag_context_list_table;


/* Forward declaration of local functions.
   read_command_list is needed by read_word (), which calls it with ')' as
   LOOKING_FOR in order to parse the inside of a $(...) command
   substitution.  */
static enum word_type read_command_list (int looking_for,
                                         flag_context_ty outer_context);
739 
740 
741 
742 /* Read the next word.
743    'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
744    or '\0'.  */
745 static void
read_word(struct word * wp,int looking_for,flag_context_ty context)746 read_word (struct word *wp, int looking_for, flag_context_ty context)
747 {
748   int c;
749   bool all_unquoted_digits;
750   bool all_unquoted_name_characters;
751 
752   do
753     {
754       c = phase2_getc ();
755       if (c == '#')
756         {
757           /* Skip a comment up to end of line.  */
758           last_comment_line = line_number;
759           comment_start ();
760           for (;;)
761             {
762               c = phase1_getc ();
763               if (c == EOF || c == '\n')
764                 break;
765               /* We skip all leading white space, but not EOLs.  */
766               if (!(buflen == 0 && (c == ' ' || c == '\t')))
767                 comment_add (c);
768             }
769           comment_line_end ();
770         }
771       if (c == '\n')
772         {
773           /* Comments assumed to be grouped with a message must immediately
774              precede it, with no non-whitespace token on a line between
775              both.  */
776           if (last_non_comment_line > last_comment_line)
777             savable_comment_reset ();
778           wp->type = t_separator;
779           return;
780         }
781     }
782   while (is_whitespace (c));
783 
784   if (c == EOF)
785     {
786       wp->type = t_eof;
787       return;
788     }
789 
790   if (c == '<' || c == '>')
791     {
792       /* Recognize the redirection operators < > >| << <<- >> <> <& >&
793          But <( and >) are handled below, not here.  */
794       int c2 = phase2_getc ();
795       if (c2 != '(')
796         {
797           if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
798             {
799               if (c == '<' && c2 == '<')
800                 {
801                   int c3 = phase2_getc ();
802                   if (c3 != '-')
803                     phase2_ungetc (c3);
804                 }
805             }
806           else
807             phase2_ungetc (c2);
808           wp->type = t_redirect;
809           return;
810         }
811       else
812         phase2_ungetc (c2);
813     }
814 
815   if (c == CLOSING_BACKQUOTE)
816     {
817       if (looking_for == CLOSING_BACKQUOTE)
818         {
819           saw_closing_backquote ();
820           wp->type = t_backquote;
821           last_non_comment_line = line_number;
822           return;
823         }
824       else if (looking_for == ')')
825         {
826           /* The input is invalid syntax, such as `a<(`
827              Push back the closing backquote and pretend that we have seen a
828              closing parenthesis.  */
829           phase2_ungetc (c);
830           wp->type = t_paren;
831           last_non_comment_line = line_number;
832           return;
833         }
834       else
835         /* We shouldn't be reading a CLOSING_BACKQUOTE when
836            looking_for == '\0'.  */
837         abort ();
838     }
839 
840   if (looking_for == ')' && c == ')')
841     {
842       wp->type = t_paren;
843       last_non_comment_line = line_number;
844       return;
845     }
846 
847   if (is_operator_start (c))
848     {
849       wp->type = (c == ';' ? t_separator : t_other);
850       return;
851     }
852 
853   wp->type = t_string;
854   wp->token = XMALLOC (struct token);
855   init_token (wp->token);
856   wp->line_number_at_start = line_number;
857   /* True while all characters in the token seen so far are digits.  */
858   all_unquoted_digits = true;
859   /* True while all characters in the token seen so far form a "name":
860      all characters are unquoted underscores, digits, or alphabetics from the
861      portable character set, and the first character is not a digit.  Cf.
862      <https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_235>
863    */
864   all_unquoted_name_characters = true;
865 
866   for (;; c = phase2_getc ())
867     {
868       if (c == EOF)
869         break;
870 
871       if (all_unquoted_digits && (c == '<' || c == '>'))
872         {
873           /* Recognize the redirection operators < > >| << <<- >> <> <& >&
874              prefixed with a nonempty sequence of unquoted digits.  */
875           int c2 = phase2_getc ();
876           if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
877             {
878               if (c == '<' && c2 == '<')
879                 {
880                   int c3 = phase2_getc ();
881                   if (c3 != '-')
882                     phase2_ungetc (c3);
883                 }
884             }
885           else
886             phase2_ungetc (c2);
887 
888           wp->type = t_redirect;
889           free_token (wp->token);
890           free (wp->token);
891 
892           last_non_comment_line = line_number;
893 
894           return;
895         }
896 
897       all_unquoted_digits = all_unquoted_digits && (c >= '0' && c <= '9');
898 
899       if (all_unquoted_name_characters && wp->token->charcount > 0 && c == '=')
900         {
901           wp->type = t_assignment;
902           continue;
903         }
904 
905       all_unquoted_name_characters =
906          all_unquoted_name_characters
907          && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_'
908              || (wp->token->charcount > 0 && c >= '0' && c <= '9'));
909 
910       if (c == '$')
911         {
912           int c2;
913 
914           /* An unquoted dollar indicates we are not inside '...'.  */
915           if (open_singlequote)
916             abort ();
917           /* After reading a dollar, we know that there is no pushed back
918              character from an earlier lookahead.  */
919           if (phase2_pushback_length > 0)
920             abort ();
921           /* Therefore we can use phase1 without interfering with phase2.
922              We need to recognize $( outside and inside double-quotes.
923              It would be incorrect to do
924                 c2 = phase2_getc ();
925                 if (c2 == '(' || c2 == QUOTED ('('))
926              because that would also trigger for $\(.  */
927           c2 = phase1_getc ();
928           if (c2 == '(')
929             {
930               bool saved_open_doublequote;
931               int c3;
932 
933               phase1_ungetc (c2);
934 
935               /* The entire inner command or arithmetic expression is read
936                  ignoring possible surrounding double-quotes.  */
937               saved_open_doublequote = open_doublequote;
938               open_doublequote = false;
939 
940               c2 = phase2_getc ();
941               if (c2 != '(')
942                 abort ();
943 
944               c3 = phase2_getc ();
945               if (c3 == '(')
946                 {
947                   /* Arithmetic expression (Bash syntax).  Skip until the
948                      matching closing parenthesis.  */
949                   unsigned int depth = 2;
950 
951                   do
952                     {
953                       c = phase2_getc ();
954                       if (c == '(')
955                         depth++;
956                       else if (c == ')')
957                         if (--depth == 0)
958                           break;
959                     }
960                   while (c != EOF);
961                 }
962               else
963                 {
964                   /* Command substitution (Bash syntax).  */
965                   phase2_ungetc (c3);
966                   read_command_list (')', context);
967                 }
968 
969               open_doublequote = saved_open_doublequote;
970             }
971           else
972             {
973               phase1_ungetc (c2);
974               c2 = phase2_getc ();
975 
976               if (c2 == '\'' && !open_singlequote)
977                 {
978                   /* Bash builtin for string with ANSI-C escape sequences.  */
979                   for (;;)
980                     {
981                       /* We have to use phase1 throughout this loop,
982                          because phase2 does debackslashification,
983                          which is undesirable when parsing ANSI-C
984                          escape sequences.  */
985                       c = phase1_getc ();
986                       if (c == EOF)
987                         break;
988                       if (c == '\'')
989                         break;
990                       if (c == '\\')
991                         {
992                           c = phase1_getc ();
993                           switch (c)
994                             {
995                             default:
996                               phase1_ungetc (c);
997                               c = '\\';
998                               break;
999 
1000                             case '\\':
1001                               break;
1002                             case '\'':
1003                               break;
1004                             case '"':
1005                               break;
1006 
1007                             case 'a':
1008                               c = '\a';
1009                               break;
1010                             case 'b':
1011                               c = '\b';
1012                               break;
1013                             case 'e':
1014                             case 'E':
1015                               c = 0x1b; /* ESC */
1016                               break;
1017                             case 'f':
1018                               c = '\f';
1019                               break;
1020                             case 'n':
1021                               c = '\n';
1022                               break;
1023                             case 'r':
1024                               c = '\r';
1025                               break;
1026                             case 't':
1027                               c = '\t';
1028                               break;
1029                             case 'v':
1030                               c = '\v';
1031                               break;
1032 
1033                             case 'x':
1034                               c = phase1_getc ();
1035                               if ((c >= '0' && c <= '9')
1036                                   || (c >= 'A' && c <= 'F')
1037                                   || (c >= 'a' && c <= 'f'))
1038                                 {
1039                                   int n;
1040 
1041                                   if (c >= '0' && c <= '9')
1042                                     n = c - '0';
1043                                   else if (c >= 'A' && c <= 'F')
1044                                     n = 10 + c - 'A';
1045                                   else if (c >= 'a' && c <= 'f')
1046                                     n = 10 + c - 'a';
1047                                   else
1048                                     abort ();
1049 
1050                                   c = phase1_getc ();
1051                                   if ((c >= '0' && c <= '9')
1052                                       || (c >= 'A' && c <= 'F')
1053                                       || (c >= 'a' && c <= 'f'))
1054                                     {
1055                                       if (c >= '0' && c <= '9')
1056                                         n = n * 16 + c - '0';
1057                                       else if (c >= 'A' && c <= 'F')
1058                                         n = n * 16 + 10 + c - 'A';
1059                                       else if (c >= 'a' && c <= 'f')
1060                                         n = n * 16 + 10 + c - 'a';
1061                                       else
1062                                         abort ();
1063                                     }
1064                                   else
1065                                     phase1_ungetc (c);
1066 
1067                                   c = n;
1068                                 }
1069                               else
1070                                 {
1071                                   phase1_ungetc (c);
1072                                   phase1_ungetc ('x');
1073                                   c = '\\';
1074                                 }
1075                               break;
1076 
1077                             case '0': case '1': case '2': case '3':
1078                             case '4': case '5': case '6': case '7':
1079                               {
1080                                 int n = c - '0';
1081 
1082                                 c = phase1_getc ();
1083                                 if (c >= '0' && c <= '7')
1084                                   {
1085                                     n = n * 8 + c - '0';
1086 
1087                                     c = phase1_getc ();
1088                                     if (c >= '0' && c <= '7')
1089                                       n = n * 8 + c - '0';
1090                                     else
1091                                       phase1_ungetc (c);
1092                                   }
1093                                 else
1094                                   phase1_ungetc (c);
1095 
1096                                 c = n;
1097                               }
1098                               break;
1099                             }
1100                         }
1101                       if (wp->type == t_string)
1102                         {
1103                           grow_token (wp->token);
1104                           wp->token->chars[wp->token->charcount++] =
1105                             (unsigned char) c;
1106                         }
1107                     }
1108                   /* The result is a literal string.  Don't change wp->type.  */
1109                   continue;
1110                 }
1111               else if (c2 == '"' && !open_doublequote)
1112                 {
1113                   /* Bash builtin for internationalized string.  */
1114                   lex_pos_ty pos;
1115                   struct token string;
1116 
1117                   saw_opening_singlequote ();
1118                   open_singlequote_terminator = '"';
1119                   pos.file_name = logical_file_name;
1120                   pos.line_number = line_number;
1121                   init_token (&string);
1122                   for (;;)
1123                     {
1124                       c = phase2_getc ();
1125                       if (c == EOF)
1126                         break;
1127                       if (c == '"')
1128                         {
1129                           saw_closing_singlequote ();
1130                           break;
1131                         }
1132                       grow_token (&string);
1133                       string.chars[string.charcount++] = (unsigned char) c;
1134                     }
1135                   remember_a_message (mlp, NULL, string_of_token (&string),
1136                                       false, false, context, &pos,
1137                                       NULL, savable_comment, false);
1138                   free_token (&string);
1139 
1140                   error_with_progname = false;
1141                   error (0, 0, _("%s:%lu: warning: the syntax $\"...\" is deprecated due to security reasons; use eval_gettext instead"),
1142                          pos.file_name, (unsigned long) pos.line_number);
1143                   error_with_progname = true;
1144 
1145                   /* The result at runtime is not constant. Therefore we
1146                      change wp->type.  */
1147                 }
1148               else
1149                 phase2_ungetc (c2);
1150             }
1151           wp->type = t_other;
1152           continue;
1153         }
1154 
1155       if (c == '\'')
1156         {
1157           if (!open_singlequote)
1158             {
1159               /* Handle an opening single quote.  */
1160               saw_opening_singlequote ();
1161             }
1162           else
1163             {
1164               /* Handle a closing single quote.  */
1165               saw_closing_singlequote ();
1166             }
1167           continue;
1168         }
1169 
1170       if (c == '"')
1171         {
1172           if (open_singlequote && open_singlequote_terminator == '"')
1173             {
1174               /* Handle a closing i18n quote.  */
1175               saw_closing_singlequote ();
1176             }
1177           else if (!open_doublequote)
1178             {
1179               /* Handle an opening double quote.  */
1180               saw_opening_doublequote ();
1181             }
1182           else
1183             {
1184               /* Handle a closing double quote.  */
1185               saw_closing_doublequote ();
1186             }
1187           continue;
1188         }
1189 
1190       if (c == OPENING_BACKQUOTE)
1191         {
1192           /* Handle an opening backquote.  */
1193           saw_opening_backquote ();
1194 
1195           read_command_list (CLOSING_BACKQUOTE, context);
1196 
1197           wp->type = t_other;
1198           continue;
1199         }
1200       if (c == CLOSING_BACKQUOTE)
1201         break;
1202 
1203       if (c == '<' || c == '>')
1204         {
1205           int c2;
1206 
1207           /* An unquoted c indicates we are not inside '...' nor "...".  */
1208           if (open_singlequote || open_doublequote)
1209             abort ();
1210 
1211           c2 = phase2_getc ();
1212           if (c2 == '(')
1213             {
1214               /* Process substitution (Bash syntax).  */
1215               read_command_list (')', context);
1216 
1217               wp->type = t_other;
1218               continue;
1219             }
1220           else
1221             phase2_ungetc (c2);
1222         }
1223 
1224       if (!open_singlequote && !open_doublequote
1225           && (is_whitespace (c) || is_operator_start (c)))
1226         break;
1227 
1228       if (wp->type == t_string)
1229         {
1230           grow_token (wp->token);
1231           wp->token->chars[wp->token->charcount++] = (unsigned char) c;
1232         }
1233     }
1234 
1235   phase2_ungetc (c);
1236 
1237   if (wp->type != t_string)
1238     {
1239       free_token (wp->token);
1240       free (wp->token);
1241     }
1242   last_non_comment_line = line_number;
1243 }
1244 
1245 
1246 /* Read the next command.
1247    'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
1248    or '\0'.
1249    Returns the type of the word that terminated the command.  */
1250 static enum word_type
read_command(int looking_for,flag_context_ty outer_context)1251 read_command (int looking_for, flag_context_ty outer_context)
1252 {
1253   /* Read the words that make up the command.
1254      Here we completely ignore field splitting at whitespace and wildcard
1255      expansions; i.e. we assume that the source is written in such a way that
1256      every word in the program determines exactly one word in the resulting
1257      command.
1258      But we do not require that the 'gettext'/'ngettext' command is the
1259      first in the command; this is because 1. we want to allow for prefixes
1260      like "$verbose" that may expand to nothing, and 2. it's a big effort
1261      to know where a command starts in a $(for ...) or $(case ...) compound
1262      command.  */
1263   int arg = 0;                  /* Current argument number.  */
1264   bool arg_of_redirect = false; /* True right after a redirection operator.  */
1265   bool must_expand_arg_strings = false; /* True if need to expand escape
1266                                            sequences in arguments.  */
1267   flag_context_list_iterator_ty context_iter;
1268   const struct callshapes *shapes = NULL;
1269   struct arglist_parser *argparser = NULL;
1270 
1271   for (;;)
1272     {
1273       struct word inner;
1274       flag_context_ty inner_context;
1275 
1276       if (arg == 0)
1277         inner_context = null_context;
1278       else
1279         inner_context =
1280           inherited_context (outer_context,
1281                              flag_context_list_iterator_advance (
1282                                &context_iter));
1283 
1284       read_word (&inner, looking_for, inner_context);
1285 
1286       /* Recognize end of command.  */
1287       if (inner.type == t_separator
1288           || inner.type == t_backquote || inner.type == t_paren
1289           || inner.type == t_eof)
1290         {
1291           if (argparser != NULL)
1292             arglist_parser_done (argparser, arg);
1293           return inner.type;
1294         }
1295 
1296       if (extract_all)
1297         {
1298           if (inner.type == t_string)
1299             {
1300               lex_pos_ty pos;
1301 
1302               pos.file_name = logical_file_name;
1303               pos.line_number = inner.line_number_at_start;
1304               remember_a_message (mlp, NULL, string_of_word (&inner), false,
1305                                   false, inner_context, &pos,
1306                                   NULL, savable_comment, false);
1307             }
1308         }
1309 
1310       if (arg_of_redirect)
1311         {
1312           /* Ignore arguments of redirection operators.  */
1313           arg_of_redirect = false;
1314         }
1315       else if (inner.type == t_redirect)
1316         {
1317           /* Ignore this word and the following one.  */
1318           arg_of_redirect = true;
1319         }
1320       else
1321         {
1322           bool matters_for_argparser = true;
1323 
1324           if (argparser == NULL)
1325             {
1326               /* This is the function position.  */
1327               arg = 0;
1328               if (inner.type == t_assignment)
1329                 {
1330                   /* An assignment just sets an environment variable.
1331                      Ignore it.  */
1332                   /* Don't increment arg in this round.  */
1333                   matters_for_argparser = false;
1334                 }
1335               else if (inner.type == t_string)
1336                 {
1337                   char *function_name = string_of_word (&inner);
1338 
1339                   if (strcmp (function_name, "env") == 0)
1340                     {
1341                       /* The 'env' command just introduces more assignments.
1342                          Ignore it.  */
1343                       /* Don't increment arg in this round.  */
1344                       matters_for_argparser = false;
1345                     }
1346                   else
1347                     {
1348                       void *keyword_value;
1349 
1350                       if (hash_find_entry (&keywords,
1351                                            function_name,
1352                                            strlen (function_name),
1353                                            &keyword_value)
1354                           == 0)
1355                         shapes = (const struct callshapes *) keyword_value;
1356 
1357                       argparser = arglist_parser_alloc (mlp, shapes);
1358 
1359                       context_iter =
1360                         flag_context_list_iterator (
1361                           flag_context_list_table_lookup (
1362                             flag_context_list_table,
1363                             function_name, strlen (function_name)));
1364                     }
1365 
1366                   free (function_name);
1367                 }
1368               else
1369                 context_iter = null_context_list_iterator;
1370             }
1371           else
1372             {
1373               /* These are the argument positions.  */
1374               if (inner.type == t_string)
1375                 {
1376                   bool accepts_context =
1377                     ((argparser->keyword_len == 7
1378                       && memcmp (argparser->keyword, "gettext", 7) == 0)
1379                      || (argparser->keyword_len == 8
1380                          && memcmp (argparser->keyword, "ngettext", 8) == 0));
1381                   bool accepts_expand =
1382                     ((argparser->keyword_len == 7
1383                       && memcmp (argparser->keyword, "gettext", 7) == 0)
1384                      || (argparser->keyword_len == 8
1385                          && memcmp (argparser->keyword, "ngettext", 8) == 0));
1386                   if (accepts_context && argparser->next_is_msgctxt)
1387                     {
1388                       char *s = string_of_word (&inner);
1389                       mixed_string_ty *ms =
1390                         mixed_string_alloc_simple (s, lc_string,
1391                                                    logical_file_name,
1392                                                    inner.line_number_at_start);
1393                       free (s);
1394                       argparser->next_is_msgctxt = false;
1395                       arglist_parser_remember_msgctxt (argparser, ms,
1396                                                        inner_context,
1397                                                        logical_file_name,
1398                                                        inner.line_number_at_start);
1399                       matters_for_argparser = false;
1400                     }
1401                   else if (accepts_context
1402                            && ((inner.token->charcount == 2
1403                                 && memcmp (inner.token->chars, "-c", 2) == 0)
1404                                || (inner.token->charcount == 9
1405                                    && memcmp (inner.token->chars, "--context", 9) == 0)))
1406                     {
1407                       argparser->next_is_msgctxt = true;
1408                       matters_for_argparser = false;
1409                     }
1410                   else if (accepts_context
1411                            && (inner.token->charcount >= 10
1412                                && memcmp (inner.token->chars, "--context=", 10) == 0))
1413                     {
1414                       char *s = substring_of_word (&inner, 10);
1415                       mixed_string_ty *ms =
1416                         mixed_string_alloc_simple (s, lc_string,
1417                                                    logical_file_name,
1418                                                    inner.line_number_at_start);
1419                       free (s);
1420                       argparser->next_is_msgctxt = false;
1421                       arglist_parser_remember_msgctxt (argparser, ms,
1422                                                        inner_context,
1423                                                        logical_file_name,
1424                                                        inner.line_number_at_start);
1425                       matters_for_argparser = false;
1426                     }
1427                   else if (accepts_expand
1428                            && inner.token->charcount == 2
1429                            && memcmp (inner.token->chars, "-e", 2) == 0)
1430                     {
1431                       must_expand_arg_strings = true;
1432                       matters_for_argparser = false;
1433                     }
1434                   else
1435                     {
1436                       char *s = string_of_word (&inner);
1437                       mixed_string_ty *ms;
1438 
1439                       /* When '-e' was specified, expand escape sequences in s.  */
1440                       if (accepts_expand && must_expand_arg_strings)
1441                         {
1442                           bool expands_backslash_c =
1443                             (argparser->keyword_len == 7
1444                              && memcmp (argparser->keyword, "gettext", 7) == 0);
1445                           bool backslash_c = false;
1446                           char *expanded =
1447                             (char *)
1448                             expand_escapes (s, expands_backslash_c ? &backslash_c : NULL);
1449                           /* We can ignore the value of expands_backslash_c, because
1450                              here we don't support the gettext '-s' option.  */
1451                           if (expanded != s)
1452                             free (s);
1453                           s = expanded;
1454                         }
1455 
1456                       ms = mixed_string_alloc_simple (s, lc_string,
1457                                                       logical_file_name,
1458                                                       inner.line_number_at_start);
1459                       free (s);
1460                       arglist_parser_remember (argparser, arg, ms,
1461                                                inner_context,
1462                                                logical_file_name,
1463                                                inner.line_number_at_start,
1464                                                savable_comment, false);
1465                     }
1466                 }
1467 
1468               if (matters_for_argparser)
1469                 if (arglist_parser_decidedp (argparser, arg))
1470                   {
1471                     /* Stop looking for arguments of the last function_name.  */
1472                     /* FIXME: What about context_iter?  */
1473                     arglist_parser_done (argparser, arg);
1474                     shapes = NULL;
1475                     argparser = NULL;
1476                   }
1477             }
1478 
1479           if (matters_for_argparser)
1480             arg++;
1481         }
1482 
1483       free_word (&inner);
1484     }
1485 }
1486 
1487 
1488 /* Read a list of commands.
1489    'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
1490    or '\0'.
1491    Returns the type of the word that terminated the command list.  */
1492 static enum word_type
read_command_list(int looking_for,flag_context_ty outer_context)1493 read_command_list (int looking_for, flag_context_ty outer_context)
1494 {
1495   for (;;)
1496     {
1497       enum word_type terminator;
1498 
1499       terminator = read_command (looking_for, outer_context);
1500       if (terminator != t_separator)
1501         return terminator;
1502     }
1503 }
1504 
1505 
1506 void
extract_sh(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1507 extract_sh (FILE *f,
1508             const char *real_filename, const char *logical_filename,
1509             flag_context_list_table_ty *flag_table,
1510             msgdomain_list_ty *mdlp)
1511 {
1512   mlp = mdlp->item[0]->messages;
1513 
1514   fp = f;
1515   real_file_name = real_filename;
1516   logical_file_name = xstrdup (logical_filename);
1517   line_number = 1;
1518 
1519   phase1_pushback_length = 0;
1520 
1521   last_comment_line = -1;
1522   last_non_comment_line = -1;
1523 
1524   nested_backquotes = 0;
1525   open_doublequotes_mask = 0;
1526   open_doublequote = false;
1527   open_singlequote = false;
1528 
1529   phase2_pushback_length = 0;
1530 
1531   flag_context_list_table = flag_table;
1532 
1533   init_keywords ();
1534 
1535   /* Eat tokens until eof is seen.  */
1536   read_command_list ('\0', null_context);
1537 
1538   fp = NULL;
1539   real_file_name = NULL;
1540   logical_file_name = NULL;
1541   line_number = 0;
1542 }
1543