1 /* xgettext Perl backend.
2    Copyright (C) 2002-2010, 2013, 2016, 2018-2020 Free Software Foundation, Inc.
3 
4    This file was written by Guido Flohr <guido@imperia.net>, 2002-2010.
5 
6    This program is free software: you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
18 
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22 
23 /* Specification.  */
24 #include "x-perl.h"
25 
26 #include <errno.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 
32 #include "message.h"
33 #include "rc-str-list.h"
34 #include "xgettext.h"
35 #include "xg-pos.h"
36 #include "xg-encoding.h"
37 #include "xg-mixed-string.h"
38 #include "xg-arglist-context.h"
39 #include "xg-arglist-callshape.h"
40 #include "xg-arglist-parser.h"
41 #include "xg-message.h"
42 #include "error.h"
43 #include "error-progname.h"
44 #include "xalloc.h"
45 #include "po-charset.h"
46 #include "unistr.h"
47 #include "uniname.h"
48 #include "gettext.h"
49 
50 #define _(s) gettext(s)
51 
52 /* The Perl syntax is defined in perlsyn.pod.  Try the command
53    "man perlsyn" or "perldoc perlsyn".
54    Also, the syntax after the 'sub' keyword is specified in perlsub.pod.
55    Try the command "man perlsub" or "perldoc perlsub".
56    Perl 5.10 has new operators '//' and '//=', see
57    <https://perldoc.perl.org/perldelta.html#Defined-or-operator>.  */
58 
59 #define DEBUG_PERL 0
60 
61 
62 /* ====================== Keyword set customization.  ====================== */
63 
64 /* If true extract all strings.  */
65 static bool extract_all = false;
66 
67 static hash_table keywords;
68 static bool default_keywords = true;
69 
70 
71 void
x_perl_extract_all()72 x_perl_extract_all ()
73 {
74   extract_all = true;
75 }
76 
77 
78 void
x_perl_keyword(const char * name)79 x_perl_keyword (const char *name)
80 {
81   if (name == NULL)
82     default_keywords = false;
83   else
84     {
85       const char *end;
86       struct callshape shape;
87       const char *colon;
88 
89       if (keywords.table == NULL)
90         hash_init (&keywords, 100);
91 
92       split_keywordspec (name, &end, &shape);
93 
94       /* The characters between name and end should form a valid C identifier.
95          A colon means an invalid parse in split_keywordspec().  */
96       colon = strchr (name, ':');
97       if (colon == NULL || colon >= end)
98         insert_keyword_callshape (&keywords, name, end - name, &shape);
99     }
100 }
101 
102 /* Finish initializing the keywords hash table.
103    Called after argument processing, before each file is processed.  */
104 static void
init_keywords()105 init_keywords ()
106 {
107   if (default_keywords)
108     {
109       /* When adding new keywords here, also update the documentation in
110          xgettext.texi!  */
111       x_perl_keyword ("gettext");
112       x_perl_keyword ("%gettext");
113       x_perl_keyword ("$gettext");
114       x_perl_keyword ("dgettext:2");
115       x_perl_keyword ("dcgettext:2");
116       x_perl_keyword ("ngettext:1,2");
117       x_perl_keyword ("dngettext:2,3");
118       x_perl_keyword ("dcngettext:2,3");
119       x_perl_keyword ("gettext_noop");
120       x_perl_keyword ("pgettext:1c,2");
121       x_perl_keyword ("dpgettext:2c,3");
122       x_perl_keyword ("dcpgettext:2c,3");
123       x_perl_keyword ("npgettext:1c,2,3");
124       x_perl_keyword ("dnpgettext:2c,3,4");
125       x_perl_keyword ("dcnpgettext:2c,3,4");
126 
127 #if 0
128       x_perl_keyword ("__");
129       x_perl_keyword ("$__");
130       x_perl_keyword ("%__");
131       x_perl_keyword ("__x");
132       x_perl_keyword ("__n:1,2");
133       x_perl_keyword ("__nx:1,2");
134       x_perl_keyword ("__xn:1,2");
135       x_perl_keyword ("N__");
136 #endif
137       default_keywords = false;
138     }
139 }
140 
141 void
init_flag_table_perl()142 init_flag_table_perl ()
143 {
144   /* Gettext binding for Perl.  */
145   xgettext_record_flag ("gettext:1:pass-perl-format");
146   xgettext_record_flag ("gettext:1:pass-perl-brace-format");
147   xgettext_record_flag ("%gettext:1:pass-perl-format");
148   xgettext_record_flag ("%gettext:1:pass-perl-brace-format");
149   xgettext_record_flag ("$gettext:1:pass-perl-format");
150   xgettext_record_flag ("$gettext:1:pass-perl-brace-format");
151   xgettext_record_flag ("dgettext:2:pass-perl-format");
152   xgettext_record_flag ("dgettext:2:pass-perl-brace-format");
153   xgettext_record_flag ("dcgettext:2:pass-perl-format");
154   xgettext_record_flag ("dcgettext:2:pass-perl-brace-format");
155   xgettext_record_flag ("ngettext:1:pass-perl-format");
156   xgettext_record_flag ("ngettext:2:pass-perl-format");
157   xgettext_record_flag ("ngettext:1:pass-perl-brace-format");
158   xgettext_record_flag ("ngettext:2:pass-perl-brace-format");
159   xgettext_record_flag ("dngettext:2:pass-perl-format");
160   xgettext_record_flag ("dngettext:3:pass-perl-format");
161   xgettext_record_flag ("dngettext:2:pass-perl-brace-format");
162   xgettext_record_flag ("dngettext:3:pass-perl-brace-format");
163   xgettext_record_flag ("dcngettext:2:pass-perl-format");
164   xgettext_record_flag ("dcngettext:3:pass-perl-format");
165   xgettext_record_flag ("dcngettext:2:pass-perl-brace-format");
166   xgettext_record_flag ("dcngettext:3:pass-perl-brace-format");
167   xgettext_record_flag ("gettext_noop:1:pass-perl-format");
168   xgettext_record_flag ("gettext_noop:1:pass-perl-brace-format");
169   xgettext_record_flag ("pgettext:2:pass-perl-format");
170   xgettext_record_flag ("pgettext:2:pass-perl-brace-format");
171   xgettext_record_flag ("dpgettext:3:pass-perl-format");
172   xgettext_record_flag ("dpgettext:3:pass-perl-brace-format");
173   xgettext_record_flag ("dcpgettext:3:pass-perl-format");
174   xgettext_record_flag ("dcpgettext:3:pass-perl-brace-format");
175   xgettext_record_flag ("npgettext:2:pass-perl-format");
176   xgettext_record_flag ("npgettext:3:pass-perl-format");
177   xgettext_record_flag ("npgettext:2:pass-perl-brace-format");
178   xgettext_record_flag ("npgettext:3:pass-perl-brace-format");
179   xgettext_record_flag ("dnpgettext:3:pass-perl-format");
180   xgettext_record_flag ("dnpgettext:4:pass-perl-format");
181   xgettext_record_flag ("dnpgettext:3:pass-perl-brace-format");
182   xgettext_record_flag ("dnpgettext:4:pass-perl-brace-format");
183   xgettext_record_flag ("dcnpgettext:3:pass-perl-format");
184   xgettext_record_flag ("dcnpgettext:4:pass-perl-format");
185   xgettext_record_flag ("dcnpgettext:3:pass-perl-brace-format");
186   xgettext_record_flag ("dcnpgettext:4:pass-perl-brace-format");
187 
188   /* Perl builtins.  */
189   xgettext_record_flag ("printf:1:perl-format"); /* argument 1 or 2 ?? */
190   xgettext_record_flag ("sprintf:1:perl-format");
191 #if 0
192   /* Shortcuts from libintl-perl.  */
193   xgettext_record_flag ("__:1:pass-perl-format");
194   xgettext_record_flag ("__:1:pass-perl-brace-format");
195   xgettext_record_flag ("%__:1:pass-perl-format");
196   xgettext_record_flag ("%__:1:pass-perl-brace-format");
197   xgettext_record_flag ("$__:1:pass-perl-format");
198   xgettext_record_flag ("$__:1:pass-perl-brace-format");
199   xgettext_record_flag ("__x:1:perl-brace-format");
200   xgettext_record_flag ("__n:1:pass-perl-format");
201   xgettext_record_flag ("__n:2:pass-perl-format");
202   xgettext_record_flag ("__n:1:pass-perl-brace-format");
203   xgettext_record_flag ("__n:2:pass-perl-brace-format");
204   xgettext_record_flag ("__nx:1:perl-brace-format");
205   xgettext_record_flag ("__nx:2:perl-brace-format");
206   xgettext_record_flag ("__xn:1:perl-brace-format");
207   xgettext_record_flag ("__xn:2:perl-brace-format");
208   xgettext_record_flag ("N__:1:pass-perl-format");
209   xgettext_record_flag ("N__:1:pass-perl-brace-format");
210 #endif
211 }
212 
213 
214 /* ======================== Reading of characters.  ======================== */
215 
216 /* The input file stream.  */
217 static FILE *fp;
218 
219 /* The current line buffer.  */
220 static char *linebuf;
221 /* The size of the input buffer.  */
222 static size_t linebuf_size;
223 
224 /* The size of the current line.  */
225 static int linesize;
226 
227 /* The position in the current line.  */
228 static int linepos;
229 
230 /* Number of lines eaten for here documents.  */
231 static int eaten_here;
232 
233 /* Paranoia: EOF marker for __END__ or __DATA__.  */
234 static bool end_of_file;
235 
236 
237 /* 1. line_number handling.  */
238 
239 /* Returns the next character from the input stream or EOF.  */
240 static int
phase1_getc()241 phase1_getc ()
242 {
243   line_number += eaten_here;
244   eaten_here = 0;
245 
246   if (end_of_file)
247     return EOF;
248 
249   if (linepos >= linesize)
250     {
251       linesize = getline (&linebuf, &linebuf_size, fp);
252 
253       if (linesize < 0)
254         {
255           if (ferror (fp))
256             error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
257                    real_file_name);
258           end_of_file = true;
259           return EOF;
260         }
261 
262       linepos = 0;
263       ++line_number;
264 
265       /* Undosify.  This is important for catching the end of <<EOF and
266          <<'EOF'.  We could rely on stdio doing this for us but
267          it is not uncommon to to come across Perl scripts with CRLF
268          newline conventions on systems that do not follow this
269          convention.  */
270       if (linesize >= 2 && linebuf[linesize - 1] == '\n'
271           && linebuf[linesize - 2] == '\r')
272         {
273           linebuf[linesize - 2] = '\n';
274           linebuf[linesize - 1] = '\0';
275           --linesize;
276         }
277     }
278 
279   return linebuf[linepos++];
280 }
281 
282 /* Supports only one pushback character.  */
283 static void
phase1_ungetc(int c)284 phase1_ungetc (int c)
285 {
286   if (c != EOF)
287     {
288       if (linepos == 0)
289         /* Attempt to ungetc across line boundary.  Shouldn't happen.
290            No two phase1_ungetc calls are permitted in a row.  */
291         abort ();
292 
293       --linepos;
294     }
295 }
296 
297 /* Read a here document and return its contents.
298    The delimiter is an UTF-8 encoded string; the resulting string is UTF-8
299    encoded as well.  */
300 
301 static char *
get_here_document(const char * delimiter)302 get_here_document (const char *delimiter)
303 {
304   /* Accumulator for the entire here document, including a NUL byte
305      at the end.  */
306   static char *buffer;
307   static size_t bufmax = 0;
308   size_t bufpos = 0;
309   /* Current line being appended.  */
310   static char *my_linebuf = NULL;
311   static size_t my_linebuf_size = 0;
312 
313   /* Allocate the initial buffer.  Later on, bufmax > 0.  */
314   if (bufmax == 0)
315     {
316       buffer = XNMALLOC (1, char);
317       buffer[0] = '\0';
318       bufmax = 1;
319     }
320 
321   for (;;)
322     {
323       int read_bytes = getline (&my_linebuf, &my_linebuf_size, fp);
324       char *my_line_utf8;
325       bool chomp;
326 
327       if (read_bytes < 0)
328         {
329           if (ferror (fp))
330             {
331               error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
332                      real_file_name);
333             }
334           else
335             {
336               error_with_progname = false;
337               error (EXIT_SUCCESS, 0,
338                      _("%s:%d: can't find string terminator \"%s\" anywhere before EOF"),
339                      real_file_name, line_number, delimiter);
340               error_with_progname = true;
341 
342               break;
343             }
344         }
345 
346       ++eaten_here;
347 
348       /* Convert to UTF-8.  */
349       my_line_utf8 =
350         from_current_source_encoding (my_linebuf, lc_string, logical_file_name,
351                                       line_number + eaten_here);
352       if (my_line_utf8 != my_linebuf)
353         {
354           if (strlen (my_line_utf8) >= my_linebuf_size)
355             {
356               my_linebuf_size = strlen (my_line_utf8) + 1;
357               my_linebuf = xrealloc (my_linebuf, my_linebuf_size);
358             }
359           strcpy (my_linebuf, my_line_utf8);
360           free (my_line_utf8);
361         }
362 
363       /* Undosify.  This is important for catching the end of <<EOF and
364          <<'EOF'.  We could rely on stdio doing this for us but you
365          it is not uncommon to to come across Perl scripts with CRLF
366          newline conventions on systems that do not follow this
367          convention.  */
368       if (read_bytes >= 2 && my_linebuf[read_bytes - 1] == '\n'
369           && my_linebuf[read_bytes - 2] == '\r')
370         {
371           my_linebuf[read_bytes - 2] = '\n';
372           my_linebuf[read_bytes - 1] = '\0';
373           --read_bytes;
374         }
375 
376       /* Temporarily remove the trailing newline from my_linebuf.  */
377       chomp = false;
378       if (read_bytes >= 1 && my_linebuf[read_bytes - 1] == '\n')
379         {
380           chomp = true;
381           my_linebuf[read_bytes - 1] = '\0';
382         }
383 
384       /* See whether this line terminates the here document.  */
385       if (strcmp (my_linebuf, delimiter) == 0)
386         break;
387 
388       /* Add back the trailing newline to my_linebuf.  */
389       if (chomp)
390         my_linebuf[read_bytes - 1] = '\n';
391 
392       /* Ensure room for read_bytes + 1 bytes.  */
393       if (bufpos + read_bytes >= bufmax)
394         {
395           do
396             bufmax = 2 * bufmax + 10;
397           while (bufpos + read_bytes >= bufmax);
398           buffer = xrealloc (buffer, bufmax);
399         }
400       /* Append this line to the accumulator.  */
401       strcpy (buffer + bufpos, my_linebuf);
402       bufpos += read_bytes;
403     }
404 
405   /* Done accumulating the here document.  */
406   return xstrdup (buffer);
407 }
408 
409 /* Skips pod sections.  */
410 static void
skip_pod()411 skip_pod ()
412 {
413   line_number += eaten_here;
414   eaten_here = 0;
415   linepos = 0;
416 
417   for (;;)
418     {
419       linesize = getline (&linebuf, &linebuf_size, fp);
420 
421       if (linesize < 0)
422         {
423           if (ferror (fp))
424             error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
425                    real_file_name);
426           return;
427         }
428 
429       ++line_number;
430 
431       if (strncmp ("=cut", linebuf, 4) == 0)
432         {
433           /* Force reading of a new line on next call to phase1_getc().  */
434           linepos = linesize;
435           return;
436         }
437     }
438 }
439 
440 
441 /* These are for tracking whether comments count as immediately before
442    keyword.  */
443 static int last_comment_line;
444 static int last_non_comment_line;
445 
446 
447 /* 2. Replace each comment that is not inside a string literal or regular
448    expression with a newline character.  We need to remember the comment
449    for later, because it may be attached to a keyword string.  */
450 
451 static int
phase2_getc()452 phase2_getc ()
453 {
454   static char *buffer;
455   static size_t bufmax;
456   size_t buflen;
457   int lineno;
458   int c;
459   char *utf8_string;
460 
461   c = phase1_getc ();
462   if (c == '#')
463     {
464       buflen = 0;
465       lineno = line_number;
466       /* Skip leading whitespace.  */
467       for (;;)
468         {
469           c = phase1_getc ();
470           if (c == EOF)
471             break;
472           if (c != ' ' && c != '\t' && c != '\r' && c != '\f')
473             {
474               phase1_ungetc (c);
475               break;
476             }
477         }
478       /* Accumulate the comment.  */
479       for (;;)
480         {
481           c = phase1_getc ();
482           if (c == '\n' || c == EOF)
483             break;
484           if (buflen >= bufmax)
485             {
486               bufmax = 2 * bufmax + 10;
487               buffer = xrealloc (buffer, bufmax);
488             }
489           buffer[buflen++] = c;
490         }
491       if (buflen >= bufmax)
492         {
493           bufmax = 2 * bufmax + 10;
494           buffer = xrealloc (buffer, bufmax);
495         }
496       buffer[buflen] = '\0';
497       /* Convert it to UTF-8.  */
498       utf8_string =
499         from_current_source_encoding (buffer, lc_comment, logical_file_name,
500                                       lineno);
501       /* Save it until we encounter the corresponding string.  */
502       savable_comment_add (utf8_string);
503       last_comment_line = lineno;
504     }
505   return c;
506 }
507 
508 /* Supports only one pushback character.  */
509 static void
phase2_ungetc(int c)510 phase2_ungetc (int c)
511 {
512   if (c != EOF)
513     phase1_ungetc (c);
514 }
515 
516 /* Whitespace recognition.  */
517 
518 #define case_whitespace \
519   case ' ': case '\t': case '\r': case '\n': case '\f'
520 
521 static inline bool
is_whitespace(int c)522 is_whitespace (int c)
523 {
524   return (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f');
525 }
526 
527 
528 /* ========================== Reading of tokens.  ========================== */
529 
530 
531 enum token_type_ty
532 {
533   token_type_eof,
534   token_type_lparen,            /* ( */
535   token_type_rparen,            /* ) */
536   token_type_comma,             /* , */
537   token_type_fat_comma,         /* => */
538   token_type_dereference,       /* -> */
539   token_type_semicolon,         /* ; */
540   token_type_lbrace,            /* { */
541   token_type_rbrace,            /* } */
542   token_type_lbracket,          /* [ */
543   token_type_rbracket,          /* ] */
544   token_type_string,            /* quote-like */
545   token_type_number,            /* starting with a digit o dot */
546   token_type_named_op,          /* if, unless, while, ... */
547   token_type_variable,          /* $... */
548   token_type_object,            /* A dereferenced variable, maybe a blessed
549                                    object.  */
550   token_type_symbol,            /* symbol, number */
551   token_type_regex_op,          /* s, tr, y, m.  */
552   token_type_dot,               /* . */
553   token_type_other,             /* regexp, misc. operator */
554   /* The following are not really token types, but variants used by
555      the parser.  */
556   token_type_keyword_symbol     /* keyword symbol */
557 };
558 typedef enum token_type_ty token_type_ty;
559 
560 /* Subtypes for strings, important for interpolation.  */
561 enum string_type_ty
562 {
563   string_type_verbatim,     /* "<<'EOF'", "m'...'", "s'...''...'",
564                                "tr/.../.../", "y/.../.../".  */
565   string_type_q,            /* "'..'", "q/.../".  */
566   string_type_qq,           /* '"..."', "`...`", "qq/.../", "qx/.../",
567                                "<file*glob>".  */
568   string_type_qr            /* Not supported.  */
569 };
570 
571 /* Subtypes for symbols, important for dollar interpretation.  */
572 enum symbol_type_ty
573 {
574   symbol_type_none,         /* Nothing special.  */
575   symbol_type_sub,          /* 'sub'.  */
576   symbol_type_function      /* Function name after 'sub'.  */
577 };
578 
579 typedef struct token_ty token_ty;
580 struct token_ty
581 {
582   token_type_ty type;
583   token_type_ty last_type;
584   int sub_type;                 /* for token_type_string, token_type_symbol */
585   char *string;                 /* for:                 in encoding:
586                                    token_type_named_op  ASCII
587                                    token_type_string    UTF-8
588                                    token_type_symbol    ASCII
589                                    token_type_variable  global_source_encoding
590                                    token_type_object    global_source_encoding
591                                  */
592   refcounted_string_list_ty *comment; /* for token_type_string */
593   int line_number;
594 };
595 
596 #if DEBUG_PERL
597 static const char *
token2string(const token_ty * token)598 token2string (const token_ty *token)
599 {
600   switch (token->type)
601     {
602     case token_type_eof:
603       return "token_type_eof";
604     case token_type_lparen:
605       return "token_type_lparen";
606     case token_type_rparen:
607       return "token_type_rparen";
608     case token_type_comma:
609       return "token_type_comma";
610     case token_type_fat_comma:
611       return "token_type_fat_comma";
612     case token_type_dereference:
613       return "token_type_dereference";
614     case token_type_semicolon:
615       return "token_type_semicolon";
616     case token_type_lbrace:
617       return "token_type_lbrace";
618     case token_type_rbrace:
619       return "token_type_rbrace";
620     case token_type_lbracket:
621       return "token_type_lbracket";
622     case token_type_rbracket:
623       return "token_type_rbracket";
624     case token_type_string:
625       return "token_type_string";
626     case token_type_number:
627       return "token type number";
628     case token_type_named_op:
629       return "token_type_named_op";
630     case token_type_variable:
631       return "token_type_variable";
632     case token_type_object:
633       return "token_type_object";
634     case token_type_symbol:
635       return "token_type_symbol";
636     case token_type_regex_op:
637       return "token_type_regex_op";
638     case token_type_dot:
639       return "token_type_dot";
640     case token_type_other:
641       return "token_type_other";
642     default:
643       return "unknown";
644     }
645 }
646 #endif
647 
648 /* Free the memory pointed to by a 'struct token_ty'.  */
649 static inline void
free_token(token_ty * tp)650 free_token (token_ty *tp)
651 {
652   switch (tp->type)
653     {
654     case token_type_named_op:
655     case token_type_string:
656     case token_type_symbol:
657     case token_type_variable:
658     case token_type_object:
659       free (tp->string);
660       break;
661     default:
662       break;
663     }
664   if (tp->type == token_type_string)
665     drop_reference (tp->comment);
666   free (tp);
667 }
668 
669 /* Pass 1 of extracting quotes: Find the end of the string, regardless
670    of the semantics of the construct.  Return the complete string,
671    including the starting and the trailing delimiter, with backslashes
672    removed where appropriate.  */
673 static char *
extract_quotelike_pass1(int delim)674 extract_quotelike_pass1 (int delim)
675 {
676   /* This function is called recursively.  No way to allocate stuff
677      statically.  Also alloca() is inappropriate due to limited stack
678      size on some platforms.  So we use malloc().  */
679   int bufmax = 10;
680   char *buffer = XNMALLOC (bufmax, char);
681   int bufpos = 0;
682   bool nested = true;
683   int counter_delim;
684 
685   buffer[bufpos++] = delim;
686 
687   /* Find the closing delimiter.  */
688   switch (delim)
689     {
690     case '(':
691       counter_delim = ')';
692       break;
693     case '{':
694       counter_delim = '}';
695       break;
696     case '[':
697       counter_delim = ']';
698       break;
699     case '<':
700       counter_delim = '>';
701       break;
702     default: /* "..." or '...' or |...| etc. */
703       nested = false;
704       counter_delim = delim;
705       break;
706     }
707 
708   for (;;)
709     {
710       int c = phase1_getc ();
711 
712       /* This round can produce 1 or 2 bytes.  Ensure room for 2 bytes.  */
713       if (bufpos + 2 > bufmax)
714         {
715           bufmax = 2 * bufmax + 10;
716           buffer = xrealloc (buffer, bufmax);
717         }
718 
719       if (c == counter_delim || c == EOF)
720         {
721           buffer[bufpos++] = counter_delim; /* will be stripped off later */
722           buffer[bufpos++] = '\0';
723 #if DEBUG_PERL
724           fprintf (stderr, "PASS1: %s\n", buffer);
725 #endif
726           return buffer;
727         }
728 
729       if (nested && c == delim)
730         {
731           char *inner = extract_quotelike_pass1 (delim);
732           size_t len = strlen (inner);
733 
734           /* Ensure room for len + 1 bytes.  */
735           if (bufpos + len >= bufmax)
736             {
737               do
738                 bufmax = 2 * bufmax + 10;
739               while (bufpos + len >= bufmax);
740               buffer = xrealloc (buffer, bufmax);
741             }
742           strcpy (buffer + bufpos, inner);
743           free (inner);
744           bufpos += len;
745         }
746       else if (c == '\\')
747         {
748           c = phase1_getc ();
749           if (c == '\\')
750             {
751               buffer[bufpos++] = '\\';
752               buffer[bufpos++] = '\\';
753             }
754           else if (c == delim || c == counter_delim)
755             {
756               /* This is pass2 in Perl.  */
757               buffer[bufpos++] = c;
758             }
759           else
760             {
761               buffer[bufpos++] = '\\';
762               phase1_ungetc (c);
763             }
764         }
765       else
766         {
767           buffer[bufpos++] = c;
768         }
769     }
770 }
771 
772 /* Like extract_quotelike_pass1, but return the complete string in UTF-8
773    encoding.  */
774 static char *
extract_quotelike_pass1_utf8(int delim)775 extract_quotelike_pass1_utf8 (int delim)
776 {
777   char *string = extract_quotelike_pass1 (delim);
778   char *utf8_string =
779     from_current_source_encoding (string, lc_string, logical_file_name,
780                                   line_number);
781   if (utf8_string != string)
782     free (string);
783   return utf8_string;
784 }
785 
786 
787 /* ========= Reading of tokens and commands.  Extracting strings.  ========= */
788 
789 
790 /* Context lookup table.  */
791 static flag_context_list_table_ty *flag_context_list_table;
792 
793 
794 /* Forward declaration of local functions.  */
795 static void interpolate_keywords (message_list_ty *mlp, const char *string,
796                                   int lineno);
797 static token_ty *x_perl_lex (message_list_ty *mlp);
798 static void x_perl_unlex (token_ty *tp);
799 static bool extract_balanced (message_list_ty *mlp,
800                               token_type_ty delim, bool eat_delim,
801                               bool comma_delim,
802                               flag_context_ty outer_context,
803                               flag_context_list_iterator_ty context_iter,
804                               int arg, struct arglist_parser *argparser);
805 
806 
807 /* Extract an unsigned hexadecimal number from STRING, considering at
808    most LEN bytes and place the result in *RESULT.  Returns a pointer
809    to the first character past the hexadecimal number.  */
810 static const char *
extract_hex(const char * string,size_t len,unsigned int * result)811 extract_hex (const char *string, size_t len, unsigned int *result)
812 {
813   size_t i;
814 
815   *result = 0;
816 
817   for (i = 0; i < len; i++)
818     {
819       char c = string[i];
820       int number;
821 
822       if (c >= 'A' && c <= 'F')
823         number = c - 'A' + 10;
824       else if (c >= 'a' && c <= 'f')
825         number = c - 'a' + 10;
826       else if (c >= '0' && c <= '9')
827         number = c - '0';
828       else
829         break;
830 
831       *result <<= 4;
832       *result |= number;
833     }
834 
835   return string + i;
836 }
837 
838 /* Extract an unsigned octal number from STRING, considering at
839    most LEN bytes and place the result in *RESULT.  Returns a pointer
840    to the first character past the octal number.  */
841 static const char *
extract_oct(const char * string,size_t len,unsigned int * result)842 extract_oct (const char *string, size_t len, unsigned int *result)
843 {
844   size_t i;
845 
846   *result = 0;
847 
848   for (i = 0; i < len; i++)
849     {
850       char c = string[i];
851       int number;
852 
853       if (c >= '0' && c <= '7')
854         number = c - '0';
855       else
856         break;
857 
858       *result <<= 3;
859       *result |= number;
860     }
861 
862   return string + i;
863 }
864 
865 /* Extract the various quotelike constructs except for <<EOF.  See the
866    section "Gory details of parsing quoted constructs" in perlop.pod.
867    Return the resulting token in *tp; tp->type == token_type_string.  */
868 static void
extract_quotelike(token_ty * tp,int delim)869 extract_quotelike (token_ty *tp, int delim)
870 {
871   char *string = extract_quotelike_pass1_utf8 (delim);
872   size_t len = strlen (string);
873 
874   tp->type = token_type_string;
875   /* Take the string without the delimiters at the start and at the end.  */
876   if (!(len >= 2))
877     abort ();
878   string[len - 1] = '\0';
879   tp->string = xstrdup (string + 1);
880   free (string);
881   tp->comment = add_reference (savable_comment);
882 }
883 
884 /* Extract the quotelike constructs with double delimiters, like
885    s/[SEARCH]/[REPLACE]/.  This function does not eat up trailing
886    modifiers (left to the caller).
887    Return the resulting token in *tp; tp->type == token_type_regex_op.  */
888 static void
extract_triple_quotelike(message_list_ty * mlp,token_ty * tp,int delim,bool interpolate)889 extract_triple_quotelike (message_list_ty *mlp, token_ty *tp, int delim,
890                           bool interpolate)
891 {
892   char *string;
893 
894   tp->type = token_type_regex_op;
895 
896   string = extract_quotelike_pass1_utf8 (delim);
897   if (interpolate)
898     interpolate_keywords (mlp, string, line_number);
899   free (string);
900 
901   if (delim == '(' || delim == '<' || delim == '{' || delim == '[')
902     {
903       /* The delimiter for the second string can be different, e.g.
904          s{SEARCH}{REPLACE} or s{SEARCH}/REPLACE/.  See "man perlrequick".  */
905       delim = phase1_getc ();
906       while (is_whitespace (delim))
907         {
908           /* The hash-sign is not a valid delimiter after whitespace, ergo
909              use phase2_getc() and not phase1_getc() now.  */
910           delim = phase2_getc ();
911         }
912     }
913   string = extract_quotelike_pass1_utf8 (delim);
914   if (interpolate)
915     interpolate_keywords (mlp, string, line_number);
916   free (string);
917 }
918 
919 /* Perform pass 3 of quotelike extraction (interpolation).
920    *tp is a token of type token_type_string.
921    This function replaces tp->string.
922    This function does not access tp->comment.  */
923 /* FIXME: Currently may writes null-bytes into the string.  */
924 static void
extract_quotelike_pass3(token_ty * tp,int error_level)925 extract_quotelike_pass3 (token_ty *tp, int error_level)
926 {
927   static char *buffer;
928   static int bufmax = 0;
929   int bufpos = 0;
930   const char *crs;
931   bool uppercase;
932   bool lowercase;
933   bool quotemeta;
934 
935 #if DEBUG_PERL
936   switch (tp->sub_type)
937     {
938     case string_type_verbatim:
939       fprintf (stderr, "Interpolating string_type_verbatim:\n");
940       break;
941     case string_type_q:
942       fprintf (stderr, "Interpolating string_type_q:\n");
943       break;
944     case string_type_qq:
945       fprintf (stderr, "Interpolating string_type_qq:\n");
946       break;
947     case string_type_qr:
948       fprintf (stderr, "Interpolating string_type_qr:\n");
949       break;
950     }
951   fprintf (stderr, "%s\n", tp->string);
952   if (tp->sub_type == string_type_verbatim)
953     fprintf (stderr, "---> %s\n", tp->string);
954 #endif
955 
956   if (tp->sub_type == string_type_verbatim)
957     return;
958 
959   /* Loop over tp->string, accumulating the expansion in buffer.  */
960   crs = tp->string;
961   uppercase = false;
962   lowercase = false;
963   quotemeta = false;
964   while (*crs)
965     {
966       bool backslashed;
967 
968       /* Ensure room for 7 bytes, 6 (multi-)bytes plus a leading backslash
969          if \Q modifier is present.  */
970       if (bufpos + 7 > bufmax)
971         {
972           bufmax = 2 * bufmax + 10;
973           buffer = xrealloc (buffer, bufmax);
974         }
975 
976       if (tp->sub_type == string_type_q)
977         {
978           switch (*crs)
979             {
980             case '\\':
981               if (crs[1] == '\\')
982                 {
983                   crs += 2;
984                   buffer[bufpos++] = '\\';
985                   break;
986                 }
987               /* FALLTHROUGH */
988             default:
989               buffer[bufpos++] = *crs++;
990               break;
991             }
992           continue;
993         }
994 
995       /* We only get here for double-quoted strings or regular expressions.
996          Unescape escape sequences.  */
997       if (*crs == '\\')
998         {
999           switch (crs[1])
1000             {
1001             case 't':
1002               crs += 2;
1003               buffer[bufpos++] = '\t';
1004               continue;
1005             case 'n':
1006               crs += 2;
1007               buffer[bufpos++] = '\n';
1008               continue;
1009             case 'r':
1010               crs += 2;
1011               buffer[bufpos++] = '\r';
1012               continue;
1013             case 'f':
1014               crs += 2;
1015               buffer[bufpos++] = '\f';
1016               continue;
1017             case 'b':
1018               crs += 2;
1019               buffer[bufpos++] = '\b';
1020               continue;
1021             case 'a':
1022               crs += 2;
1023               buffer[bufpos++] = '\a';
1024               continue;
1025             case 'e':
1026               crs += 2;
1027               buffer[bufpos++] = 0x1b;
1028               continue;
1029             case '0': case '1': case '2': case '3':
1030             case '4': case '5': case '6': case '7':
1031               {
1032                 unsigned int oct_number;
1033                 int length;
1034 
1035                 crs = extract_oct (crs + 1, 3, &oct_number);
1036 
1037                 /* FIXME: If one of the variables UPPERCASE or LOWERCASE is
1038                    true, the character should be converted to its uppercase
1039                    resp. lowercase equivalent.  I don't know if the necessary
1040                    facilities are already included in gettext.  For US-Ascii
1041                    the conversion can be already be done, however.  */
1042                 if (uppercase && oct_number >= 'a' && oct_number <= 'z')
1043                   {
1044                     oct_number = oct_number - 'a' + 'A';
1045                   }
1046                 else if (lowercase && oct_number >= 'A' && oct_number <= 'Z')
1047                   {
1048                     oct_number = oct_number - 'A' + 'a';
1049                   }
1050 
1051 
1052                 /* Yes, octal escape sequences in the range 0x100..0x1ff are
1053                    valid.  */
1054                 length = u8_uctomb ((unsigned char *) (buffer + bufpos),
1055                                     oct_number, 2);
1056                 if (length > 0)
1057                   bufpos += length;
1058               }
1059               continue;
1060             case 'x':
1061               {
1062                 unsigned int hex_number = 0;
1063                 int length;
1064 
1065                 crs += 2;
1066                 if (*crs == '{')
1067                   {
1068                     const char *end = strchr (crs, '}');
1069                     if (end == NULL)
1070                       {
1071                         error_with_progname = false;
1072                         error (error_level, 0,
1073                                _("%s:%d: missing right brace on \\x{HEXNUMBER}"),
1074                                real_file_name, line_number);
1075                         error_with_progname = true;
1076                         ++crs;
1077                         continue;
1078                       }
1079                     else
1080                       {
1081                         ++crs;
1082                         (void) extract_hex (crs, end - crs, &hex_number);
1083                         crs = end + 1;
1084                       }
1085                   }
1086                 else
1087                   {
1088                     crs = extract_hex (crs, 2, &hex_number);
1089                   }
1090 
1091                 /* FIXME: If one of the variables UPPERCASE or LOWERCASE is
1092                    true, the character should be converted to its uppercase
1093                    resp. lowercase equivalent.  I don't know if the necessary
1094                    facilities are already included in gettext.  For US-Ascii
1095                    the conversion can be already be done, however.  */
1096                 if (uppercase && hex_number >= 'a' && hex_number <= 'z')
1097                   {
1098                     hex_number = hex_number - 'a' + 'A';
1099                   }
1100                 else if (lowercase && hex_number >= 'A' && hex_number <= 'Z')
1101                   {
1102                     hex_number = hex_number - 'A' + 'a';
1103                   }
1104 
1105                 length = u8_uctomb ((unsigned char *) (buffer + bufpos),
1106                                     hex_number, 6);
1107 
1108                 if (length > 0)
1109                   bufpos += length;
1110               }
1111               continue;
1112             case 'c':
1113               /* Perl's notion of control characters.  */
1114               crs += 2;
1115               if (*crs)
1116                 {
1117                   int the_char = (unsigned char) *crs;
1118                   if (the_char >= 'a' && the_char <= 'z')
1119                     the_char = the_char - 'a' + 'A';
1120                   buffer[bufpos++] = the_char ^ 0x40;
1121                 }
1122               continue;
1123             case 'N':
1124               crs += 2;
1125               if (*crs == '{')
1126                 {
1127                   const char *end = strchr (crs + 1, '}');
1128                   if (end != NULL)
1129                     {
1130                       char *name;
1131                       unsigned int unicode;
1132 
1133                       name = XNMALLOC (end - (crs + 1) + 1, char);
1134                       memcpy (name, crs + 1, end - (crs + 1));
1135                       name[end - (crs + 1)] = '\0';
1136 
1137                       unicode = unicode_name_character (name);
1138                       if (unicode != UNINAME_INVALID)
1139                         {
1140                           /* FIXME: Convert to upper/lowercase if the
1141                              corresponding flag is set to true.  */
1142                           int length =
1143                             u8_uctomb ((unsigned char *) (buffer + bufpos),
1144                                        unicode, 6);
1145                           if (length > 0)
1146                             bufpos += length;
1147                         }
1148 
1149                       free (name);
1150 
1151                       crs = end + 1;
1152                     }
1153                 }
1154               continue;
1155             }
1156         }
1157 
1158       /* No escape sequence, go on.  */
1159       if (*crs == '\\')
1160         {
1161           ++crs;
1162           switch (*crs)
1163             {
1164             case 'E':
1165               uppercase = false;
1166               lowercase = false;
1167               quotemeta = false;
1168               ++crs;
1169               continue;
1170             case 'L':
1171               uppercase = false;
1172               lowercase = true;
1173               ++crs;
1174               continue;
1175             case 'U':
1176               uppercase = true;
1177               lowercase = false;
1178               ++crs;
1179               continue;
1180             case 'Q':
1181               quotemeta = true;
1182               ++crs;
1183               continue;
1184             case 'l':
1185               ++crs;
1186               if (*crs >= 'A' && *crs <= 'Z')
1187                 {
1188                   buffer[bufpos++] = *crs - 'A' + 'a';
1189                 }
1190               else if ((unsigned char) *crs >= 0x80)
1191                 {
1192                   error_with_progname = false;
1193                   error (error_level, 0,
1194                          _("%s:%d: invalid interpolation (\"\\l\") of 8bit character \"%c\""),
1195                          real_file_name, line_number, *crs);
1196                   error_with_progname = true;
1197                 }
1198               else
1199                 {
1200                   buffer[bufpos++] = *crs;
1201                 }
1202               ++crs;
1203               continue;
1204             case 'u':
1205               ++crs;
1206               if (*crs >= 'a' && *crs <= 'z')
1207                 {
1208                   buffer[bufpos++] = *crs - 'a' + 'A';
1209                 }
1210               else if ((unsigned char) *crs >= 0x80)
1211                 {
1212                   error_with_progname = false;
1213                   error (error_level, 0,
1214                          _("%s:%d: invalid interpolation (\"\\u\") of 8bit character \"%c\""),
1215                          real_file_name, line_number, *crs);
1216                   error_with_progname = true;
1217                 }
1218               else
1219                 {
1220                   buffer[bufpos++] = *crs;
1221                 }
1222               ++crs;
1223               continue;
1224             case '\\':
1225               buffer[bufpos++] = *crs;
1226               ++crs;
1227               continue;
1228             default:
1229               backslashed = true;
1230               break;
1231             }
1232         }
1233       else
1234         backslashed = false;
1235 
1236       if (quotemeta
1237           && !((*crs >= 'A' && *crs <= 'Z') || (*crs >= 'A' && *crs <= 'z')
1238                || (*crs >= '0' && *crs <= '9') || *crs == '_'))
1239         {
1240           buffer[bufpos++] = '\\';
1241           backslashed = true;
1242         }
1243 
1244       if (!backslashed && !extract_all && (*crs == '$' || *crs == '@'))
1245         {
1246           error_with_progname = false;
1247           error (error_level, 0,
1248                  _("%s:%d: invalid variable interpolation at \"%c\""),
1249                  real_file_name, line_number, *crs);
1250           error_with_progname = true;
1251           ++crs;
1252         }
1253       else if (lowercase)
1254         {
1255           if (*crs >= 'A' && *crs <= 'Z')
1256             buffer[bufpos++] = *crs - 'A' + 'a';
1257           else if ((unsigned char) *crs >= 0x80)
1258             {
1259               error_with_progname = false;
1260               error (error_level, 0,
1261                      _("%s:%d: invalid interpolation (\"\\L\") of 8bit character \"%c\""),
1262                      real_file_name, line_number, *crs);
1263               error_with_progname = true;
1264               buffer[bufpos++] = *crs;
1265             }
1266           else
1267             buffer[bufpos++] = *crs;
1268           ++crs;
1269         }
1270       else if (uppercase)
1271         {
1272           if (*crs >= 'a' && *crs <= 'z')
1273             buffer[bufpos++] = *crs - 'a' + 'A';
1274           else if ((unsigned char) *crs >= 0x80)
1275             {
1276               error_with_progname = false;
1277               error (error_level, 0,
1278                      _("%s:%d: invalid interpolation (\"\\U\") of 8bit character \"%c\""),
1279                      real_file_name, line_number, *crs);
1280               error_with_progname = true;
1281               buffer[bufpos++] = *crs;
1282             }
1283           else
1284             buffer[bufpos++] = *crs;
1285           ++crs;
1286         }
1287       else
1288         {
1289           buffer[bufpos++] = *crs++;
1290         }
1291     }
1292 
1293   /* Ensure room for 1 more byte.  */
1294   if (bufpos >= bufmax)
1295     {
1296       bufmax = 2 * bufmax + 10;
1297       buffer = xrealloc (buffer, bufmax);
1298     }
1299 
1300   buffer[bufpos++] = '\0';
1301 
1302 #if DEBUG_PERL
1303   fprintf (stderr, "---> %s\n", buffer);
1304 #endif
1305 
1306   /* Replace tp->string.  */
1307   free (tp->string);
1308   tp->string = xstrdup (buffer);
1309 }
1310 
1311 /* Parse a variable.  This is done in several steps:
1312      1) Consume all leading occurencies of '$', '@', '%', and '*'.
1313      2) Determine the name of the variable from the following input.
1314      3) Parse possible following hash keys or array indexes.
1315  */
1316 static void
extract_variable(message_list_ty * mlp,token_ty * tp,int first)1317 extract_variable (message_list_ty *mlp, token_ty *tp, int first)
1318 {
1319   static char *buffer;
1320   static int bufmax = 0;
1321   int bufpos = 0;
1322   int c = first;
1323   size_t varbody_length = 0;
1324   bool maybe_hash_deref = false;
1325   bool maybe_hash_value = false;
1326 
1327   tp->type = token_type_variable;
1328 
1329 #if DEBUG_PERL
1330   fprintf (stderr, "%s:%d: extracting variable type '%c'\n",
1331            real_file_name, line_number, first);
1332 #endif
1333 
1334   /*
1335    * 1) Consume dollars and so on (not euros ...).  Unconditionally
1336    *    accepting the hash sign (#) will maybe lead to inaccurate
1337    *    results.  FIXME!
1338    */
1339   while (c == '$' || c == '*' || c == '#' || c == '@' || c == '%')
1340     {
1341       if (bufpos >= bufmax)
1342         {
1343           bufmax = 2 * bufmax + 10;
1344           buffer = xrealloc (buffer, bufmax);
1345         }
1346       buffer[bufpos++] = c;
1347       c = phase1_getc ();
1348     }
1349 
1350   if (c == EOF)
1351     {
1352       tp->type = token_type_eof;
1353       return;
1354     }
1355 
1356   /* Hash references are treated in a special way, when looking for
1357      our keywords.  */
1358   if (buffer[0] == '$')
1359     {
1360       if (bufpos == 1)
1361         maybe_hash_value = true;
1362       else if (bufpos == 2 && buffer[1] == '$')
1363         {
1364           if (!(c == '{'
1365                 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1366                 || (c >= '0' && c <= '9')
1367                 || c == '_' || c == ':' || c == '\'' || c >= 0x80))
1368             {
1369               /* Special variable $$ for pid.  */
1370               if (bufpos >= bufmax)
1371                 {
1372                   bufmax = 2 * bufmax + 10;
1373                   buffer = xrealloc (buffer, bufmax);
1374                 }
1375               buffer[bufpos++] = '\0';
1376               tp->string = xstrdup (buffer);
1377 #if DEBUG_PERL
1378               fprintf (stderr, "%s:%d: is PID ($$)\n",
1379                        real_file_name, line_number);
1380 #endif
1381 
1382               phase1_ungetc (c);
1383               return;
1384             }
1385 
1386           maybe_hash_deref = true;
1387           bufpos = 1;
1388         }
1389     }
1390 
1391   /*
1392    * 2) Get the name of the variable.  The first character is practically
1393    *    arbitrary.  Punctuation and numbers automagically put a variable
1394    *    in the global namespace but that subtle difference is not interesting
1395    *    for us.
1396    */
1397   if (bufpos >= bufmax)
1398     {
1399       bufmax = 2 * bufmax + 10;
1400       buffer = xrealloc (buffer, bufmax);
1401     }
1402   if (c == '{')
1403     {
1404       /* Yuck, we cannot accept ${gettext} as a keyword...  Except for
1405        * debugging purposes it is also harmless, that we suppress the
1406        * real name of the variable.
1407        */
1408 #if DEBUG_PERL
1409       fprintf (stderr, "%s:%d: braced {variable_name}\n",
1410                real_file_name, line_number);
1411 #endif
1412 
1413       if (extract_balanced (mlp, token_type_rbrace, true, false,
1414                             null_context, null_context_list_iterator,
1415                             1, arglist_parser_alloc (mlp, NULL)))
1416         {
1417           tp->type = token_type_eof;
1418           return;
1419         }
1420       buffer[bufpos++] = c;
1421     }
1422   else
1423     {
1424       while ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1425              || (c >= '0' && c <= '9')
1426              || c == '_' || c == ':' || c == '\'' || c >= 0x80)
1427         {
1428           ++varbody_length;
1429           if (bufpos >= bufmax)
1430             {
1431               bufmax = 2 * bufmax + 10;
1432               buffer = xrealloc (buffer, bufmax);
1433             }
1434           buffer[bufpos++] = c;
1435           c = phase1_getc ();
1436         }
1437       phase1_ungetc (c);
1438     }
1439 
1440   /* Probably some strange Perl variable like $`.  */
1441   if (varbody_length == 0)
1442     {
1443       c = phase1_getc ();
1444       if (c == EOF || is_whitespace (c))
1445         phase1_ungetc (c);  /* Loser.  */
1446       else
1447         {
1448           if (bufpos >= bufmax)
1449             {
1450               bufmax = 2 * bufmax + 10;
1451               buffer = xrealloc (buffer, bufmax);
1452             }
1453           buffer[bufpos++] = c;
1454         }
1455     }
1456 
1457   if (bufpos >= bufmax)
1458     {
1459       bufmax = 2 * bufmax + 10;
1460       buffer = xrealloc (buffer, bufmax);
1461     }
1462   buffer[bufpos++] = '\0';
1463 
1464   tp->string = xstrdup (buffer);
1465 
1466 #if DEBUG_PERL
1467   fprintf (stderr, "%s:%d: complete variable name: %s\n",
1468            real_file_name, line_number, tp->string);
1469 #endif
1470 
1471   /*
1472    * 3) If the following looks strange to you, this is valid Perl syntax:
1473    *
1474    *      $var = $$hashref    # We can place a
1475    *                          # comment here and then ...
1476    *             {key_into_hashref};
1477    *
1478    *    POD sections are not allowed but we leave complaints about
1479    *    that to the compiler/interpreter.
1480    */
1481   /* We only extract strings from the first hash key (if present).  */
1482 
1483   if (maybe_hash_deref || maybe_hash_value)
1484     {
1485       bool is_dereference = false;
1486       int c;
1487 
1488       do
1489         c = phase2_getc ();
1490       while (is_whitespace (c));
1491 
1492       if (c == '-')
1493         {
1494           int c2 = phase1_getc ();
1495 
1496           if (c2 == '>')
1497             {
1498               is_dereference = true;
1499 
1500               do
1501                 c = phase2_getc ();
1502               while (is_whitespace (c));
1503             }
1504           else if (c2 != '\n')
1505             {
1506               /* Discarding the newline is harmless here.  The only
1507                  special character recognized after a minus is greater-than
1508                  for dereference.  However, the sequence "-\n>" that we
1509                  treat incorrectly here, is a syntax error.  */
1510               phase1_ungetc (c2);
1511             }
1512         }
1513 
1514       if (maybe_hash_value && is_dereference)
1515         {
1516           tp->type = token_type_object;
1517 #if DEBUG_PERL
1518           fprintf (stderr, "%s:%d: first keys preceded by \"->\"\n",
1519                    real_file_name, line_number);
1520 #endif
1521         }
1522       else if (maybe_hash_value)
1523         {
1524           /* Fake it into a hash.  */
1525           tp->string[0] = '%';
1526         }
1527 
1528       /* Do NOT change that into else if (see above).  */
1529       if ((maybe_hash_value || maybe_hash_deref) && c == '{')
1530         {
1531           void *keyword_value;
1532 
1533 #if DEBUG_PERL
1534           fprintf (stderr, "%s:%d: first keys preceded by '{'\n",
1535                    real_file_name, line_number);
1536 #endif
1537 
1538           if (hash_find_entry (&keywords, tp->string, strlen (tp->string),
1539                                &keyword_value) == 0)
1540             {
1541               /* TODO: Shouldn't we use the shapes of the keyword, instead
1542                  of hardwiring argnum1 = 1 ?
1543               const struct callshapes *shapes =
1544                 (const struct callshapes *) keyword_value;
1545               */
1546               struct callshapes shapes;
1547               shapes.keyword = tp->string; /* XXX storage duration? */
1548               shapes.keyword_len = strlen (tp->string);
1549               shapes.nshapes = 1;
1550               shapes.shapes[0].argnum1 = 1;
1551               shapes.shapes[0].argnum2 = 0;
1552               shapes.shapes[0].argnumc = 0;
1553               shapes.shapes[0].argnum1_glib_context = false;
1554               shapes.shapes[0].argnum2_glib_context = false;
1555               shapes.shapes[0].argtotal = 0;
1556               string_list_init (&shapes.shapes[0].xcomments);
1557 
1558               {
1559                 /* Extract a possible string from the key.  Before proceeding
1560                    we check whether the open curly is followed by a symbol and
1561                    then by a right curly.  */
1562                 flag_context_list_iterator_ty context_iter =
1563                   flag_context_list_iterator (
1564                     flag_context_list_table_lookup (
1565                       flag_context_list_table,
1566                       tp->string, strlen (tp->string)));
1567                 token_ty *t1 = x_perl_lex (mlp);
1568 
1569 #if DEBUG_PERL
1570                 fprintf (stderr, "%s:%d: extracting string key\n",
1571                          real_file_name, line_number);
1572 #endif
1573 
1574                 if (t1->type == token_type_symbol
1575                     || t1->type == token_type_named_op)
1576                   {
1577                     token_ty *t2 = x_perl_lex (mlp);
1578                     if (t2->type == token_type_rbrace)
1579                       {
1580                         flag_context_ty context;
1581                         lex_pos_ty pos;
1582 
1583                         context =
1584                           inherited_context (null_context,
1585                                              flag_context_list_iterator_advance (
1586                                                &context_iter));
1587 
1588                         pos.line_number = line_number;
1589                         pos.file_name = logical_file_name;
1590 
1591                         remember_a_message (mlp, NULL, xstrdup (t1->string),
1592                                             true, false, context, &pos,
1593                                             NULL, savable_comment, true);
1594                         free_token (t2);
1595                         free_token (t1);
1596                       }
1597                     else
1598                       {
1599                         x_perl_unlex (t2);
1600                       }
1601                   }
1602                 else
1603                   {
1604                     x_perl_unlex (t1);
1605                     if (extract_balanced (mlp, token_type_rbrace, true, false,
1606                                           null_context, context_iter,
1607                                           1, arglist_parser_alloc (mlp, &shapes)))
1608                       return;
1609                   }
1610               }
1611             }
1612           else
1613             {
1614               phase2_ungetc (c);
1615             }
1616         }
1617       else
1618         {
1619           phase2_ungetc (c);
1620         }
1621     }
1622 
1623   /* Now consume "->", "[...]", and "{...}".  */
1624   for (;;)
1625     {
1626       int c = phase2_getc ();
1627       int c2;
1628 
1629       switch (c)
1630         {
1631         case '{':
1632 #if DEBUG_PERL
1633           fprintf (stderr, "%s:%d: extracting balanced '{' after varname\n",
1634                    real_file_name, line_number);
1635 #endif
1636           extract_balanced (mlp, token_type_rbrace, true, false,
1637                             null_context, null_context_list_iterator,
1638                             1, arglist_parser_alloc (mlp, NULL));
1639           break;
1640 
1641         case '[':
1642 #if DEBUG_PERL
1643           fprintf (stderr, "%s:%d: extracting balanced '[' after varname\n",
1644                    real_file_name, line_number);
1645 #endif
1646           extract_balanced (mlp, token_type_rbracket, true, false,
1647                             null_context, null_context_list_iterator,
1648                             1, arglist_parser_alloc (mlp, NULL));
1649           break;
1650 
1651         case '-':
1652           c2 = phase1_getc ();
1653           if (c2 == '>')
1654             {
1655 #if DEBUG_PERL
1656               fprintf (stderr, "%s:%d: another \"->\" after varname\n",
1657                        real_file_name, line_number);
1658 #endif
1659               break;
1660             }
1661           else if (c2 != '\n')
1662             {
1663               /* Discarding the newline is harmless here.  The only
1664                  special character recognized after a minus is greater-than
1665                  for dereference.  However, the sequence "-\n>" that we
1666                  treat incorrectly here, is a syntax error.  */
1667               phase1_ungetc (c2);
1668             }
1669           /* FALLTHROUGH */
1670 
1671         default:
1672 #if DEBUG_PERL
1673           fprintf (stderr, "%s:%d: variable finished\n",
1674                    real_file_name, line_number);
1675 #endif
1676           phase2_ungetc (c);
1677           return;
1678         }
1679     }
1680 }
1681 
1682 /* Actually a simplified version of extract_variable().  It searches for
1683    variables inside a double-quoted string that may interpolate to
1684    some keyword hash (reference).  The string is UTF-8 encoded.  */
1685 static void
interpolate_keywords(message_list_ty * mlp,const char * string,int lineno)1686 interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
1687 {
1688   static char *buffer;
1689   static int bufmax = 0;
1690   int bufpos = 0;
1691   flag_context_ty context;
1692   int c;
1693   bool maybe_hash_deref = false;
1694   enum parser_state
1695     {
1696       initial,
1697       one_dollar,
1698       two_dollars,
1699       identifier,
1700       minus,
1701       wait_lbrace,
1702       wait_quote,
1703       dquote,
1704       squote,
1705       barekey,
1706       wait_rbrace
1707     } state;
1708   token_ty token;
1709 
1710   lex_pos_ty pos;
1711 
1712   /* States are:
1713    *
1714    * initial:      initial
1715    * one_dollar:   dollar sign seen in state INITIAL
1716    * two_dollars:  another dollar-sign has been seen in state ONE_DOLLAR
1717    * identifier:   a valid identifier character has been seen in state
1718    *               ONE_DOLLAR or TWO_DOLLARS
1719    * minus:        a minus-sign has been seen in state IDENTIFIER
1720    * wait_lbrace:  a greater-than has been seen in state MINUS
1721    * wait_quote:   a left brace has been seen in state IDENTIFIER or in
1722    *               state WAIT_LBRACE
1723    * dquote:       a double-quote has been seen in state WAIT_QUOTE
1724    * squote:       a single-quote has been seen in state WAIT_QUOTE
1725    * barekey:      an bareword character has been seen in state WAIT_QUOTE
1726    * wait_rbrace:  closing quote has been seen in state DQUOTE or SQUOTE
1727    *
1728    * In the states initial...identifier the context is null_context; in the
1729    * states minus...wait_rbrace the context is the one suitable for the first
1730    * argument of the last seen identifier.
1731    */
1732   state = initial;
1733   context = null_context;
1734 
1735   token.type = token_type_string;
1736   token.sub_type = string_type_qq;
1737   token.line_number = line_number;
1738   /* No need for  token.comment = add_reference (savable_comment);  here.
1739      We can let token.comment uninitialized here, and use savable_comment
1740      directly, because this function only parses the given string and does
1741      not call phase2_getc.  */
1742   pos.file_name = logical_file_name;
1743   pos.line_number = lineno;
1744 
1745   while ((c = (unsigned char) *string++) != '\0')
1746     {
1747       void *keyword_value;
1748 
1749       if (state == initial)
1750         bufpos = 0;
1751 
1752       if (c == '\n')
1753         lineno++;
1754 
1755       if (bufpos + 1 >= bufmax)
1756         {
1757           bufmax = 2 * bufmax + 10;
1758           buffer = xrealloc (buffer, bufmax);
1759         }
1760 
1761       switch (state)
1762         {
1763         case initial:
1764           switch (c)
1765             {
1766             case '\\':
1767               c = (unsigned char) *string++;
1768               if (c == '\0')
1769                 return;
1770               break;
1771             case '$':
1772               buffer[bufpos++] = '$';
1773               maybe_hash_deref = false;
1774               state = one_dollar;
1775               break;
1776             default:
1777               break;
1778             }
1779           break;
1780         case one_dollar:
1781           switch (c)
1782             {
1783             case '$':
1784               /*
1785                * This is enough to make us believe later that we dereference
1786                * a hash reference.
1787                */
1788               maybe_hash_deref = true;
1789               state = two_dollars;
1790               break;
1791             default:
1792               if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1793                   || (c >= 'A' && c <= 'Z')
1794                   || (c >= 'a' && c <= 'z')
1795                   || (c >= '0' && c <= '9'))
1796                 {
1797                   buffer[bufpos++] = c;
1798                   state = identifier;
1799                 }
1800               else
1801                 state = initial;
1802               break;
1803             }
1804           break;
1805         case two_dollars:
1806           if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1807               || (c >= 'A' && c <= 'Z')
1808               || (c >= 'a' && c <= 'z')
1809               || (c >= '0' && c <= '9'))
1810             {
1811               buffer[bufpos++] = c;
1812               state = identifier;
1813             }
1814           else
1815             state = initial;
1816           break;
1817         case identifier:
1818           switch (c)
1819             {
1820             case '-':
1821               if (hash_find_entry (&keywords, buffer, bufpos, &keyword_value)
1822                   == 0)
1823                 {
1824                   flag_context_list_iterator_ty context_iter =
1825                     flag_context_list_iterator (
1826                       flag_context_list_table_lookup (
1827                         flag_context_list_table,
1828                         buffer, bufpos));
1829                   context =
1830                     inherited_context (null_context,
1831                                        flag_context_list_iterator_advance (
1832                                          &context_iter));
1833                   state = minus;
1834                 }
1835               else
1836                 state = initial;
1837               break;
1838             case '{':
1839               if (!maybe_hash_deref)
1840                 buffer[0] = '%';
1841               if (hash_find_entry (&keywords, buffer, bufpos, &keyword_value)
1842                   == 0)
1843                 {
1844                   flag_context_list_iterator_ty context_iter =
1845                     flag_context_list_iterator (
1846                       flag_context_list_table_lookup (
1847                         flag_context_list_table,
1848                         buffer, bufpos));
1849                   context =
1850                     inherited_context (null_context,
1851                                        flag_context_list_iterator_advance (
1852                                          &context_iter));
1853                   state = wait_quote;
1854                 }
1855               else
1856                 state = initial;
1857               break;
1858             default:
1859               if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1860                   || (c >= 'A' && c <= 'Z')
1861                   || (c >= 'a' && c <= 'z')
1862                   || (c >= '0' && c <= '9'))
1863                 {
1864                   buffer[bufpos++] = c;
1865                 }
1866               else
1867                 state = initial;
1868               break;
1869             }
1870           break;
1871         case minus:
1872           switch (c)
1873             {
1874             case '>':
1875               state = wait_lbrace;
1876               break;
1877             default:
1878               context = null_context;
1879               state = initial;
1880               break;
1881             }
1882           break;
1883         case wait_lbrace:
1884           switch (c)
1885             {
1886             case '{':
1887               state = wait_quote;
1888               break;
1889             default:
1890               context = null_context;
1891               state = initial;
1892               break;
1893             }
1894           break;
1895         case wait_quote:
1896           switch (c)
1897             {
1898             case_whitespace:
1899               break;
1900             case '\'':
1901               pos.line_number = lineno;
1902               bufpos = 0;
1903               state = squote;
1904               break;
1905             case '"':
1906               pos.line_number = lineno;
1907               bufpos = 0;
1908               state = dquote;
1909               break;
1910             default:
1911               if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
1912                   || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
1913                 {
1914                   pos.line_number = lineno;
1915                   bufpos = 0;
1916                   buffer[bufpos++] = c;
1917                   state = barekey;
1918                 }
1919               else
1920                 {
1921                   context = null_context;
1922                   state = initial;
1923                 }
1924               break;
1925             }
1926           break;
1927         case dquote:
1928           switch (c)
1929             {
1930             case '"':
1931               /* The resulting string has to be interpolated twice.  */
1932               buffer[bufpos] = '\0';
1933               token.string = xstrdup (buffer);
1934               extract_quotelike_pass3 (&token, EXIT_FAILURE);
1935               /* The string can only shrink with interpolation (because
1936                  we ignore \Q).  */
1937               if (!(strlen (token.string) <= bufpos))
1938                 abort ();
1939               strcpy (buffer, token.string);
1940               free (token.string);
1941               state = wait_rbrace;
1942               break;
1943             case '\\':
1944               if (string[0] == '\"')
1945                 {
1946                   buffer[bufpos++] = string++[0];
1947                 }
1948               else if (string[0])
1949                 {
1950                   buffer[bufpos++] = '\\';
1951                   buffer[bufpos++] = string++[0];
1952                 }
1953               else
1954                 {
1955                   context = null_context;
1956                   state = initial;
1957                 }
1958               break;
1959             default:
1960               buffer[bufpos++] = c;
1961               break;
1962             }
1963           break;
1964         case squote:
1965           switch (c)
1966             {
1967             case '\'':
1968               state = wait_rbrace;
1969               break;
1970             case '\\':
1971               if (string[0] == '\'')
1972                 {
1973                   buffer[bufpos++] = string++[0];
1974                 }
1975               else if (string[0])
1976                 {
1977                   buffer[bufpos++] = '\\';
1978                   buffer[bufpos++] = string++[0];
1979                 }
1980               else
1981                 {
1982                   context = null_context;
1983                   state = initial;
1984                 }
1985               break;
1986             default:
1987               buffer[bufpos++] = c;
1988               break;
1989             }
1990           break;
1991         case barekey:
1992           if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
1993               || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
1994             {
1995               buffer[bufpos++] = c;
1996               break;
1997             }
1998           else if (is_whitespace (c))
1999             {
2000               state = wait_rbrace;
2001               break;
2002             }
2003           else if (c != '}')
2004             {
2005               context = null_context;
2006               state = initial;
2007               break;
2008             }
2009           /* Must be right brace.  */
2010           /* FALLTHROUGH */
2011         case wait_rbrace:
2012           switch (c)
2013             {
2014             case_whitespace:
2015               break;
2016             case '}':
2017               buffer[bufpos] = '\0';
2018               token.string = xstrdup (buffer);
2019               extract_quotelike_pass3 (&token, EXIT_FAILURE);
2020               remember_a_message (mlp, NULL, token.string, true, false, context,
2021                                   &pos, NULL, savable_comment, true);
2022               /* FALLTHROUGH */
2023             default:
2024               context = null_context;
2025               state = initial;
2026               break;
2027             }
2028           break;
2029         }
2030     }
2031 }
2032 
2033 /* There is an ambiguity about '/' and '?': They can start an operator
2034    (division operator '/' or '/=' or the conditional operator '?'), or they can
2035    start a regular expression.  The distinction is important because inside
2036    regular expressions, '#' loses its special meaning.  This function helps
2037    making the decision (a heuristic).  See the documentation for details.  */
2038 static bool
prefer_regexp_over_division(token_type_ty type)2039 prefer_regexp_over_division (token_type_ty type)
2040 {
2041   bool retval = true;
2042 
2043   switch (type)
2044     {
2045       case token_type_eof:
2046         retval = true;
2047         break;
2048       case token_type_lparen:
2049         retval = true;
2050         break;
2051       case token_type_rparen:
2052         retval = false;
2053         break;
2054       case token_type_comma:
2055         retval = true;
2056         break;
2057       case token_type_fat_comma:
2058         retval = true;
2059         break;
2060       case token_type_dereference:
2061         retval = true;
2062         break;
2063       case token_type_semicolon:
2064         retval = true;
2065         break;
2066       case token_type_lbrace:
2067         retval = true;
2068         break;
2069       case token_type_rbrace:
2070         retval = false;
2071         break;
2072       case token_type_lbracket:
2073         retval = true;
2074         break;
2075       case token_type_rbracket:
2076         retval = false;
2077         break;
2078       case token_type_string:
2079         retval = false;
2080         break;
2081       case token_type_number:
2082         retval = false;
2083         break;
2084       case token_type_named_op:
2085         retval = true;
2086         break;
2087       case token_type_variable:
2088         retval = false;
2089         break;
2090       case token_type_object:
2091         retval = false;
2092         break;
2093       case token_type_symbol:
2094       case token_type_keyword_symbol:
2095         retval = true;
2096         break;
2097       case token_type_regex_op:
2098         retval = false;
2099         break;
2100       case token_type_dot:
2101         retval = true;
2102         break;
2103       case token_type_other:
2104         retval = true;
2105         break;
2106   }
2107 
2108 #if DEBUG_PERL
2109   token_ty ty;
2110   ty.type = type;
2111   fprintf (stderr, "Prefer regexp over division after %s: %s\n",
2112            token2string (&ty), retval ? "true" : "false");
2113 #endif
2114 
2115   return retval;
2116 }
2117 
2118 /* Last token type seen in the stream.  Important for the interpretation
2119    of slash and question mark.  */
2120 static token_type_ty last_token_type;
2121 
2122 /* Combine characters into tokens.  Discard whitespace.  */
2123 
2124 static void
x_perl_prelex(message_list_ty * mlp,token_ty * tp)2125 x_perl_prelex (message_list_ty *mlp, token_ty *tp)
2126 {
2127   static char *buffer;
2128   static int bufmax;
2129   int bufpos;
2130   int c;
2131 
2132   for (;;)
2133     {
2134       c = phase2_getc ();
2135       tp->line_number = line_number;
2136       tp->last_type = last_token_type;
2137 
2138       switch (c)
2139         {
2140         case EOF:
2141           tp->type = token_type_eof;
2142           return;
2143 
2144         case '\n':
2145           if (last_non_comment_line > last_comment_line)
2146             savable_comment_reset ();
2147           /* FALLTHROUGH */
2148         case '\t':
2149         case ' ':
2150           /* Ignore whitespace.  */
2151           continue;
2152 
2153         case '%':
2154         case '@':
2155         case '*':
2156         case '$':
2157           if (!extract_all)
2158             {
2159               extract_variable (mlp, tp, c);
2160               return;
2161             }
2162           break;
2163         }
2164 
2165       last_non_comment_line = tp->line_number;
2166 
2167       switch (c)
2168         {
2169         case '.':
2170           {
2171             int c2 = phase1_getc ();
2172             phase1_ungetc (c2);
2173             if (c2 == '.')
2174               {
2175                 tp->type = token_type_other;
2176                 return;
2177               }
2178             else if (!(c2 >= '0' && c2 <= '9'))
2179               {
2180                 tp->type = token_type_dot;
2181                 return;
2182               }
2183           }
2184           /* FALLTHROUGH */
2185         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2186         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
2187         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
2188         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
2189         case 'Y': case 'Z':
2190         case '_':
2191         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2192         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2193         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2194         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
2195         case 'y': case 'z':
2196         case '0': case '1': case '2': case '3': case '4':
2197         case '5': case '6': case '7': case '8': case '9':
2198           /* Symbol, or part of a number.  */
2199           bufpos = 0;
2200           for (;;)
2201             {
2202               if (bufpos >= bufmax)
2203                 {
2204                   bufmax = 2 * bufmax + 10;
2205                   buffer = xrealloc (buffer, bufmax);
2206                 }
2207               buffer[bufpos++] = c;
2208               c = phase1_getc ();
2209               switch (c)
2210                 {
2211                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2212                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
2213                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
2214                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
2215                 case 'Y': case 'Z':
2216                 case '_':
2217                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2218                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2219                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2220                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
2221                 case 'y': case 'z':
2222                 case '0': case '1': case '2': case '3': case '4':
2223                 case '5': case '6': case '7': case '8': case '9':
2224                   continue;
2225 
2226                 default:
2227                   phase1_ungetc (c);
2228                   break;
2229                 }
2230               break;
2231             }
2232           if (bufpos >= bufmax)
2233             {
2234               bufmax = 2 * bufmax + 10;
2235               buffer = xrealloc (buffer, bufmax);
2236             }
2237           buffer[bufpos] = '\0';
2238 
2239           if (strcmp (buffer, "__END__") == 0
2240               || strcmp (buffer, "__DATA__") == 0)
2241             {
2242               end_of_file = true;
2243               tp->type = token_type_eof;
2244               return;
2245             }
2246           else if (strcmp (buffer, "and") == 0
2247                    || strcmp (buffer, "cmp") == 0
2248                    || strcmp (buffer, "eq") == 0
2249                    || strcmp (buffer, "if") == 0
2250                    || strcmp (buffer, "ge") == 0
2251                    || strcmp (buffer, "gt") == 0
2252                    || strcmp (buffer, "le") == 0
2253                    || strcmp (buffer, "lt") == 0
2254                    || strcmp (buffer, "ne") == 0
2255                    || strcmp (buffer, "not") == 0
2256                    || strcmp (buffer, "or") == 0
2257                    || strcmp (buffer, "unless") == 0
2258                    || strcmp (buffer, "while") == 0
2259                    || strcmp (buffer, "xor") == 0)
2260             {
2261               tp->type = token_type_named_op;
2262               tp->string = xstrdup (buffer);
2263               return;
2264             }
2265           else if (strcmp (buffer, "s") == 0
2266                  || strcmp (buffer, "y") == 0
2267                  || strcmp (buffer, "tr") == 0)
2268             {
2269               int delim = phase1_getc ();
2270 
2271               while (is_whitespace (delim))
2272                 delim = phase2_getc ();
2273 
2274               if (delim == EOF)
2275                 {
2276                   tp->type = token_type_eof;
2277                   return;
2278                 }
2279               if ((delim >= '0' && delim <= '9')
2280                   || (delim >= 'A' && delim <= 'Z')
2281                   || (delim >= 'a' && delim <= 'z'))
2282                 {
2283                   /* False positive.  */
2284                   phase2_ungetc (delim);
2285                   tp->type = token_type_symbol;
2286                   tp->sub_type = symbol_type_none;
2287                   tp->string = xstrdup (buffer);
2288                   return;
2289                 }
2290               extract_triple_quotelike (mlp, tp, delim,
2291                                         buffer[0] == 's' && delim != '\'');
2292 
2293               /* Eat the following modifiers.  */
2294               do
2295                 c = phase1_getc ();
2296               while (c >= 'a' && c <= 'z');
2297               phase1_ungetc (c);
2298               return;
2299             }
2300           else if (strcmp (buffer, "m") == 0)
2301             {
2302               int delim = phase1_getc ();
2303 
2304               while (is_whitespace (delim))
2305                 delim = phase2_getc ();
2306 
2307               if (delim == EOF)
2308                 {
2309                   tp->type = token_type_eof;
2310                   return;
2311                 }
2312               if ((delim >= '0' && delim <= '9')
2313                   || (delim >= 'A' && delim <= 'Z')
2314                   || (delim >= 'a' && delim <= 'z'))
2315                 {
2316                   /* False positive.  */
2317                   phase2_ungetc (delim);
2318                   tp->type = token_type_symbol;
2319                   tp->sub_type = symbol_type_none;
2320                   tp->string = xstrdup (buffer);
2321                   return;
2322                 }
2323               extract_quotelike (tp, delim);
2324               if (delim != '\'')
2325                 interpolate_keywords (mlp, tp->string, line_number);
2326               free (tp->string);
2327               drop_reference (tp->comment);
2328               tp->type = token_type_regex_op;
2329 
2330               /* Eat the following modifiers.  */
2331               do
2332                 c = phase1_getc ();
2333               while (c >= 'a' && c <= 'z');
2334               phase1_ungetc (c);
2335               return;
2336             }
2337           else if (strcmp (buffer, "qq") == 0
2338                    || strcmp (buffer, "q") == 0
2339                    || strcmp (buffer, "qx") == 0
2340                    || strcmp (buffer, "qw") == 0
2341                    || strcmp (buffer, "qr") == 0)
2342             {
2343               /* The qw (...) construct is not really a string but we
2344                  can treat in the same manner and then pretend it is
2345                  a symbol.  Rationale: Saying "qw (foo bar)" is the
2346                  same as "my @list = ('foo', 'bar'); @list;".  */
2347 
2348               int delim = phase1_getc ();
2349 
2350               while (is_whitespace (delim))
2351                 delim = phase2_getc ();
2352 
2353               if (delim == EOF)
2354                 {
2355                   tp->type = token_type_eof;
2356                   return;
2357                 }
2358 
2359               if ((delim >= '0' && delim <= '9')
2360                   || (delim >= 'A' && delim <= 'Z')
2361                   || (delim >= 'a' && delim <= 'z'))
2362                 {
2363                   /* False positive.  */
2364                   phase2_ungetc (delim);
2365                   tp->type = token_type_symbol;
2366                   tp->sub_type = symbol_type_none;
2367                   tp->string = xstrdup (buffer);
2368                   return;
2369                 }
2370 
2371               extract_quotelike (tp, delim);
2372 
2373               switch (buffer[1])
2374                 {
2375                 case 'q':
2376                 case 'x':
2377                   tp->type = token_type_string;
2378                   tp->sub_type = string_type_qq;
2379                   interpolate_keywords (mlp, tp->string, line_number);
2380                   break;
2381                 case 'r':
2382                   drop_reference (tp->comment);
2383                   tp->type = token_type_regex_op;
2384                   break;
2385                 case 'w':
2386                   drop_reference (tp->comment);
2387                   tp->type = token_type_symbol;
2388                   tp->sub_type = symbol_type_none;
2389                   break;
2390                 case '\0':
2391                   tp->type = token_type_string;
2392                   tp->sub_type = string_type_q;
2393                   break;
2394                 default:
2395                   abort ();
2396                 }
2397               return;
2398             }
2399           else if ((buffer[0] >= '0' && buffer[0] <= '9') || buffer[0] == '.')
2400             {
2401               tp->type = token_type_number;
2402               return;
2403             }
2404           tp->type = token_type_symbol;
2405           tp->sub_type = (strcmp (buffer, "sub") == 0
2406                           ? symbol_type_sub
2407                           : symbol_type_none);
2408           tp->string = xstrdup (buffer);
2409           return;
2410 
2411         case '"':
2412           extract_quotelike (tp, c);
2413           tp->sub_type = string_type_qq;
2414           interpolate_keywords (mlp, tp->string, line_number);
2415           return;
2416 
2417         case '`':
2418           extract_quotelike (tp, c);
2419           tp->sub_type = string_type_qq;
2420           interpolate_keywords (mlp, tp->string, line_number);
2421           return;
2422 
2423         case '\'':
2424           extract_quotelike (tp, c);
2425           tp->sub_type = string_type_q;
2426           return;
2427 
2428         case '(':
2429           tp->type = token_type_lparen;
2430           return;
2431 
2432         case ')':
2433           tp->type = token_type_rparen;
2434           return;
2435 
2436         case '{':
2437           tp->type = token_type_lbrace;
2438           return;
2439 
2440         case '}':
2441           tp->type = token_type_rbrace;
2442           return;
2443 
2444         case '[':
2445           tp->type = token_type_lbracket;
2446           return;
2447 
2448         case ']':
2449           tp->type = token_type_rbracket;
2450           return;
2451 
2452         case ';':
2453           tp->type = token_type_semicolon;
2454           return;
2455 
2456         case ',':
2457           tp->type = token_type_comma;
2458           return;
2459 
2460         case '=':
2461           /* Check for fat comma.  */
2462           c = phase1_getc ();
2463           if (c == '>')
2464             {
2465               tp->type = token_type_fat_comma;
2466               return;
2467             }
2468           else if (linepos == 2
2469                    && (last_token_type == token_type_semicolon
2470                        || last_token_type == token_type_rbrace)
2471                    && ((c >= 'A' && c <='Z')
2472                        || (c >= 'a' && c <= 'z')))
2473             {
2474 #if DEBUG_PERL
2475               fprintf (stderr, "%s:%d: start pod section\n",
2476                        real_file_name, line_number);
2477 #endif
2478               skip_pod ();
2479 #if DEBUG_PERL
2480               fprintf (stderr, "%s:%d: end pod section\n",
2481                        real_file_name, line_number);
2482 #endif
2483               continue;
2484             }
2485           phase1_ungetc (c);
2486           tp->type = token_type_other;
2487           return;
2488 
2489         case '<':
2490           /* Check for <<EOF and friends.  */
2491           c = phase1_getc ();
2492           if (c == '<')
2493             {
2494               c = phase1_getc ();
2495               if (c == '\'')
2496                 {
2497                   char *string;
2498                   extract_quotelike (tp, c);
2499                   string = get_here_document (tp->string);
2500                   free (tp->string);
2501                   tp->string = string;
2502                   tp->type = token_type_string;
2503                   tp->sub_type = string_type_verbatim;
2504                   tp->line_number = line_number + 1;
2505                   return;
2506                 }
2507               else if (c == '"')
2508                 {
2509                   char *string;
2510                   extract_quotelike (tp, c);
2511                   string = get_here_document (tp->string);
2512                   free (tp->string);
2513                   tp->string = string;
2514                   tp->type = token_type_string;
2515                   tp->sub_type = string_type_qq;
2516                   tp->line_number = line_number + 1;
2517                   interpolate_keywords (mlp, tp->string, tp->line_number);
2518                   return;
2519                 }
2520               else if ((c >= 'A' && c <= 'Z')
2521                        || (c >= 'a' && c <= 'z')
2522                        || c == '_')
2523                 {
2524                   bufpos = 0;
2525                   while ((c >= 'A' && c <= 'Z')
2526                          || (c >= 'a' && c <= 'z')
2527                          || (c >= '0' && c <= '9')
2528                          || c == '_' || c >= 0x80)
2529                     {
2530                       if (bufpos >= bufmax)
2531                         {
2532                           bufmax = 2 * bufmax + 10;
2533                           buffer = xrealloc (buffer, bufmax);
2534                         }
2535                       buffer[bufpos++] = c;
2536                       c = phase1_getc ();
2537                     }
2538                   if (c == EOF)
2539                     {
2540                       tp->type = token_type_eof;
2541                       return;
2542                     }
2543                   else
2544                     {
2545                       char *string;
2546                       phase1_ungetc (c);
2547                       if (bufpos >= bufmax)
2548                         {
2549                           bufmax = 2 * bufmax + 10;
2550                           buffer = xrealloc (buffer, bufmax);
2551                         }
2552                       buffer[bufpos++] = '\0';
2553                       string = get_here_document (buffer);
2554                       tp->string = string;
2555                       tp->type = token_type_string;
2556                       tp->sub_type = string_type_qq;
2557                       tp->comment = add_reference (savable_comment);
2558                       tp->line_number = line_number + 1;
2559                       interpolate_keywords (mlp, tp->string, tp->line_number);
2560                       return;
2561                     }
2562                 }
2563               else
2564                 {
2565                   tp->type = token_type_other;
2566                   return;
2567                 }
2568             }
2569           else
2570             {
2571               phase1_ungetc (c);
2572               tp->type = token_type_other;
2573             }
2574           return;  /* End of case '>'.  */
2575 
2576         case '-':
2577           /* Check for dereferencing operator.  */
2578           c = phase1_getc ();
2579           if (c == '>')
2580             {
2581               tp->type = token_type_dereference;
2582               return;
2583             }
2584           else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
2585             {
2586               /* One of the -X (filetest) functions.  We play safe
2587                  and accept all alphabetical characters here.  */
2588               tp->type = token_type_other;
2589               return;
2590             }
2591           phase1_ungetc (c);
2592           tp->type = token_type_other;
2593           return;
2594 
2595         case '/':
2596         case '?':
2597           if (prefer_regexp_over_division (tp->last_type))
2598             {
2599               extract_quotelike (tp, c);
2600               interpolate_keywords (mlp, tp->string, line_number);
2601               free (tp->string);
2602               drop_reference (tp->comment);
2603               tp->type = token_type_regex_op;
2604               /* Eat the following modifiers.  */
2605               do
2606                 c = phase1_getc ();
2607               while (c >= 'a' && c <= 'z');
2608               phase1_ungetc (c);
2609               return;
2610             }
2611           /* Recognize operator '//'.  */
2612           if (c == '/')
2613             {
2614               c = phase1_getc ();
2615               if (c != '/')
2616                 phase1_ungetc (c);
2617             }
2618           /* FALLTHROUGH */
2619 
2620         default:
2621           /* We could carefully recognize each of the 2 and 3 character
2622              operators, but it is not necessary, except for the '//' operator,
2623              as we only need to recognize gettext invocations.  Don't
2624              bother.  */
2625           tp->type = token_type_other;
2626           return;
2627         }
2628     }
2629 }
2630 
2631 
2632 /* A token stack used as a lookahead buffer.  */
2633 
2634 typedef struct token_stack_ty token_stack_ty;
2635 struct token_stack_ty
2636 {
2637   token_ty **items;
2638   size_t nitems;
2639   size_t nitems_max;
2640 };
2641 
2642 static struct token_stack_ty token_stack;
2643 
2644 #if DEBUG_PERL
2645 /* Dumps all resources allocated by stack STACK.  */
2646 static int
token_stack_dump(token_stack_ty * stack)2647 token_stack_dump (token_stack_ty *stack)
2648 {
2649   size_t i;
2650 
2651   fprintf (stderr, "BEGIN STACK DUMP\n");
2652   for (i = 0; i < stack->nitems; i++)
2653     {
2654       token_ty *token = stack->items[i];
2655       fprintf (stderr, "  [%s]\n", token2string (token));
2656       switch (token->type)
2657         {
2658         case token_type_named_op:
2659         case token_type_string:
2660         case token_type_symbol:
2661         case token_type_variable:
2662           fprintf (stderr, "    string: %s\n", token->string);
2663           break;
2664         case token_type_object:
2665           fprintf (stderr, "    string: %s->\n", token->string);
2666         default:
2667           break;
2668         }
2669     }
2670   fprintf (stderr, "END STACK DUMP\n");
2671   return 0;
2672 }
2673 #endif
2674 
2675 /* Pushes the token TOKEN onto the stack STACK.  */
2676 static inline void
token_stack_push(token_stack_ty * stack,token_ty * token)2677 token_stack_push (token_stack_ty *stack, token_ty *token)
2678 {
2679   if (stack->nitems >= stack->nitems_max)
2680     {
2681       size_t nbytes;
2682 
2683       stack->nitems_max = 2 * stack->nitems_max + 4;
2684       nbytes = stack->nitems_max * sizeof (token_ty *);
2685       stack->items = xrealloc (stack->items, nbytes);
2686     }
2687   stack->items[stack->nitems++] = token;
2688 }
2689 
2690 /* Pops the most recently pushed token from the stack STACK and returns it.
2691    Returns NULL if the stack is empty.  */
2692 static inline token_ty *
token_stack_pop(token_stack_ty * stack)2693 token_stack_pop (token_stack_ty *stack)
2694 {
2695   if (stack->nitems > 0)
2696     return stack->items[--(stack->nitems)];
2697   else
2698     return NULL;
2699 }
2700 
2701 /* Return the top of the stack without removing it from the stack, or
2702    NULL if the stack is empty.  */
2703 static inline token_ty *
token_stack_peek(const token_stack_ty * stack)2704 token_stack_peek (const token_stack_ty *stack)
2705 {
2706   if (stack->nitems > 0)
2707     return stack->items[stack->nitems - 1];
2708   else
2709     return NULL;
2710 }
2711 
2712 /* Frees all resources allocated by stack STACK.  */
2713 static inline void
token_stack_free(token_stack_ty * stack)2714 token_stack_free (token_stack_ty *stack)
2715 {
2716   size_t i;
2717 
2718   for (i = 0; i < stack->nitems; i++)
2719     free_token (stack->items[i]);
2720   free (stack->items);
2721 }
2722 
2723 
2724 static token_ty *
x_perl_lex(message_list_ty * mlp)2725 x_perl_lex (message_list_ty *mlp)
2726 {
2727 #if DEBUG_PERL
2728   int dummy = token_stack_dump (&token_stack);
2729 #endif
2730   token_ty *tp = token_stack_pop (&token_stack);
2731 
2732   if (!tp)
2733     {
2734       tp = XMALLOC (token_ty);
2735       x_perl_prelex (mlp, tp);
2736       tp->last_type = last_token_type;
2737       last_token_type = tp->type;
2738 
2739 #if DEBUG_PERL
2740       fprintf (stderr, "%s:%d: x_perl_prelex returned %s\n",
2741                real_file_name, line_number, token2string (tp));
2742 #endif
2743 
2744       /* The interpretation of a slash or question mark after a function call
2745          depends on the prototype of that function.  If the function expects
2746          at least one argument, a regular expression is preferred, otherwise
2747          an operator.  With our limited means, we can only guess here.  If
2748          the function is a builtin that takes no arguments, we prefer an
2749          operator by silently turning the last symbol into a variable instead
2750          of a symbol.
2751 
2752          Method calls without parentheses are not ambiguous.  After them, an
2753          operator must follow.  Due to some ideosyncrasies in this parser
2754          they are treated in two different manners.  If the call is
2755          chained ($foo->bar->baz) the token left of the symbol is a
2756          dereference operator.  If it is not chained ($foo->bar) the
2757          dereference operator is consumed with the extracted variable.  The
2758          latter case is handled below.  */
2759       if (tp->type == token_type_symbol)
2760         {
2761           if (tp->last_type == token_type_dereference)
2762             {
2763               /* Class method call or chained method call (with at least
2764                  two arrow operators).  */
2765               last_token_type = token_type_variable;
2766             }
2767           else if (tp->last_type == token_type_object)
2768             {
2769               /* Instance method, not chained.  */
2770               last_token_type = token_type_variable;
2771             }
2772           else if (strcmp (tp->string, "wantarray") == 0
2773                    || strcmp (tp->string, "fork") == 0
2774                    || strcmp (tp->string, "getlogin") == 0
2775                    || strcmp (tp->string, "getppid") == 0
2776                    || strcmp (tp->string, "getpwent") == 0
2777                    || strcmp (tp->string, "getgrent") == 0
2778                    || strcmp (tp->string, "gethostent") == 0
2779                    || strcmp (tp->string, "getnetent") == 0
2780                    || strcmp (tp->string, "getprotoent") == 0
2781                    || strcmp (tp->string, "getservent") == 0
2782                    || strcmp (tp->string, "setpwent") == 0
2783                    || strcmp (tp->string, "setgrent") == 0
2784                    || strcmp (tp->string, "endpwent") == 0
2785                    || strcmp (tp->string, "endgrent") == 0
2786                    || strcmp (tp->string, "endhostent") == 0
2787                    || strcmp (tp->string, "endnetent") == 0
2788                    || strcmp (tp->string, "endprotoent") == 0
2789                    || strcmp (tp->string, "endservent") == 0
2790                    || strcmp (tp->string, "time") == 0
2791                    || strcmp (tp->string, "times") == 0
2792                    || strcmp (tp->string, "wait") == 0
2793                    || strcmp (tp->string, "wantarray") == 0)
2794             {
2795               /* A Perl built-in function that does not accept arguments.  */
2796               last_token_type = token_type_variable;
2797             }
2798         }
2799     }
2800 #if DEBUG_PERL
2801   else
2802     {
2803       fprintf (stderr, "%s:%d: %s recycled from stack\n",
2804                real_file_name, line_number, token2string (tp));
2805     }
2806 #endif
2807 
2808   /* A symbol followed by a fat comma is really a single-quoted string.
2809      Function definitions or forward declarations also need a special
2810      handling because the dollars and at signs inside the parentheses
2811      must not be interpreted as the beginning of a variable ')'.  */
2812   if (tp->type == token_type_symbol || tp->type == token_type_named_op)
2813     {
2814       token_ty *next = token_stack_peek (&token_stack);
2815 
2816       if (!next)
2817         {
2818 #if DEBUG_PERL
2819           fprintf (stderr, "%s:%d: pre-fetching next token\n",
2820                    real_file_name, line_number);
2821 #endif
2822           next = x_perl_lex (mlp);
2823           x_perl_unlex (next);
2824 #if DEBUG_PERL
2825           fprintf (stderr, "%s:%d: unshifted next token\n",
2826                    real_file_name, line_number);
2827 #endif
2828         }
2829 
2830 #if DEBUG_PERL
2831       fprintf (stderr, "%s:%d: next token is %s\n",
2832                real_file_name, line_number, token2string (next));
2833 #endif
2834 
2835       if (next->type == token_type_fat_comma)
2836         {
2837           tp->type = token_type_string;
2838           tp->sub_type = string_type_q;
2839           tp->comment = add_reference (savable_comment);
2840 #if DEBUG_PERL
2841           fprintf (stderr,
2842                    "%s:%d: token %s mutated to token_type_string\n",
2843                    real_file_name, line_number, token2string (tp));
2844 #endif
2845         }
2846       else if (tp->type == token_type_symbol && tp->sub_type == symbol_type_sub
2847                && next->type == token_type_symbol)
2848         {
2849           /* Start of a function declaration or definition.  Mark this
2850              symbol as a function name, so that we can later eat up
2851              possible prototype information.  */
2852 #if DEBUG_PERL
2853           fprintf (stderr, "%s:%d: subroutine declaration/definition '%s'\n",
2854                    real_file_name, line_number, next->string);
2855 #endif
2856           next->sub_type = symbol_type_function;
2857         }
2858       else if (tp->type == token_type_symbol
2859                && (tp->sub_type == symbol_type_sub
2860                    || tp->sub_type == symbol_type_function)
2861                && next->type == token_type_lparen)
2862         {
2863           /* For simplicity we simply consume everything up to the
2864              closing parenthesis.  Actually only a limited set of
2865              characters is allowed inside parentheses but we leave
2866              complaints to the interpreter and are prepared for
2867              future extensions to the Perl syntax.  */
2868           int c;
2869 
2870 #if DEBUG_PERL
2871           fprintf (stderr, "%s:%d: consuming prototype information\n",
2872                    real_file_name, line_number);
2873 #endif
2874 
2875           do
2876             {
2877               c = phase1_getc ();
2878 #if DEBUG_PERL
2879               fprintf (stderr, "  consuming character '%c'\n", c);
2880 #endif
2881             }
2882           while (c != EOF && c != ')');
2883           phase1_ungetc (c);
2884         }
2885     }
2886 
2887   return tp;
2888 }
2889 
2890 static void
x_perl_unlex(token_ty * tp)2891 x_perl_unlex (token_ty *tp)
2892 {
2893   token_stack_push (&token_stack, tp);
2894 }
2895 
2896 
2897 /* ========================= Extracting strings.  ========================== */
2898 
2899 /* Assuming TP is a string token, this function accumulates all subsequent
2900    . string2 . string3 ... to the string.  (String concatenation.)  */
2901 
2902 static char *
collect_message(message_list_ty * mlp,token_ty * tp,int error_level)2903 collect_message (message_list_ty *mlp, token_ty *tp, int error_level)
2904 {
2905   char *string;
2906   size_t len;
2907 
2908   extract_quotelike_pass3 (tp, error_level);
2909   string = xstrdup (tp->string);
2910   len = strlen (tp->string) + 1;
2911 
2912   for (;;)
2913     {
2914       int c;
2915 
2916       do
2917         c = phase2_getc ();
2918       while (is_whitespace (c));
2919 
2920       if (c != '.')
2921         {
2922           phase2_ungetc (c);
2923           return string;
2924         }
2925 
2926       do
2927         c = phase2_getc ();
2928       while (is_whitespace (c));
2929 
2930       phase2_ungetc (c);
2931 
2932       if (c == '"' || c == '\'' || c == '`'
2933           || ((c == '/' || c == '?')
2934               && prefer_regexp_over_division (tp->last_type))
2935           || c == 'q')
2936         {
2937           token_ty *qstring = x_perl_lex (mlp);
2938           if (qstring->type != token_type_string)
2939             {
2940               /* assert (qstring->type == token_type_symbol) */
2941               x_perl_unlex (qstring);
2942               return string;
2943             }
2944 
2945           extract_quotelike_pass3 (qstring, error_level);
2946           len += strlen (qstring->string);
2947           string = xrealloc (string, len);
2948           strcat (string, qstring->string);
2949           free_token (qstring);
2950         }
2951     }
2952 }
2953 
2954 /* The file is broken into tokens.  Scan the token stream, looking for
2955    a keyword, followed by a left paren, followed by a string.  When we
2956    see this sequence, we have something to remember.  We assume we are
2957    looking at a valid Perl program, and leave the complaints about
2958    the grammar to the compiler.
2959 
2960      Normal handling: Look for
2961        keyword ( ... msgid ... )
2962      Plural handling: Look for
2963        keyword ( ... msgid ... msgid_plural ... )
2964 
2965    We use recursion because the arguments before msgid or between msgid
2966    and msgid_plural can contain subexpressions of the same form.
2967 
2968    In Perl, parentheses around function arguments can be omitted.
2969 
2970    The general rules are:
2971      1) Functions declared with a prototype take exactly the specified number
2972         of arguments.
2973           sub one_arg ($) { ... }
2974           sub two_args ($$) { ... }
2975      2) When a function name is immediately followed by an opening parenthesis,
2976         the argument list ends at the corresponding closing parenthesis.
2977 
2978    If rule 1 and rule 2 are contradictory, i.e. when the program calls a
2979    function with an explicit argument list and the wrong number of arguments,
2980    the program is invalid:
2981      sub two_args ($$) { ... }
2982      foo two_args (x), y             - invalid due to rules 1 and 2
2983 
2984    Ambiguities are resolved as follows:
2985      3) Some built-ins, such as 'abs', 'sqrt', 'sin', 'cos', ..., and functions
2986         declared with a prototype of exactly one argument take exactly one
2987         argument:
2988           foo sin x, y  ==>  foo (sin (x), y)
2989           sub one_arg ($) { ... }
2990           foo one_arg x, y, z  ==>  foo (one_arg (x), y, z)
2991      4) Other identifiers, if not immediately followed by an opening
2992         parenthesis, consume the entire remaining argument list:
2993           foo bar x, y  ==>  foo (bar (x, y))
2994           sub two_args ($$) { ... }
2995           foo two_args x, y  ==>  foo (two_args (x, y))
2996 
2997    Other series of comma separated expressions without a function name at
2998    the beginning are comma expressions:
2999           sub two_args ($$) { ... }
3000           foo two_args x, (y, z)  ==>  foo (two_args (x, (y, z)))
3001    Note that the evaluation of comma expressions returns a list of values
3002    when in list context (e.g. inside the argument list of a function without
3003    prototype) but only one value when inside the argument list of a function
3004    with a prototype:
3005           sub print3 ($$$) { print @_ }
3006           print3 5, (6, 7), 8  ==>  578
3007           print 5, (6, 7), 8  ==>  5678
3008 
3009    Where rule 3 or 4 contradict rule 1 or 2, the program is invalid:
3010      sin (x, y)                      - invalid due to rules 2 and 3
3011      sub one_arg ($) { ... }
3012      one_arg (x, y)                  - invalid due to rules 2 and 3
3013      sub two_args ($$) { ... }
3014      foo two_args x, y, z            - invalid due to rules 1 and 4
3015  */
3016 
3017 /* Extract messages until the next balanced closing parenthesis.
3018    Extracted messages are added to MLP.
3019 
3020    DELIM can be either token_type_rbrace, token_type_rbracket,
3021    token_type_rparen.  Additionally, if COMMA_DELIM is true, parsing
3022    stops at the next comma outside parentheses.
3023 
3024    ARG is the current argument list position, starts with 1.
3025    ARGPARSER is the corresponding argument list parser.
3026 
3027    Returns true for EOF, false otherwise.  */
3028 
3029 static bool
extract_balanced(message_list_ty * mlp,token_type_ty delim,bool eat_delim,bool comma_delim,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,int arg,struct arglist_parser * argparser)3030 extract_balanced (message_list_ty *mlp,
3031                   token_type_ty delim, bool eat_delim, bool comma_delim,
3032                   flag_context_ty outer_context,
3033                   flag_context_list_iterator_ty context_iter,
3034                   int arg, struct arglist_parser *argparser)
3035 {
3036   /* Whether to implicitly assume the next tokens are arguments even without
3037      a '('.  */
3038   bool next_is_argument = false;
3039   /* Parameters of the keyword just seen.  Defined only when next_is_argument
3040      is true.  */
3041   const struct callshapes *next_shapes = NULL;
3042   struct arglist_parser *next_argparser = NULL;
3043 
3044   /* Whether to not consider strings until the next comma.  */
3045   bool skip_until_comma = false;
3046 
3047   /* Context iterator that will be used if the next token is a '('.  */
3048   flag_context_list_iterator_ty next_context_iter =
3049     passthrough_context_list_iterator;
3050   /* Current context.  */
3051   flag_context_ty inner_context =
3052     inherited_context (outer_context,
3053                        flag_context_list_iterator_advance (&context_iter));
3054 
3055 #if DEBUG_PERL
3056   static int nesting_level = 0;
3057 
3058   ++nesting_level;
3059 #endif
3060 
3061   for (;;)
3062     {
3063       /* The current token.  */
3064       token_ty *tp;
3065 
3066       tp = x_perl_lex (mlp);
3067 
3068       if (delim == tp->type)
3069         {
3070           arglist_parser_done (argparser, arg);
3071           if (next_argparser != NULL)
3072             free (next_argparser);
3073 #if DEBUG_PERL
3074           fprintf (stderr, "%s:%d: extract_balanced finished (%d)\n",
3075                    logical_file_name, tp->line_number, --nesting_level);
3076 #endif
3077           if (eat_delim)
3078             free_token (tp);
3079           else
3080             /* Preserve the delimiter for the caller.  */
3081             x_perl_unlex (tp);
3082           return false;
3083         }
3084 
3085       if (comma_delim && tp->type == token_type_comma)
3086         {
3087           arglist_parser_done (argparser, arg);
3088           if (next_argparser != NULL)
3089             free (next_argparser);
3090 #if DEBUG_PERL
3091           fprintf (stderr, "%s:%d: extract_balanced finished at comma (%d)\n",
3092                    logical_file_name, tp->line_number, --nesting_level);
3093 #endif
3094           x_perl_unlex (tp);
3095           return false;
3096         }
3097 
3098       if (next_is_argument && tp->type != token_type_lparen)
3099         {
3100           /* An argument list starts, even though there is no '('.  */
3101           bool next_comma_delim;
3102 
3103           x_perl_unlex (tp);
3104 
3105           if (next_shapes != NULL)
3106             /* We know something about the function being called.  Assume
3107                that it consumes only one argument if no argument number or
3108                total > 1 is specified.  */
3109             {
3110               size_t i;
3111 
3112               next_comma_delim = true;
3113               for (i = 0; i < next_shapes->nshapes; i++)
3114                 {
3115                   const struct callshape *shape = &next_shapes->shapes[i];
3116 
3117                   if (shape->argnum1 > 1
3118                       || shape->argnum2 > 1
3119                       || shape->argnumc > 1
3120                       || shape->argtotal > 1)
3121                     next_comma_delim = false;
3122                 }
3123             }
3124           else
3125             /* We know nothing about the function being called.  It could be
3126                a function prototyped to take only one argument, or on the other
3127                hand it could be prototyped to take more than one argument or an
3128                arbitrary argument list or it could be unprototyped.  Due to
3129                the way the parser works, assuming the first case gives the
3130                best results.  */
3131             next_comma_delim = true;
3132 
3133           if (extract_balanced (mlp, delim, false, next_comma_delim,
3134                                 inner_context, next_context_iter,
3135                                 1, next_argparser))
3136             {
3137               arglist_parser_done (argparser, arg);
3138               return true;
3139             }
3140 
3141           next_is_argument = false;
3142           next_argparser = NULL;
3143           next_context_iter = null_context_list_iterator;
3144           continue;
3145         }
3146 
3147       switch (tp->type)
3148         {
3149         case token_type_symbol:
3150         case token_type_keyword_symbol:
3151 #if DEBUG_PERL
3152           fprintf (stderr, "%s:%d: type symbol (%d) \"%s\"\n",
3153                    logical_file_name, tp->line_number, nesting_level,
3154                    tp->string);
3155 #endif
3156 
3157           {
3158             void *keyword_value;
3159 
3160             if (hash_find_entry (&keywords, tp->string, strlen (tp->string),
3161                                  &keyword_value) == 0)
3162               {
3163                 const struct callshapes *shapes =
3164                   (const struct callshapes *) keyword_value;
3165 
3166                 next_shapes = shapes;
3167                 next_argparser = arglist_parser_alloc (mlp, shapes);
3168               }
3169             else
3170               {
3171                 next_shapes = NULL;
3172                 next_argparser = arglist_parser_alloc (mlp, NULL);
3173               }
3174           }
3175           next_is_argument = true;
3176           next_context_iter =
3177             flag_context_list_iterator (
3178               flag_context_list_table_lookup (
3179                 flag_context_list_table,
3180                 tp->string, strlen (tp->string)));
3181           break;
3182 
3183         case token_type_variable:
3184 #if DEBUG_PERL
3185           fprintf (stderr, "%s:%d: type variable (%d) \"%s\"\n",
3186                    logical_file_name, tp->line_number, nesting_level,
3187                    tp->string);
3188 #endif
3189           next_is_argument = false;
3190           if (next_argparser != NULL)
3191             free (next_argparser);
3192           next_argparser = NULL;
3193           next_context_iter = null_context_list_iterator;
3194           break;
3195 
3196         case token_type_object:
3197 #if DEBUG_PERL
3198           fprintf (stderr, "%s:%d: type object (%d) \"%s->\"\n",
3199                    logical_file_name, tp->line_number, nesting_level,
3200                    tp->string);
3201 #endif
3202           next_is_argument = false;
3203           if (next_argparser != NULL)
3204             free (next_argparser);
3205           next_argparser = NULL;
3206           next_context_iter = null_context_list_iterator;
3207           break;
3208 
3209         case token_type_lparen:
3210 #if DEBUG_PERL
3211           fprintf (stderr, "%s:%d: type left parenthesis (%d)\n",
3212                    logical_file_name, tp->line_number, nesting_level);
3213 #endif
3214           if (next_is_argument)
3215             {
3216               /* Parse the argument list of a function call.  */
3217               if (extract_balanced (mlp, token_type_rparen, true, false,
3218                                     inner_context, next_context_iter,
3219                                     1, next_argparser))
3220                 {
3221                   arglist_parser_done (argparser, arg);
3222                   return true;
3223                 }
3224               next_is_argument = false;
3225               next_argparser = NULL;
3226             }
3227           else
3228             {
3229               /* Parse a parenthesized expression or comma expression.  */
3230               if (extract_balanced (mlp, token_type_rparen, true, false,
3231                                     inner_context, next_context_iter,
3232                                     arg, arglist_parser_clone (argparser)))
3233                 {
3234                   arglist_parser_done (argparser, arg);
3235                   if (next_argparser != NULL)
3236                     free (next_argparser);
3237                   free_token (tp);
3238                   return true;
3239                 }
3240               next_is_argument = false;
3241               if (next_argparser != NULL)
3242                 free (next_argparser);
3243               next_argparser = NULL;
3244             }
3245           skip_until_comma = true;
3246           next_context_iter = null_context_list_iterator;
3247           break;
3248 
3249         case token_type_rparen:
3250 #if DEBUG_PERL
3251           fprintf (stderr, "%s:%d: type right parenthesis (%d)\n",
3252                    logical_file_name, tp->line_number, nesting_level);
3253 #endif
3254           next_is_argument = false;
3255           if (next_argparser != NULL)
3256             free (next_argparser);
3257           next_argparser = NULL;
3258           skip_until_comma = true;
3259           next_context_iter = null_context_list_iterator;
3260           break;
3261 
3262         case token_type_comma:
3263         case token_type_fat_comma:
3264 #if DEBUG_PERL
3265           fprintf (stderr, "%s:%d: type comma (%d)\n",
3266                    logical_file_name, tp->line_number, nesting_level);
3267 #endif
3268           if (arglist_parser_decidedp (argparser, arg))
3269             {
3270               /* We have missed the argument.  */
3271               arglist_parser_done (argparser, arg);
3272               argparser = arglist_parser_alloc (mlp, NULL);
3273               arg = 0;
3274             }
3275           arg++;
3276 #if DEBUG_PERL
3277           fprintf (stderr, "%s:%d: arg: %d\n",
3278                    real_file_name, tp->line_number, arg);
3279 #endif
3280           inner_context =
3281             inherited_context (outer_context,
3282                                flag_context_list_iterator_advance (
3283                                  &context_iter));
3284           next_is_argument = false;
3285           if (next_argparser != NULL)
3286             free (next_argparser);
3287           next_argparser = NULL;
3288           skip_until_comma = false;
3289           next_context_iter = passthrough_context_list_iterator;
3290           break;
3291 
3292         case token_type_string:
3293 #if DEBUG_PERL
3294           fprintf (stderr, "%s:%d: type string (%d): \"%s\"\n",
3295                    logical_file_name, tp->line_number, nesting_level,
3296                    tp->string);
3297 #endif
3298 
3299           if (extract_all)
3300             {
3301               char *string = collect_message (mlp, tp, EXIT_SUCCESS);
3302               lex_pos_ty pos;
3303 
3304               pos.file_name = logical_file_name;
3305               pos.line_number = tp->line_number;
3306               remember_a_message (mlp, NULL, string, true, false, inner_context,
3307                                   &pos, NULL, tp->comment, true);
3308             }
3309           else if (!skip_until_comma)
3310             {
3311               /* Need to collect the complete string, with error checking,
3312                  only if the argument ARG is used in ARGPARSER.  */
3313               bool must_collect = false;
3314               {
3315                 size_t nalternatives = argparser->nalternatives;
3316                 size_t i;
3317 
3318                 for (i = 0; i < nalternatives; i++)
3319                   {
3320                     struct partial_call *cp = &argparser->alternative[i];
3321 
3322                     if (arg == cp->argnumc
3323                         || arg == cp->argnum1 || arg == cp->argnum2)
3324                       must_collect = true;
3325                   }
3326               }
3327 
3328               if (must_collect)
3329                 {
3330                   char *string = collect_message (mlp, tp, EXIT_FAILURE);
3331                   mixed_string_ty *ms =
3332                     mixed_string_alloc_utf8 (string, lc_string,
3333                                              logical_file_name, tp->line_number);
3334                   free (string);
3335                   arglist_parser_remember (argparser, arg, ms, inner_context,
3336                                            logical_file_name, tp->line_number,
3337                                            tp->comment, true);
3338                 }
3339             }
3340 
3341           if (arglist_parser_decidedp (argparser, arg))
3342             {
3343               arglist_parser_done (argparser, arg);
3344               argparser = arglist_parser_alloc (mlp, NULL);
3345             }
3346 
3347           next_is_argument = false;
3348           if (next_argparser != NULL)
3349             free (next_argparser);
3350           next_argparser = NULL;
3351           next_context_iter = null_context_list_iterator;
3352           break;
3353 
3354         case token_type_number:
3355 #if DEBUG_PERL
3356           fprintf (stderr, "%s:%d: type number (%d)\n",
3357                    logical_file_name, tp->line_number, nesting_level);
3358 #endif
3359           next_is_argument = false;
3360           if (next_argparser != NULL)
3361             free (next_argparser);
3362           next_argparser = NULL;
3363           next_context_iter = null_context_list_iterator;
3364           break;
3365 
3366         case token_type_eof:
3367 #if DEBUG_PERL
3368           fprintf (stderr, "%s:%d: type EOF (%d)\n",
3369                    logical_file_name, tp->line_number, nesting_level);
3370 #endif
3371           arglist_parser_done (argparser, arg);
3372           if (next_argparser != NULL)
3373             free (next_argparser);
3374           next_argparser = NULL;
3375           free_token (tp);
3376           return true;
3377 
3378         case token_type_lbrace:
3379 #if DEBUG_PERL
3380           fprintf (stderr, "%s:%d: type lbrace (%d)\n",
3381                    logical_file_name, tp->line_number, nesting_level);
3382 #endif
3383           if (extract_balanced (mlp, token_type_rbrace, true, false,
3384                                 null_context, null_context_list_iterator,
3385                                 1, arglist_parser_alloc (mlp, NULL)))
3386             {
3387               arglist_parser_done (argparser, arg);
3388               if (next_argparser != NULL)
3389                 free (next_argparser);
3390               free_token (tp);
3391               return true;
3392             }
3393           next_is_argument = false;
3394           if (next_argparser != NULL)
3395             free (next_argparser);
3396           next_argparser = NULL;
3397           next_context_iter = null_context_list_iterator;
3398           break;
3399 
3400         case token_type_rbrace:
3401 #if DEBUG_PERL
3402           fprintf (stderr, "%s:%d: type rbrace (%d)\n",
3403                    logical_file_name, tp->line_number, nesting_level);
3404 #endif
3405           next_is_argument = false;
3406           if (next_argparser != NULL)
3407             free (next_argparser);
3408           next_argparser = NULL;
3409           next_context_iter = null_context_list_iterator;
3410           break;
3411 
3412         case token_type_lbracket:
3413 #if DEBUG_PERL
3414           fprintf (stderr, "%s:%d: type lbracket (%d)\n",
3415                    logical_file_name, tp->line_number, nesting_level);
3416 #endif
3417           if (extract_balanced (mlp, token_type_rbracket, true, false,
3418                                 null_context, null_context_list_iterator,
3419                                 1, arglist_parser_alloc (mlp, NULL)))
3420             {
3421               arglist_parser_done (argparser, arg);
3422               if (next_argparser != NULL)
3423                 free (next_argparser);
3424               free_token (tp);
3425               return true;
3426             }
3427           next_is_argument = false;
3428           if (next_argparser != NULL)
3429             free (next_argparser);
3430           next_argparser = NULL;
3431           next_context_iter = null_context_list_iterator;
3432           break;
3433 
3434         case token_type_rbracket:
3435 #if DEBUG_PERL
3436           fprintf (stderr, "%s:%d: type rbracket (%d)\n",
3437                    logical_file_name, tp->line_number, nesting_level);
3438 #endif
3439           next_is_argument = false;
3440           if (next_argparser != NULL)
3441             free (next_argparser);
3442           next_argparser = NULL;
3443           next_context_iter = null_context_list_iterator;
3444           break;
3445 
3446         case token_type_semicolon:
3447 #if DEBUG_PERL
3448           fprintf (stderr, "%s:%d: type semicolon (%d)\n",
3449                    logical_file_name, tp->line_number, nesting_level);
3450 #endif
3451 
3452           /* The ultimate sign.  */
3453           arglist_parser_done (argparser, arg);
3454           argparser = arglist_parser_alloc (mlp, NULL);
3455 
3456           /* FIXME: Instead of resetting outer_context here, it may be better
3457              to recurse in the next_is_argument handling above, waiting for
3458              the next semicolon or other statement terminator.  */
3459           outer_context = null_context;
3460           context_iter = null_context_list_iterator;
3461           next_is_argument = false;
3462           if (next_argparser != NULL)
3463             free (next_argparser);
3464           next_argparser = NULL;
3465           next_context_iter = passthrough_context_list_iterator;
3466           inner_context =
3467             inherited_context (outer_context,
3468                                flag_context_list_iterator_advance (
3469                                  &context_iter));
3470           break;
3471 
3472         case token_type_dereference:
3473 #if DEBUG_PERL
3474           fprintf (stderr, "%s:%d: type dereference (%d)\n",
3475                    logical_file_name, tp->line_number, nesting_level);
3476 #endif
3477           next_is_argument = false;
3478           if (next_argparser != NULL)
3479             free (next_argparser);
3480           next_argparser = NULL;
3481           next_context_iter = null_context_list_iterator;
3482           break;
3483 
3484         case token_type_dot:
3485 #if DEBUG_PERL
3486           fprintf (stderr, "%s:%d: type dot (%d)\n",
3487                    logical_file_name, tp->line_number, nesting_level);
3488 #endif
3489           next_is_argument = false;
3490           if (next_argparser != NULL)
3491             free (next_argparser);
3492           next_argparser = NULL;
3493           next_context_iter = null_context_list_iterator;
3494           break;
3495 
3496         case token_type_named_op:
3497 #if DEBUG_PERL
3498           fprintf (stderr, "%s:%d: type named operator (%d): %s\n",
3499                    logical_file_name, tp->line_number, nesting_level,
3500                    tp->string);
3501 #endif
3502           next_is_argument = false;
3503           if (next_argparser != NULL)
3504             free (next_argparser);
3505           next_argparser = NULL;
3506           next_context_iter = null_context_list_iterator;
3507           break;
3508 
3509         case token_type_regex_op:
3510 #if DEBUG_PERL
3511           fprintf (stderr, "%s:%d: type regex operator (%d)\n",
3512                    logical_file_name, tp->line_number, nesting_level);
3513 #endif
3514           next_is_argument = false;
3515           if (next_argparser != NULL)
3516             free (next_argparser);
3517           next_argparser = NULL;
3518           next_context_iter = null_context_list_iterator;
3519           break;
3520 
3521         case token_type_other:
3522 #if DEBUG_PERL
3523           fprintf (stderr, "%s:%d: type other (%d)\n",
3524                    logical_file_name, tp->line_number, nesting_level);
3525 #endif
3526           next_is_argument = false;
3527           if (next_argparser != NULL)
3528             free (next_argparser);
3529           next_argparser = NULL;
3530           next_context_iter = null_context_list_iterator;
3531           break;
3532 
3533         default:
3534           fprintf (stderr, "%s:%d: unknown token type %d\n",
3535                    real_file_name, tp->line_number, tp->type);
3536           abort ();
3537         }
3538 
3539       free_token (tp);
3540     }
3541 }
3542 
3543 void
extract_perl(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)3544 extract_perl (FILE *f, const char *real_filename, const char *logical_filename,
3545               flag_context_list_table_ty *flag_table,
3546               msgdomain_list_ty *mdlp)
3547 {
3548   message_list_ty *mlp = mdlp->item[0]->messages;
3549 
3550   fp = f;
3551   real_file_name = real_filename;
3552   logical_file_name = xstrdup (logical_filename);
3553   line_number = 0;
3554 
3555   linesize = 0;
3556   linepos = 0;
3557   eaten_here = 0;
3558   end_of_file = false;
3559 
3560   last_comment_line = -1;
3561   last_non_comment_line = -1;
3562 
3563   flag_context_list_table = flag_table;
3564 
3565   /* Safe assumption.  */
3566   last_token_type = token_type_semicolon;
3567 
3568   token_stack.items = NULL;
3569   token_stack.nitems = 0;
3570   token_stack.nitems_max = 0;
3571 
3572   init_keywords ();
3573 
3574   /* Eat tokens until eof is seen.  When extract_balanced returns
3575      due to an unbalanced closing brace, just restart it.  */
3576   while (!extract_balanced (mlp, token_type_rbrace, true, false,
3577                             null_context, null_context_list_iterator,
3578                             1, arglist_parser_alloc (mlp, NULL)))
3579     ;
3580 
3581   fp = NULL;
3582   real_file_name = NULL;
3583   free (logical_file_name);
3584   logical_file_name = NULL;
3585   line_number = 0;
3586   last_token_type = token_type_semicolon;
3587   token_stack_free (&token_stack);
3588   eaten_here = 0;
3589   end_of_file = true;
3590 }
3591