1 /* xgettext Perl backend.
2 Copyright (C) 2002-2010, 2013, 2016, 2018-2020 Free Software Foundation, Inc.
3
4 This file was written by Guido Flohr <guido@imperia.net>, 2002-2010.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22
23 /* Specification. */
24 #include "x-perl.h"
25
26 #include <errno.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31
32 #include "message.h"
33 #include "rc-str-list.h"
34 #include "xgettext.h"
35 #include "xg-pos.h"
36 #include "xg-encoding.h"
37 #include "xg-mixed-string.h"
38 #include "xg-arglist-context.h"
39 #include "xg-arglist-callshape.h"
40 #include "xg-arglist-parser.h"
41 #include "xg-message.h"
42 #include "error.h"
43 #include "error-progname.h"
44 #include "xalloc.h"
45 #include "po-charset.h"
46 #include "unistr.h"
47 #include "uniname.h"
48 #include "gettext.h"
49
50 #define _(s) gettext(s)
51
52 /* The Perl syntax is defined in perlsyn.pod. Try the command
53 "man perlsyn" or "perldoc perlsyn".
54 Also, the syntax after the 'sub' keyword is specified in perlsub.pod.
55 Try the command "man perlsub" or "perldoc perlsub".
56 Perl 5.10 has new operators '//' and '//=', see
57 <https://perldoc.perl.org/perldelta.html#Defined-or-operator>. */
58
59 #define DEBUG_PERL 0
60
61
62 /* ====================== Keyword set customization. ====================== */
63
64 /* If true extract all strings. */
65 static bool extract_all = false;
66
67 static hash_table keywords;
68 static bool default_keywords = true;
69
70
71 void
x_perl_extract_all()72 x_perl_extract_all ()
73 {
74 extract_all = true;
75 }
76
77
78 void
x_perl_keyword(const char * name)79 x_perl_keyword (const char *name)
80 {
81 if (name == NULL)
82 default_keywords = false;
83 else
84 {
85 const char *end;
86 struct callshape shape;
87 const char *colon;
88
89 if (keywords.table == NULL)
90 hash_init (&keywords, 100);
91
92 split_keywordspec (name, &end, &shape);
93
94 /* The characters between name and end should form a valid C identifier.
95 A colon means an invalid parse in split_keywordspec(). */
96 colon = strchr (name, ':');
97 if (colon == NULL || colon >= end)
98 insert_keyword_callshape (&keywords, name, end - name, &shape);
99 }
100 }
101
102 /* Finish initializing the keywords hash table.
103 Called after argument processing, before each file is processed. */
104 static void
init_keywords()105 init_keywords ()
106 {
107 if (default_keywords)
108 {
109 /* When adding new keywords here, also update the documentation in
110 xgettext.texi! */
111 x_perl_keyword ("gettext");
112 x_perl_keyword ("%gettext");
113 x_perl_keyword ("$gettext");
114 x_perl_keyword ("dgettext:2");
115 x_perl_keyword ("dcgettext:2");
116 x_perl_keyword ("ngettext:1,2");
117 x_perl_keyword ("dngettext:2,3");
118 x_perl_keyword ("dcngettext:2,3");
119 x_perl_keyword ("gettext_noop");
120 x_perl_keyword ("pgettext:1c,2");
121 x_perl_keyword ("dpgettext:2c,3");
122 x_perl_keyword ("dcpgettext:2c,3");
123 x_perl_keyword ("npgettext:1c,2,3");
124 x_perl_keyword ("dnpgettext:2c,3,4");
125 x_perl_keyword ("dcnpgettext:2c,3,4");
126
127 #if 0
128 x_perl_keyword ("__");
129 x_perl_keyword ("$__");
130 x_perl_keyword ("%__");
131 x_perl_keyword ("__x");
132 x_perl_keyword ("__n:1,2");
133 x_perl_keyword ("__nx:1,2");
134 x_perl_keyword ("__xn:1,2");
135 x_perl_keyword ("N__");
136 #endif
137 default_keywords = false;
138 }
139 }
140
141 void
init_flag_table_perl()142 init_flag_table_perl ()
143 {
144 /* Gettext binding for Perl. */
145 xgettext_record_flag ("gettext:1:pass-perl-format");
146 xgettext_record_flag ("gettext:1:pass-perl-brace-format");
147 xgettext_record_flag ("%gettext:1:pass-perl-format");
148 xgettext_record_flag ("%gettext:1:pass-perl-brace-format");
149 xgettext_record_flag ("$gettext:1:pass-perl-format");
150 xgettext_record_flag ("$gettext:1:pass-perl-brace-format");
151 xgettext_record_flag ("dgettext:2:pass-perl-format");
152 xgettext_record_flag ("dgettext:2:pass-perl-brace-format");
153 xgettext_record_flag ("dcgettext:2:pass-perl-format");
154 xgettext_record_flag ("dcgettext:2:pass-perl-brace-format");
155 xgettext_record_flag ("ngettext:1:pass-perl-format");
156 xgettext_record_flag ("ngettext:2:pass-perl-format");
157 xgettext_record_flag ("ngettext:1:pass-perl-brace-format");
158 xgettext_record_flag ("ngettext:2:pass-perl-brace-format");
159 xgettext_record_flag ("dngettext:2:pass-perl-format");
160 xgettext_record_flag ("dngettext:3:pass-perl-format");
161 xgettext_record_flag ("dngettext:2:pass-perl-brace-format");
162 xgettext_record_flag ("dngettext:3:pass-perl-brace-format");
163 xgettext_record_flag ("dcngettext:2:pass-perl-format");
164 xgettext_record_flag ("dcngettext:3:pass-perl-format");
165 xgettext_record_flag ("dcngettext:2:pass-perl-brace-format");
166 xgettext_record_flag ("dcngettext:3:pass-perl-brace-format");
167 xgettext_record_flag ("gettext_noop:1:pass-perl-format");
168 xgettext_record_flag ("gettext_noop:1:pass-perl-brace-format");
169 xgettext_record_flag ("pgettext:2:pass-perl-format");
170 xgettext_record_flag ("pgettext:2:pass-perl-brace-format");
171 xgettext_record_flag ("dpgettext:3:pass-perl-format");
172 xgettext_record_flag ("dpgettext:3:pass-perl-brace-format");
173 xgettext_record_flag ("dcpgettext:3:pass-perl-format");
174 xgettext_record_flag ("dcpgettext:3:pass-perl-brace-format");
175 xgettext_record_flag ("npgettext:2:pass-perl-format");
176 xgettext_record_flag ("npgettext:3:pass-perl-format");
177 xgettext_record_flag ("npgettext:2:pass-perl-brace-format");
178 xgettext_record_flag ("npgettext:3:pass-perl-brace-format");
179 xgettext_record_flag ("dnpgettext:3:pass-perl-format");
180 xgettext_record_flag ("dnpgettext:4:pass-perl-format");
181 xgettext_record_flag ("dnpgettext:3:pass-perl-brace-format");
182 xgettext_record_flag ("dnpgettext:4:pass-perl-brace-format");
183 xgettext_record_flag ("dcnpgettext:3:pass-perl-format");
184 xgettext_record_flag ("dcnpgettext:4:pass-perl-format");
185 xgettext_record_flag ("dcnpgettext:3:pass-perl-brace-format");
186 xgettext_record_flag ("dcnpgettext:4:pass-perl-brace-format");
187
188 /* Perl builtins. */
189 xgettext_record_flag ("printf:1:perl-format"); /* argument 1 or 2 ?? */
190 xgettext_record_flag ("sprintf:1:perl-format");
191 #if 0
192 /* Shortcuts from libintl-perl. */
193 xgettext_record_flag ("__:1:pass-perl-format");
194 xgettext_record_flag ("__:1:pass-perl-brace-format");
195 xgettext_record_flag ("%__:1:pass-perl-format");
196 xgettext_record_flag ("%__:1:pass-perl-brace-format");
197 xgettext_record_flag ("$__:1:pass-perl-format");
198 xgettext_record_flag ("$__:1:pass-perl-brace-format");
199 xgettext_record_flag ("__x:1:perl-brace-format");
200 xgettext_record_flag ("__n:1:pass-perl-format");
201 xgettext_record_flag ("__n:2:pass-perl-format");
202 xgettext_record_flag ("__n:1:pass-perl-brace-format");
203 xgettext_record_flag ("__n:2:pass-perl-brace-format");
204 xgettext_record_flag ("__nx:1:perl-brace-format");
205 xgettext_record_flag ("__nx:2:perl-brace-format");
206 xgettext_record_flag ("__xn:1:perl-brace-format");
207 xgettext_record_flag ("__xn:2:perl-brace-format");
208 xgettext_record_flag ("N__:1:pass-perl-format");
209 xgettext_record_flag ("N__:1:pass-perl-brace-format");
210 #endif
211 }
212
213
214 /* ======================== Reading of characters. ======================== */
215
216 /* The input file stream. */
217 static FILE *fp;
218
219 /* The current line buffer. */
220 static char *linebuf;
221 /* The size of the input buffer. */
222 static size_t linebuf_size;
223
224 /* The size of the current line. */
225 static int linesize;
226
227 /* The position in the current line. */
228 static int linepos;
229
230 /* Number of lines eaten for here documents. */
231 static int eaten_here;
232
233 /* Paranoia: EOF marker for __END__ or __DATA__. */
234 static bool end_of_file;
235
236
237 /* 1. line_number handling. */
238
239 /* Returns the next character from the input stream or EOF. */
240 static int
phase1_getc()241 phase1_getc ()
242 {
243 line_number += eaten_here;
244 eaten_here = 0;
245
246 if (end_of_file)
247 return EOF;
248
249 if (linepos >= linesize)
250 {
251 linesize = getline (&linebuf, &linebuf_size, fp);
252
253 if (linesize < 0)
254 {
255 if (ferror (fp))
256 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
257 real_file_name);
258 end_of_file = true;
259 return EOF;
260 }
261
262 linepos = 0;
263 ++line_number;
264
265 /* Undosify. This is important for catching the end of <<EOF and
266 <<'EOF'. We could rely on stdio doing this for us but
267 it is not uncommon to to come across Perl scripts with CRLF
268 newline conventions on systems that do not follow this
269 convention. */
270 if (linesize >= 2 && linebuf[linesize - 1] == '\n'
271 && linebuf[linesize - 2] == '\r')
272 {
273 linebuf[linesize - 2] = '\n';
274 linebuf[linesize - 1] = '\0';
275 --linesize;
276 }
277 }
278
279 return linebuf[linepos++];
280 }
281
282 /* Supports only one pushback character. */
283 static void
phase1_ungetc(int c)284 phase1_ungetc (int c)
285 {
286 if (c != EOF)
287 {
288 if (linepos == 0)
289 /* Attempt to ungetc across line boundary. Shouldn't happen.
290 No two phase1_ungetc calls are permitted in a row. */
291 abort ();
292
293 --linepos;
294 }
295 }
296
297 /* Read a here document and return its contents.
298 The delimiter is an UTF-8 encoded string; the resulting string is UTF-8
299 encoded as well. */
300
301 static char *
get_here_document(const char * delimiter)302 get_here_document (const char *delimiter)
303 {
304 /* Accumulator for the entire here document, including a NUL byte
305 at the end. */
306 static char *buffer;
307 static size_t bufmax = 0;
308 size_t bufpos = 0;
309 /* Current line being appended. */
310 static char *my_linebuf = NULL;
311 static size_t my_linebuf_size = 0;
312
313 /* Allocate the initial buffer. Later on, bufmax > 0. */
314 if (bufmax == 0)
315 {
316 buffer = XNMALLOC (1, char);
317 buffer[0] = '\0';
318 bufmax = 1;
319 }
320
321 for (;;)
322 {
323 int read_bytes = getline (&my_linebuf, &my_linebuf_size, fp);
324 char *my_line_utf8;
325 bool chomp;
326
327 if (read_bytes < 0)
328 {
329 if (ferror (fp))
330 {
331 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
332 real_file_name);
333 }
334 else
335 {
336 error_with_progname = false;
337 error (EXIT_SUCCESS, 0,
338 _("%s:%d: can't find string terminator \"%s\" anywhere before EOF"),
339 real_file_name, line_number, delimiter);
340 error_with_progname = true;
341
342 break;
343 }
344 }
345
346 ++eaten_here;
347
348 /* Convert to UTF-8. */
349 my_line_utf8 =
350 from_current_source_encoding (my_linebuf, lc_string, logical_file_name,
351 line_number + eaten_here);
352 if (my_line_utf8 != my_linebuf)
353 {
354 if (strlen (my_line_utf8) >= my_linebuf_size)
355 {
356 my_linebuf_size = strlen (my_line_utf8) + 1;
357 my_linebuf = xrealloc (my_linebuf, my_linebuf_size);
358 }
359 strcpy (my_linebuf, my_line_utf8);
360 free (my_line_utf8);
361 }
362
363 /* Undosify. This is important for catching the end of <<EOF and
364 <<'EOF'. We could rely on stdio doing this for us but you
365 it is not uncommon to to come across Perl scripts with CRLF
366 newline conventions on systems that do not follow this
367 convention. */
368 if (read_bytes >= 2 && my_linebuf[read_bytes - 1] == '\n'
369 && my_linebuf[read_bytes - 2] == '\r')
370 {
371 my_linebuf[read_bytes - 2] = '\n';
372 my_linebuf[read_bytes - 1] = '\0';
373 --read_bytes;
374 }
375
376 /* Temporarily remove the trailing newline from my_linebuf. */
377 chomp = false;
378 if (read_bytes >= 1 && my_linebuf[read_bytes - 1] == '\n')
379 {
380 chomp = true;
381 my_linebuf[read_bytes - 1] = '\0';
382 }
383
384 /* See whether this line terminates the here document. */
385 if (strcmp (my_linebuf, delimiter) == 0)
386 break;
387
388 /* Add back the trailing newline to my_linebuf. */
389 if (chomp)
390 my_linebuf[read_bytes - 1] = '\n';
391
392 /* Ensure room for read_bytes + 1 bytes. */
393 if (bufpos + read_bytes >= bufmax)
394 {
395 do
396 bufmax = 2 * bufmax + 10;
397 while (bufpos + read_bytes >= bufmax);
398 buffer = xrealloc (buffer, bufmax);
399 }
400 /* Append this line to the accumulator. */
401 strcpy (buffer + bufpos, my_linebuf);
402 bufpos += read_bytes;
403 }
404
405 /* Done accumulating the here document. */
406 return xstrdup (buffer);
407 }
408
409 /* Skips pod sections. */
410 static void
skip_pod()411 skip_pod ()
412 {
413 line_number += eaten_here;
414 eaten_here = 0;
415 linepos = 0;
416
417 for (;;)
418 {
419 linesize = getline (&linebuf, &linebuf_size, fp);
420
421 if (linesize < 0)
422 {
423 if (ferror (fp))
424 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
425 real_file_name);
426 return;
427 }
428
429 ++line_number;
430
431 if (strncmp ("=cut", linebuf, 4) == 0)
432 {
433 /* Force reading of a new line on next call to phase1_getc(). */
434 linepos = linesize;
435 return;
436 }
437 }
438 }
439
440
441 /* These are for tracking whether comments count as immediately before
442 keyword. */
443 static int last_comment_line;
444 static int last_non_comment_line;
445
446
447 /* 2. Replace each comment that is not inside a string literal or regular
448 expression with a newline character. We need to remember the comment
449 for later, because it may be attached to a keyword string. */
450
451 static int
phase2_getc()452 phase2_getc ()
453 {
454 static char *buffer;
455 static size_t bufmax;
456 size_t buflen;
457 int lineno;
458 int c;
459 char *utf8_string;
460
461 c = phase1_getc ();
462 if (c == '#')
463 {
464 buflen = 0;
465 lineno = line_number;
466 /* Skip leading whitespace. */
467 for (;;)
468 {
469 c = phase1_getc ();
470 if (c == EOF)
471 break;
472 if (c != ' ' && c != '\t' && c != '\r' && c != '\f')
473 {
474 phase1_ungetc (c);
475 break;
476 }
477 }
478 /* Accumulate the comment. */
479 for (;;)
480 {
481 c = phase1_getc ();
482 if (c == '\n' || c == EOF)
483 break;
484 if (buflen >= bufmax)
485 {
486 bufmax = 2 * bufmax + 10;
487 buffer = xrealloc (buffer, bufmax);
488 }
489 buffer[buflen++] = c;
490 }
491 if (buflen >= bufmax)
492 {
493 bufmax = 2 * bufmax + 10;
494 buffer = xrealloc (buffer, bufmax);
495 }
496 buffer[buflen] = '\0';
497 /* Convert it to UTF-8. */
498 utf8_string =
499 from_current_source_encoding (buffer, lc_comment, logical_file_name,
500 lineno);
501 /* Save it until we encounter the corresponding string. */
502 savable_comment_add (utf8_string);
503 last_comment_line = lineno;
504 }
505 return c;
506 }
507
508 /* Supports only one pushback character. */
509 static void
phase2_ungetc(int c)510 phase2_ungetc (int c)
511 {
512 if (c != EOF)
513 phase1_ungetc (c);
514 }
515
516 /* Whitespace recognition. */
517
518 #define case_whitespace \
519 case ' ': case '\t': case '\r': case '\n': case '\f'
520
521 static inline bool
is_whitespace(int c)522 is_whitespace (int c)
523 {
524 return (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f');
525 }
526
527
528 /* ========================== Reading of tokens. ========================== */
529
530
531 enum token_type_ty
532 {
533 token_type_eof,
534 token_type_lparen, /* ( */
535 token_type_rparen, /* ) */
536 token_type_comma, /* , */
537 token_type_fat_comma, /* => */
538 token_type_dereference, /* -> */
539 token_type_semicolon, /* ; */
540 token_type_lbrace, /* { */
541 token_type_rbrace, /* } */
542 token_type_lbracket, /* [ */
543 token_type_rbracket, /* ] */
544 token_type_string, /* quote-like */
545 token_type_number, /* starting with a digit o dot */
546 token_type_named_op, /* if, unless, while, ... */
547 token_type_variable, /* $... */
548 token_type_object, /* A dereferenced variable, maybe a blessed
549 object. */
550 token_type_symbol, /* symbol, number */
551 token_type_regex_op, /* s, tr, y, m. */
552 token_type_dot, /* . */
553 token_type_other, /* regexp, misc. operator */
554 /* The following are not really token types, but variants used by
555 the parser. */
556 token_type_keyword_symbol /* keyword symbol */
557 };
558 typedef enum token_type_ty token_type_ty;
559
560 /* Subtypes for strings, important for interpolation. */
561 enum string_type_ty
562 {
563 string_type_verbatim, /* "<<'EOF'", "m'...'", "s'...''...'",
564 "tr/.../.../", "y/.../.../". */
565 string_type_q, /* "'..'", "q/.../". */
566 string_type_qq, /* '"..."', "`...`", "qq/.../", "qx/.../",
567 "<file*glob>". */
568 string_type_qr /* Not supported. */
569 };
570
571 /* Subtypes for symbols, important for dollar interpretation. */
572 enum symbol_type_ty
573 {
574 symbol_type_none, /* Nothing special. */
575 symbol_type_sub, /* 'sub'. */
576 symbol_type_function /* Function name after 'sub'. */
577 };
578
579 typedef struct token_ty token_ty;
580 struct token_ty
581 {
582 token_type_ty type;
583 token_type_ty last_type;
584 int sub_type; /* for token_type_string, token_type_symbol */
585 char *string; /* for: in encoding:
586 token_type_named_op ASCII
587 token_type_string UTF-8
588 token_type_symbol ASCII
589 token_type_variable global_source_encoding
590 token_type_object global_source_encoding
591 */
592 refcounted_string_list_ty *comment; /* for token_type_string */
593 int line_number;
594 };
595
596 #if DEBUG_PERL
597 static const char *
token2string(const token_ty * token)598 token2string (const token_ty *token)
599 {
600 switch (token->type)
601 {
602 case token_type_eof:
603 return "token_type_eof";
604 case token_type_lparen:
605 return "token_type_lparen";
606 case token_type_rparen:
607 return "token_type_rparen";
608 case token_type_comma:
609 return "token_type_comma";
610 case token_type_fat_comma:
611 return "token_type_fat_comma";
612 case token_type_dereference:
613 return "token_type_dereference";
614 case token_type_semicolon:
615 return "token_type_semicolon";
616 case token_type_lbrace:
617 return "token_type_lbrace";
618 case token_type_rbrace:
619 return "token_type_rbrace";
620 case token_type_lbracket:
621 return "token_type_lbracket";
622 case token_type_rbracket:
623 return "token_type_rbracket";
624 case token_type_string:
625 return "token_type_string";
626 case token_type_number:
627 return "token type number";
628 case token_type_named_op:
629 return "token_type_named_op";
630 case token_type_variable:
631 return "token_type_variable";
632 case token_type_object:
633 return "token_type_object";
634 case token_type_symbol:
635 return "token_type_symbol";
636 case token_type_regex_op:
637 return "token_type_regex_op";
638 case token_type_dot:
639 return "token_type_dot";
640 case token_type_other:
641 return "token_type_other";
642 default:
643 return "unknown";
644 }
645 }
646 #endif
647
648 /* Free the memory pointed to by a 'struct token_ty'. */
649 static inline void
free_token(token_ty * tp)650 free_token (token_ty *tp)
651 {
652 switch (tp->type)
653 {
654 case token_type_named_op:
655 case token_type_string:
656 case token_type_symbol:
657 case token_type_variable:
658 case token_type_object:
659 free (tp->string);
660 break;
661 default:
662 break;
663 }
664 if (tp->type == token_type_string)
665 drop_reference (tp->comment);
666 free (tp);
667 }
668
669 /* Pass 1 of extracting quotes: Find the end of the string, regardless
670 of the semantics of the construct. Return the complete string,
671 including the starting and the trailing delimiter, with backslashes
672 removed where appropriate. */
673 static char *
extract_quotelike_pass1(int delim)674 extract_quotelike_pass1 (int delim)
675 {
676 /* This function is called recursively. No way to allocate stuff
677 statically. Also alloca() is inappropriate due to limited stack
678 size on some platforms. So we use malloc(). */
679 int bufmax = 10;
680 char *buffer = XNMALLOC (bufmax, char);
681 int bufpos = 0;
682 bool nested = true;
683 int counter_delim;
684
685 buffer[bufpos++] = delim;
686
687 /* Find the closing delimiter. */
688 switch (delim)
689 {
690 case '(':
691 counter_delim = ')';
692 break;
693 case '{':
694 counter_delim = '}';
695 break;
696 case '[':
697 counter_delim = ']';
698 break;
699 case '<':
700 counter_delim = '>';
701 break;
702 default: /* "..." or '...' or |...| etc. */
703 nested = false;
704 counter_delim = delim;
705 break;
706 }
707
708 for (;;)
709 {
710 int c = phase1_getc ();
711
712 /* This round can produce 1 or 2 bytes. Ensure room for 2 bytes. */
713 if (bufpos + 2 > bufmax)
714 {
715 bufmax = 2 * bufmax + 10;
716 buffer = xrealloc (buffer, bufmax);
717 }
718
719 if (c == counter_delim || c == EOF)
720 {
721 buffer[bufpos++] = counter_delim; /* will be stripped off later */
722 buffer[bufpos++] = '\0';
723 #if DEBUG_PERL
724 fprintf (stderr, "PASS1: %s\n", buffer);
725 #endif
726 return buffer;
727 }
728
729 if (nested && c == delim)
730 {
731 char *inner = extract_quotelike_pass1 (delim);
732 size_t len = strlen (inner);
733
734 /* Ensure room for len + 1 bytes. */
735 if (bufpos + len >= bufmax)
736 {
737 do
738 bufmax = 2 * bufmax + 10;
739 while (bufpos + len >= bufmax);
740 buffer = xrealloc (buffer, bufmax);
741 }
742 strcpy (buffer + bufpos, inner);
743 free (inner);
744 bufpos += len;
745 }
746 else if (c == '\\')
747 {
748 c = phase1_getc ();
749 if (c == '\\')
750 {
751 buffer[bufpos++] = '\\';
752 buffer[bufpos++] = '\\';
753 }
754 else if (c == delim || c == counter_delim)
755 {
756 /* This is pass2 in Perl. */
757 buffer[bufpos++] = c;
758 }
759 else
760 {
761 buffer[bufpos++] = '\\';
762 phase1_ungetc (c);
763 }
764 }
765 else
766 {
767 buffer[bufpos++] = c;
768 }
769 }
770 }
771
772 /* Like extract_quotelike_pass1, but return the complete string in UTF-8
773 encoding. */
774 static char *
extract_quotelike_pass1_utf8(int delim)775 extract_quotelike_pass1_utf8 (int delim)
776 {
777 char *string = extract_quotelike_pass1 (delim);
778 char *utf8_string =
779 from_current_source_encoding (string, lc_string, logical_file_name,
780 line_number);
781 if (utf8_string != string)
782 free (string);
783 return utf8_string;
784 }
785
786
787 /* ========= Reading of tokens and commands. Extracting strings. ========= */
788
789
790 /* Context lookup table. */
791 static flag_context_list_table_ty *flag_context_list_table;
792
793
794 /* Forward declaration of local functions. */
795 static void interpolate_keywords (message_list_ty *mlp, const char *string,
796 int lineno);
797 static token_ty *x_perl_lex (message_list_ty *mlp);
798 static void x_perl_unlex (token_ty *tp);
799 static bool extract_balanced (message_list_ty *mlp,
800 token_type_ty delim, bool eat_delim,
801 bool comma_delim,
802 flag_context_ty outer_context,
803 flag_context_list_iterator_ty context_iter,
804 int arg, struct arglist_parser *argparser);
805
806
807 /* Extract an unsigned hexadecimal number from STRING, considering at
808 most LEN bytes and place the result in *RESULT. Returns a pointer
809 to the first character past the hexadecimal number. */
810 static const char *
extract_hex(const char * string,size_t len,unsigned int * result)811 extract_hex (const char *string, size_t len, unsigned int *result)
812 {
813 size_t i;
814
815 *result = 0;
816
817 for (i = 0; i < len; i++)
818 {
819 char c = string[i];
820 int number;
821
822 if (c >= 'A' && c <= 'F')
823 number = c - 'A' + 10;
824 else if (c >= 'a' && c <= 'f')
825 number = c - 'a' + 10;
826 else if (c >= '0' && c <= '9')
827 number = c - '0';
828 else
829 break;
830
831 *result <<= 4;
832 *result |= number;
833 }
834
835 return string + i;
836 }
837
838 /* Extract an unsigned octal number from STRING, considering at
839 most LEN bytes and place the result in *RESULT. Returns a pointer
840 to the first character past the octal number. */
841 static const char *
extract_oct(const char * string,size_t len,unsigned int * result)842 extract_oct (const char *string, size_t len, unsigned int *result)
843 {
844 size_t i;
845
846 *result = 0;
847
848 for (i = 0; i < len; i++)
849 {
850 char c = string[i];
851 int number;
852
853 if (c >= '0' && c <= '7')
854 number = c - '0';
855 else
856 break;
857
858 *result <<= 3;
859 *result |= number;
860 }
861
862 return string + i;
863 }
864
865 /* Extract the various quotelike constructs except for <<EOF. See the
866 section "Gory details of parsing quoted constructs" in perlop.pod.
867 Return the resulting token in *tp; tp->type == token_type_string. */
868 static void
extract_quotelike(token_ty * tp,int delim)869 extract_quotelike (token_ty *tp, int delim)
870 {
871 char *string = extract_quotelike_pass1_utf8 (delim);
872 size_t len = strlen (string);
873
874 tp->type = token_type_string;
875 /* Take the string without the delimiters at the start and at the end. */
876 if (!(len >= 2))
877 abort ();
878 string[len - 1] = '\0';
879 tp->string = xstrdup (string + 1);
880 free (string);
881 tp->comment = add_reference (savable_comment);
882 }
883
884 /* Extract the quotelike constructs with double delimiters, like
885 s/[SEARCH]/[REPLACE]/. This function does not eat up trailing
886 modifiers (left to the caller).
887 Return the resulting token in *tp; tp->type == token_type_regex_op. */
888 static void
extract_triple_quotelike(message_list_ty * mlp,token_ty * tp,int delim,bool interpolate)889 extract_triple_quotelike (message_list_ty *mlp, token_ty *tp, int delim,
890 bool interpolate)
891 {
892 char *string;
893
894 tp->type = token_type_regex_op;
895
896 string = extract_quotelike_pass1_utf8 (delim);
897 if (interpolate)
898 interpolate_keywords (mlp, string, line_number);
899 free (string);
900
901 if (delim == '(' || delim == '<' || delim == '{' || delim == '[')
902 {
903 /* The delimiter for the second string can be different, e.g.
904 s{SEARCH}{REPLACE} or s{SEARCH}/REPLACE/. See "man perlrequick". */
905 delim = phase1_getc ();
906 while (is_whitespace (delim))
907 {
908 /* The hash-sign is not a valid delimiter after whitespace, ergo
909 use phase2_getc() and not phase1_getc() now. */
910 delim = phase2_getc ();
911 }
912 }
913 string = extract_quotelike_pass1_utf8 (delim);
914 if (interpolate)
915 interpolate_keywords (mlp, string, line_number);
916 free (string);
917 }
918
919 /* Perform pass 3 of quotelike extraction (interpolation).
920 *tp is a token of type token_type_string.
921 This function replaces tp->string.
922 This function does not access tp->comment. */
923 /* FIXME: Currently may writes null-bytes into the string. */
924 static void
extract_quotelike_pass3(token_ty * tp,int error_level)925 extract_quotelike_pass3 (token_ty *tp, int error_level)
926 {
927 static char *buffer;
928 static int bufmax = 0;
929 int bufpos = 0;
930 const char *crs;
931 bool uppercase;
932 bool lowercase;
933 bool quotemeta;
934
935 #if DEBUG_PERL
936 switch (tp->sub_type)
937 {
938 case string_type_verbatim:
939 fprintf (stderr, "Interpolating string_type_verbatim:\n");
940 break;
941 case string_type_q:
942 fprintf (stderr, "Interpolating string_type_q:\n");
943 break;
944 case string_type_qq:
945 fprintf (stderr, "Interpolating string_type_qq:\n");
946 break;
947 case string_type_qr:
948 fprintf (stderr, "Interpolating string_type_qr:\n");
949 break;
950 }
951 fprintf (stderr, "%s\n", tp->string);
952 if (tp->sub_type == string_type_verbatim)
953 fprintf (stderr, "---> %s\n", tp->string);
954 #endif
955
956 if (tp->sub_type == string_type_verbatim)
957 return;
958
959 /* Loop over tp->string, accumulating the expansion in buffer. */
960 crs = tp->string;
961 uppercase = false;
962 lowercase = false;
963 quotemeta = false;
964 while (*crs)
965 {
966 bool backslashed;
967
968 /* Ensure room for 7 bytes, 6 (multi-)bytes plus a leading backslash
969 if \Q modifier is present. */
970 if (bufpos + 7 > bufmax)
971 {
972 bufmax = 2 * bufmax + 10;
973 buffer = xrealloc (buffer, bufmax);
974 }
975
976 if (tp->sub_type == string_type_q)
977 {
978 switch (*crs)
979 {
980 case '\\':
981 if (crs[1] == '\\')
982 {
983 crs += 2;
984 buffer[bufpos++] = '\\';
985 break;
986 }
987 /* FALLTHROUGH */
988 default:
989 buffer[bufpos++] = *crs++;
990 break;
991 }
992 continue;
993 }
994
995 /* We only get here for double-quoted strings or regular expressions.
996 Unescape escape sequences. */
997 if (*crs == '\\')
998 {
999 switch (crs[1])
1000 {
1001 case 't':
1002 crs += 2;
1003 buffer[bufpos++] = '\t';
1004 continue;
1005 case 'n':
1006 crs += 2;
1007 buffer[bufpos++] = '\n';
1008 continue;
1009 case 'r':
1010 crs += 2;
1011 buffer[bufpos++] = '\r';
1012 continue;
1013 case 'f':
1014 crs += 2;
1015 buffer[bufpos++] = '\f';
1016 continue;
1017 case 'b':
1018 crs += 2;
1019 buffer[bufpos++] = '\b';
1020 continue;
1021 case 'a':
1022 crs += 2;
1023 buffer[bufpos++] = '\a';
1024 continue;
1025 case 'e':
1026 crs += 2;
1027 buffer[bufpos++] = 0x1b;
1028 continue;
1029 case '0': case '1': case '2': case '3':
1030 case '4': case '5': case '6': case '7':
1031 {
1032 unsigned int oct_number;
1033 int length;
1034
1035 crs = extract_oct (crs + 1, 3, &oct_number);
1036
1037 /* FIXME: If one of the variables UPPERCASE or LOWERCASE is
1038 true, the character should be converted to its uppercase
1039 resp. lowercase equivalent. I don't know if the necessary
1040 facilities are already included in gettext. For US-Ascii
1041 the conversion can be already be done, however. */
1042 if (uppercase && oct_number >= 'a' && oct_number <= 'z')
1043 {
1044 oct_number = oct_number - 'a' + 'A';
1045 }
1046 else if (lowercase && oct_number >= 'A' && oct_number <= 'Z')
1047 {
1048 oct_number = oct_number - 'A' + 'a';
1049 }
1050
1051
1052 /* Yes, octal escape sequences in the range 0x100..0x1ff are
1053 valid. */
1054 length = u8_uctomb ((unsigned char *) (buffer + bufpos),
1055 oct_number, 2);
1056 if (length > 0)
1057 bufpos += length;
1058 }
1059 continue;
1060 case 'x':
1061 {
1062 unsigned int hex_number = 0;
1063 int length;
1064
1065 crs += 2;
1066 if (*crs == '{')
1067 {
1068 const char *end = strchr (crs, '}');
1069 if (end == NULL)
1070 {
1071 error_with_progname = false;
1072 error (error_level, 0,
1073 _("%s:%d: missing right brace on \\x{HEXNUMBER}"),
1074 real_file_name, line_number);
1075 error_with_progname = true;
1076 ++crs;
1077 continue;
1078 }
1079 else
1080 {
1081 ++crs;
1082 (void) extract_hex (crs, end - crs, &hex_number);
1083 crs = end + 1;
1084 }
1085 }
1086 else
1087 {
1088 crs = extract_hex (crs, 2, &hex_number);
1089 }
1090
1091 /* FIXME: If one of the variables UPPERCASE or LOWERCASE is
1092 true, the character should be converted to its uppercase
1093 resp. lowercase equivalent. I don't know if the necessary
1094 facilities are already included in gettext. For US-Ascii
1095 the conversion can be already be done, however. */
1096 if (uppercase && hex_number >= 'a' && hex_number <= 'z')
1097 {
1098 hex_number = hex_number - 'a' + 'A';
1099 }
1100 else if (lowercase && hex_number >= 'A' && hex_number <= 'Z')
1101 {
1102 hex_number = hex_number - 'A' + 'a';
1103 }
1104
1105 length = u8_uctomb ((unsigned char *) (buffer + bufpos),
1106 hex_number, 6);
1107
1108 if (length > 0)
1109 bufpos += length;
1110 }
1111 continue;
1112 case 'c':
1113 /* Perl's notion of control characters. */
1114 crs += 2;
1115 if (*crs)
1116 {
1117 int the_char = (unsigned char) *crs;
1118 if (the_char >= 'a' && the_char <= 'z')
1119 the_char = the_char - 'a' + 'A';
1120 buffer[bufpos++] = the_char ^ 0x40;
1121 }
1122 continue;
1123 case 'N':
1124 crs += 2;
1125 if (*crs == '{')
1126 {
1127 const char *end = strchr (crs + 1, '}');
1128 if (end != NULL)
1129 {
1130 char *name;
1131 unsigned int unicode;
1132
1133 name = XNMALLOC (end - (crs + 1) + 1, char);
1134 memcpy (name, crs + 1, end - (crs + 1));
1135 name[end - (crs + 1)] = '\0';
1136
1137 unicode = unicode_name_character (name);
1138 if (unicode != UNINAME_INVALID)
1139 {
1140 /* FIXME: Convert to upper/lowercase if the
1141 corresponding flag is set to true. */
1142 int length =
1143 u8_uctomb ((unsigned char *) (buffer + bufpos),
1144 unicode, 6);
1145 if (length > 0)
1146 bufpos += length;
1147 }
1148
1149 free (name);
1150
1151 crs = end + 1;
1152 }
1153 }
1154 continue;
1155 }
1156 }
1157
1158 /* No escape sequence, go on. */
1159 if (*crs == '\\')
1160 {
1161 ++crs;
1162 switch (*crs)
1163 {
1164 case 'E':
1165 uppercase = false;
1166 lowercase = false;
1167 quotemeta = false;
1168 ++crs;
1169 continue;
1170 case 'L':
1171 uppercase = false;
1172 lowercase = true;
1173 ++crs;
1174 continue;
1175 case 'U':
1176 uppercase = true;
1177 lowercase = false;
1178 ++crs;
1179 continue;
1180 case 'Q':
1181 quotemeta = true;
1182 ++crs;
1183 continue;
1184 case 'l':
1185 ++crs;
1186 if (*crs >= 'A' && *crs <= 'Z')
1187 {
1188 buffer[bufpos++] = *crs - 'A' + 'a';
1189 }
1190 else if ((unsigned char) *crs >= 0x80)
1191 {
1192 error_with_progname = false;
1193 error (error_level, 0,
1194 _("%s:%d: invalid interpolation (\"\\l\") of 8bit character \"%c\""),
1195 real_file_name, line_number, *crs);
1196 error_with_progname = true;
1197 }
1198 else
1199 {
1200 buffer[bufpos++] = *crs;
1201 }
1202 ++crs;
1203 continue;
1204 case 'u':
1205 ++crs;
1206 if (*crs >= 'a' && *crs <= 'z')
1207 {
1208 buffer[bufpos++] = *crs - 'a' + 'A';
1209 }
1210 else if ((unsigned char) *crs >= 0x80)
1211 {
1212 error_with_progname = false;
1213 error (error_level, 0,
1214 _("%s:%d: invalid interpolation (\"\\u\") of 8bit character \"%c\""),
1215 real_file_name, line_number, *crs);
1216 error_with_progname = true;
1217 }
1218 else
1219 {
1220 buffer[bufpos++] = *crs;
1221 }
1222 ++crs;
1223 continue;
1224 case '\\':
1225 buffer[bufpos++] = *crs;
1226 ++crs;
1227 continue;
1228 default:
1229 backslashed = true;
1230 break;
1231 }
1232 }
1233 else
1234 backslashed = false;
1235
1236 if (quotemeta
1237 && !((*crs >= 'A' && *crs <= 'Z') || (*crs >= 'A' && *crs <= 'z')
1238 || (*crs >= '0' && *crs <= '9') || *crs == '_'))
1239 {
1240 buffer[bufpos++] = '\\';
1241 backslashed = true;
1242 }
1243
1244 if (!backslashed && !extract_all && (*crs == '$' || *crs == '@'))
1245 {
1246 error_with_progname = false;
1247 error (error_level, 0,
1248 _("%s:%d: invalid variable interpolation at \"%c\""),
1249 real_file_name, line_number, *crs);
1250 error_with_progname = true;
1251 ++crs;
1252 }
1253 else if (lowercase)
1254 {
1255 if (*crs >= 'A' && *crs <= 'Z')
1256 buffer[bufpos++] = *crs - 'A' + 'a';
1257 else if ((unsigned char) *crs >= 0x80)
1258 {
1259 error_with_progname = false;
1260 error (error_level, 0,
1261 _("%s:%d: invalid interpolation (\"\\L\") of 8bit character \"%c\""),
1262 real_file_name, line_number, *crs);
1263 error_with_progname = true;
1264 buffer[bufpos++] = *crs;
1265 }
1266 else
1267 buffer[bufpos++] = *crs;
1268 ++crs;
1269 }
1270 else if (uppercase)
1271 {
1272 if (*crs >= 'a' && *crs <= 'z')
1273 buffer[bufpos++] = *crs - 'a' + 'A';
1274 else if ((unsigned char) *crs >= 0x80)
1275 {
1276 error_with_progname = false;
1277 error (error_level, 0,
1278 _("%s:%d: invalid interpolation (\"\\U\") of 8bit character \"%c\""),
1279 real_file_name, line_number, *crs);
1280 error_with_progname = true;
1281 buffer[bufpos++] = *crs;
1282 }
1283 else
1284 buffer[bufpos++] = *crs;
1285 ++crs;
1286 }
1287 else
1288 {
1289 buffer[bufpos++] = *crs++;
1290 }
1291 }
1292
1293 /* Ensure room for 1 more byte. */
1294 if (bufpos >= bufmax)
1295 {
1296 bufmax = 2 * bufmax + 10;
1297 buffer = xrealloc (buffer, bufmax);
1298 }
1299
1300 buffer[bufpos++] = '\0';
1301
1302 #if DEBUG_PERL
1303 fprintf (stderr, "---> %s\n", buffer);
1304 #endif
1305
1306 /* Replace tp->string. */
1307 free (tp->string);
1308 tp->string = xstrdup (buffer);
1309 }
1310
1311 /* Parse a variable. This is done in several steps:
1312 1) Consume all leading occurencies of '$', '@', '%', and '*'.
1313 2) Determine the name of the variable from the following input.
1314 3) Parse possible following hash keys or array indexes.
1315 */
1316 static void
extract_variable(message_list_ty * mlp,token_ty * tp,int first)1317 extract_variable (message_list_ty *mlp, token_ty *tp, int first)
1318 {
1319 static char *buffer;
1320 static int bufmax = 0;
1321 int bufpos = 0;
1322 int c = first;
1323 size_t varbody_length = 0;
1324 bool maybe_hash_deref = false;
1325 bool maybe_hash_value = false;
1326
1327 tp->type = token_type_variable;
1328
1329 #if DEBUG_PERL
1330 fprintf (stderr, "%s:%d: extracting variable type '%c'\n",
1331 real_file_name, line_number, first);
1332 #endif
1333
1334 /*
1335 * 1) Consume dollars and so on (not euros ...). Unconditionally
1336 * accepting the hash sign (#) will maybe lead to inaccurate
1337 * results. FIXME!
1338 */
1339 while (c == '$' || c == '*' || c == '#' || c == '@' || c == '%')
1340 {
1341 if (bufpos >= bufmax)
1342 {
1343 bufmax = 2 * bufmax + 10;
1344 buffer = xrealloc (buffer, bufmax);
1345 }
1346 buffer[bufpos++] = c;
1347 c = phase1_getc ();
1348 }
1349
1350 if (c == EOF)
1351 {
1352 tp->type = token_type_eof;
1353 return;
1354 }
1355
1356 /* Hash references are treated in a special way, when looking for
1357 our keywords. */
1358 if (buffer[0] == '$')
1359 {
1360 if (bufpos == 1)
1361 maybe_hash_value = true;
1362 else if (bufpos == 2 && buffer[1] == '$')
1363 {
1364 if (!(c == '{'
1365 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1366 || (c >= '0' && c <= '9')
1367 || c == '_' || c == ':' || c == '\'' || c >= 0x80))
1368 {
1369 /* Special variable $$ for pid. */
1370 if (bufpos >= bufmax)
1371 {
1372 bufmax = 2 * bufmax + 10;
1373 buffer = xrealloc (buffer, bufmax);
1374 }
1375 buffer[bufpos++] = '\0';
1376 tp->string = xstrdup (buffer);
1377 #if DEBUG_PERL
1378 fprintf (stderr, "%s:%d: is PID ($$)\n",
1379 real_file_name, line_number);
1380 #endif
1381
1382 phase1_ungetc (c);
1383 return;
1384 }
1385
1386 maybe_hash_deref = true;
1387 bufpos = 1;
1388 }
1389 }
1390
1391 /*
1392 * 2) Get the name of the variable. The first character is practically
1393 * arbitrary. Punctuation and numbers automagically put a variable
1394 * in the global namespace but that subtle difference is not interesting
1395 * for us.
1396 */
1397 if (bufpos >= bufmax)
1398 {
1399 bufmax = 2 * bufmax + 10;
1400 buffer = xrealloc (buffer, bufmax);
1401 }
1402 if (c == '{')
1403 {
1404 /* Yuck, we cannot accept ${gettext} as a keyword... Except for
1405 * debugging purposes it is also harmless, that we suppress the
1406 * real name of the variable.
1407 */
1408 #if DEBUG_PERL
1409 fprintf (stderr, "%s:%d: braced {variable_name}\n",
1410 real_file_name, line_number);
1411 #endif
1412
1413 if (extract_balanced (mlp, token_type_rbrace, true, false,
1414 null_context, null_context_list_iterator,
1415 1, arglist_parser_alloc (mlp, NULL)))
1416 {
1417 tp->type = token_type_eof;
1418 return;
1419 }
1420 buffer[bufpos++] = c;
1421 }
1422 else
1423 {
1424 while ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1425 || (c >= '0' && c <= '9')
1426 || c == '_' || c == ':' || c == '\'' || c >= 0x80)
1427 {
1428 ++varbody_length;
1429 if (bufpos >= bufmax)
1430 {
1431 bufmax = 2 * bufmax + 10;
1432 buffer = xrealloc (buffer, bufmax);
1433 }
1434 buffer[bufpos++] = c;
1435 c = phase1_getc ();
1436 }
1437 phase1_ungetc (c);
1438 }
1439
1440 /* Probably some strange Perl variable like $`. */
1441 if (varbody_length == 0)
1442 {
1443 c = phase1_getc ();
1444 if (c == EOF || is_whitespace (c))
1445 phase1_ungetc (c); /* Loser. */
1446 else
1447 {
1448 if (bufpos >= bufmax)
1449 {
1450 bufmax = 2 * bufmax + 10;
1451 buffer = xrealloc (buffer, bufmax);
1452 }
1453 buffer[bufpos++] = c;
1454 }
1455 }
1456
1457 if (bufpos >= bufmax)
1458 {
1459 bufmax = 2 * bufmax + 10;
1460 buffer = xrealloc (buffer, bufmax);
1461 }
1462 buffer[bufpos++] = '\0';
1463
1464 tp->string = xstrdup (buffer);
1465
1466 #if DEBUG_PERL
1467 fprintf (stderr, "%s:%d: complete variable name: %s\n",
1468 real_file_name, line_number, tp->string);
1469 #endif
1470
1471 /*
1472 * 3) If the following looks strange to you, this is valid Perl syntax:
1473 *
1474 * $var = $$hashref # We can place a
1475 * # comment here and then ...
1476 * {key_into_hashref};
1477 *
1478 * POD sections are not allowed but we leave complaints about
1479 * that to the compiler/interpreter.
1480 */
1481 /* We only extract strings from the first hash key (if present). */
1482
1483 if (maybe_hash_deref || maybe_hash_value)
1484 {
1485 bool is_dereference = false;
1486 int c;
1487
1488 do
1489 c = phase2_getc ();
1490 while (is_whitespace (c));
1491
1492 if (c == '-')
1493 {
1494 int c2 = phase1_getc ();
1495
1496 if (c2 == '>')
1497 {
1498 is_dereference = true;
1499
1500 do
1501 c = phase2_getc ();
1502 while (is_whitespace (c));
1503 }
1504 else if (c2 != '\n')
1505 {
1506 /* Discarding the newline is harmless here. The only
1507 special character recognized after a minus is greater-than
1508 for dereference. However, the sequence "-\n>" that we
1509 treat incorrectly here, is a syntax error. */
1510 phase1_ungetc (c2);
1511 }
1512 }
1513
1514 if (maybe_hash_value && is_dereference)
1515 {
1516 tp->type = token_type_object;
1517 #if DEBUG_PERL
1518 fprintf (stderr, "%s:%d: first keys preceded by \"->\"\n",
1519 real_file_name, line_number);
1520 #endif
1521 }
1522 else if (maybe_hash_value)
1523 {
1524 /* Fake it into a hash. */
1525 tp->string[0] = '%';
1526 }
1527
1528 /* Do NOT change that into else if (see above). */
1529 if ((maybe_hash_value || maybe_hash_deref) && c == '{')
1530 {
1531 void *keyword_value;
1532
1533 #if DEBUG_PERL
1534 fprintf (stderr, "%s:%d: first keys preceded by '{'\n",
1535 real_file_name, line_number);
1536 #endif
1537
1538 if (hash_find_entry (&keywords, tp->string, strlen (tp->string),
1539 &keyword_value) == 0)
1540 {
1541 /* TODO: Shouldn't we use the shapes of the keyword, instead
1542 of hardwiring argnum1 = 1 ?
1543 const struct callshapes *shapes =
1544 (const struct callshapes *) keyword_value;
1545 */
1546 struct callshapes shapes;
1547 shapes.keyword = tp->string; /* XXX storage duration? */
1548 shapes.keyword_len = strlen (tp->string);
1549 shapes.nshapes = 1;
1550 shapes.shapes[0].argnum1 = 1;
1551 shapes.shapes[0].argnum2 = 0;
1552 shapes.shapes[0].argnumc = 0;
1553 shapes.shapes[0].argnum1_glib_context = false;
1554 shapes.shapes[0].argnum2_glib_context = false;
1555 shapes.shapes[0].argtotal = 0;
1556 string_list_init (&shapes.shapes[0].xcomments);
1557
1558 {
1559 /* Extract a possible string from the key. Before proceeding
1560 we check whether the open curly is followed by a symbol and
1561 then by a right curly. */
1562 flag_context_list_iterator_ty context_iter =
1563 flag_context_list_iterator (
1564 flag_context_list_table_lookup (
1565 flag_context_list_table,
1566 tp->string, strlen (tp->string)));
1567 token_ty *t1 = x_perl_lex (mlp);
1568
1569 #if DEBUG_PERL
1570 fprintf (stderr, "%s:%d: extracting string key\n",
1571 real_file_name, line_number);
1572 #endif
1573
1574 if (t1->type == token_type_symbol
1575 || t1->type == token_type_named_op)
1576 {
1577 token_ty *t2 = x_perl_lex (mlp);
1578 if (t2->type == token_type_rbrace)
1579 {
1580 flag_context_ty context;
1581 lex_pos_ty pos;
1582
1583 context =
1584 inherited_context (null_context,
1585 flag_context_list_iterator_advance (
1586 &context_iter));
1587
1588 pos.line_number = line_number;
1589 pos.file_name = logical_file_name;
1590
1591 remember_a_message (mlp, NULL, xstrdup (t1->string),
1592 true, false, context, &pos,
1593 NULL, savable_comment, true);
1594 free_token (t2);
1595 free_token (t1);
1596 }
1597 else
1598 {
1599 x_perl_unlex (t2);
1600 }
1601 }
1602 else
1603 {
1604 x_perl_unlex (t1);
1605 if (extract_balanced (mlp, token_type_rbrace, true, false,
1606 null_context, context_iter,
1607 1, arglist_parser_alloc (mlp, &shapes)))
1608 return;
1609 }
1610 }
1611 }
1612 else
1613 {
1614 phase2_ungetc (c);
1615 }
1616 }
1617 else
1618 {
1619 phase2_ungetc (c);
1620 }
1621 }
1622
1623 /* Now consume "->", "[...]", and "{...}". */
1624 for (;;)
1625 {
1626 int c = phase2_getc ();
1627 int c2;
1628
1629 switch (c)
1630 {
1631 case '{':
1632 #if DEBUG_PERL
1633 fprintf (stderr, "%s:%d: extracting balanced '{' after varname\n",
1634 real_file_name, line_number);
1635 #endif
1636 extract_balanced (mlp, token_type_rbrace, true, false,
1637 null_context, null_context_list_iterator,
1638 1, arglist_parser_alloc (mlp, NULL));
1639 break;
1640
1641 case '[':
1642 #if DEBUG_PERL
1643 fprintf (stderr, "%s:%d: extracting balanced '[' after varname\n",
1644 real_file_name, line_number);
1645 #endif
1646 extract_balanced (mlp, token_type_rbracket, true, false,
1647 null_context, null_context_list_iterator,
1648 1, arglist_parser_alloc (mlp, NULL));
1649 break;
1650
1651 case '-':
1652 c2 = phase1_getc ();
1653 if (c2 == '>')
1654 {
1655 #if DEBUG_PERL
1656 fprintf (stderr, "%s:%d: another \"->\" after varname\n",
1657 real_file_name, line_number);
1658 #endif
1659 break;
1660 }
1661 else if (c2 != '\n')
1662 {
1663 /* Discarding the newline is harmless here. The only
1664 special character recognized after a minus is greater-than
1665 for dereference. However, the sequence "-\n>" that we
1666 treat incorrectly here, is a syntax error. */
1667 phase1_ungetc (c2);
1668 }
1669 /* FALLTHROUGH */
1670
1671 default:
1672 #if DEBUG_PERL
1673 fprintf (stderr, "%s:%d: variable finished\n",
1674 real_file_name, line_number);
1675 #endif
1676 phase2_ungetc (c);
1677 return;
1678 }
1679 }
1680 }
1681
1682 /* Actually a simplified version of extract_variable(). It searches for
1683 variables inside a double-quoted string that may interpolate to
1684 some keyword hash (reference). The string is UTF-8 encoded. */
1685 static void
interpolate_keywords(message_list_ty * mlp,const char * string,int lineno)1686 interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
1687 {
1688 static char *buffer;
1689 static int bufmax = 0;
1690 int bufpos = 0;
1691 flag_context_ty context;
1692 int c;
1693 bool maybe_hash_deref = false;
1694 enum parser_state
1695 {
1696 initial,
1697 one_dollar,
1698 two_dollars,
1699 identifier,
1700 minus,
1701 wait_lbrace,
1702 wait_quote,
1703 dquote,
1704 squote,
1705 barekey,
1706 wait_rbrace
1707 } state;
1708 token_ty token;
1709
1710 lex_pos_ty pos;
1711
1712 /* States are:
1713 *
1714 * initial: initial
1715 * one_dollar: dollar sign seen in state INITIAL
1716 * two_dollars: another dollar-sign has been seen in state ONE_DOLLAR
1717 * identifier: a valid identifier character has been seen in state
1718 * ONE_DOLLAR or TWO_DOLLARS
1719 * minus: a minus-sign has been seen in state IDENTIFIER
1720 * wait_lbrace: a greater-than has been seen in state MINUS
1721 * wait_quote: a left brace has been seen in state IDENTIFIER or in
1722 * state WAIT_LBRACE
1723 * dquote: a double-quote has been seen in state WAIT_QUOTE
1724 * squote: a single-quote has been seen in state WAIT_QUOTE
1725 * barekey: an bareword character has been seen in state WAIT_QUOTE
1726 * wait_rbrace: closing quote has been seen in state DQUOTE or SQUOTE
1727 *
1728 * In the states initial...identifier the context is null_context; in the
1729 * states minus...wait_rbrace the context is the one suitable for the first
1730 * argument of the last seen identifier.
1731 */
1732 state = initial;
1733 context = null_context;
1734
1735 token.type = token_type_string;
1736 token.sub_type = string_type_qq;
1737 token.line_number = line_number;
1738 /* No need for token.comment = add_reference (savable_comment); here.
1739 We can let token.comment uninitialized here, and use savable_comment
1740 directly, because this function only parses the given string and does
1741 not call phase2_getc. */
1742 pos.file_name = logical_file_name;
1743 pos.line_number = lineno;
1744
1745 while ((c = (unsigned char) *string++) != '\0')
1746 {
1747 void *keyword_value;
1748
1749 if (state == initial)
1750 bufpos = 0;
1751
1752 if (c == '\n')
1753 lineno++;
1754
1755 if (bufpos + 1 >= bufmax)
1756 {
1757 bufmax = 2 * bufmax + 10;
1758 buffer = xrealloc (buffer, bufmax);
1759 }
1760
1761 switch (state)
1762 {
1763 case initial:
1764 switch (c)
1765 {
1766 case '\\':
1767 c = (unsigned char) *string++;
1768 if (c == '\0')
1769 return;
1770 break;
1771 case '$':
1772 buffer[bufpos++] = '$';
1773 maybe_hash_deref = false;
1774 state = one_dollar;
1775 break;
1776 default:
1777 break;
1778 }
1779 break;
1780 case one_dollar:
1781 switch (c)
1782 {
1783 case '$':
1784 /*
1785 * This is enough to make us believe later that we dereference
1786 * a hash reference.
1787 */
1788 maybe_hash_deref = true;
1789 state = two_dollars;
1790 break;
1791 default:
1792 if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1793 || (c >= 'A' && c <= 'Z')
1794 || (c >= 'a' && c <= 'z')
1795 || (c >= '0' && c <= '9'))
1796 {
1797 buffer[bufpos++] = c;
1798 state = identifier;
1799 }
1800 else
1801 state = initial;
1802 break;
1803 }
1804 break;
1805 case two_dollars:
1806 if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1807 || (c >= 'A' && c <= 'Z')
1808 || (c >= 'a' && c <= 'z')
1809 || (c >= '0' && c <= '9'))
1810 {
1811 buffer[bufpos++] = c;
1812 state = identifier;
1813 }
1814 else
1815 state = initial;
1816 break;
1817 case identifier:
1818 switch (c)
1819 {
1820 case '-':
1821 if (hash_find_entry (&keywords, buffer, bufpos, &keyword_value)
1822 == 0)
1823 {
1824 flag_context_list_iterator_ty context_iter =
1825 flag_context_list_iterator (
1826 flag_context_list_table_lookup (
1827 flag_context_list_table,
1828 buffer, bufpos));
1829 context =
1830 inherited_context (null_context,
1831 flag_context_list_iterator_advance (
1832 &context_iter));
1833 state = minus;
1834 }
1835 else
1836 state = initial;
1837 break;
1838 case '{':
1839 if (!maybe_hash_deref)
1840 buffer[0] = '%';
1841 if (hash_find_entry (&keywords, buffer, bufpos, &keyword_value)
1842 == 0)
1843 {
1844 flag_context_list_iterator_ty context_iter =
1845 flag_context_list_iterator (
1846 flag_context_list_table_lookup (
1847 flag_context_list_table,
1848 buffer, bufpos));
1849 context =
1850 inherited_context (null_context,
1851 flag_context_list_iterator_advance (
1852 &context_iter));
1853 state = wait_quote;
1854 }
1855 else
1856 state = initial;
1857 break;
1858 default:
1859 if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1860 || (c >= 'A' && c <= 'Z')
1861 || (c >= 'a' && c <= 'z')
1862 || (c >= '0' && c <= '9'))
1863 {
1864 buffer[bufpos++] = c;
1865 }
1866 else
1867 state = initial;
1868 break;
1869 }
1870 break;
1871 case minus:
1872 switch (c)
1873 {
1874 case '>':
1875 state = wait_lbrace;
1876 break;
1877 default:
1878 context = null_context;
1879 state = initial;
1880 break;
1881 }
1882 break;
1883 case wait_lbrace:
1884 switch (c)
1885 {
1886 case '{':
1887 state = wait_quote;
1888 break;
1889 default:
1890 context = null_context;
1891 state = initial;
1892 break;
1893 }
1894 break;
1895 case wait_quote:
1896 switch (c)
1897 {
1898 case_whitespace:
1899 break;
1900 case '\'':
1901 pos.line_number = lineno;
1902 bufpos = 0;
1903 state = squote;
1904 break;
1905 case '"':
1906 pos.line_number = lineno;
1907 bufpos = 0;
1908 state = dquote;
1909 break;
1910 default:
1911 if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
1912 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
1913 {
1914 pos.line_number = lineno;
1915 bufpos = 0;
1916 buffer[bufpos++] = c;
1917 state = barekey;
1918 }
1919 else
1920 {
1921 context = null_context;
1922 state = initial;
1923 }
1924 break;
1925 }
1926 break;
1927 case dquote:
1928 switch (c)
1929 {
1930 case '"':
1931 /* The resulting string has to be interpolated twice. */
1932 buffer[bufpos] = '\0';
1933 token.string = xstrdup (buffer);
1934 extract_quotelike_pass3 (&token, EXIT_FAILURE);
1935 /* The string can only shrink with interpolation (because
1936 we ignore \Q). */
1937 if (!(strlen (token.string) <= bufpos))
1938 abort ();
1939 strcpy (buffer, token.string);
1940 free (token.string);
1941 state = wait_rbrace;
1942 break;
1943 case '\\':
1944 if (string[0] == '\"')
1945 {
1946 buffer[bufpos++] = string++[0];
1947 }
1948 else if (string[0])
1949 {
1950 buffer[bufpos++] = '\\';
1951 buffer[bufpos++] = string++[0];
1952 }
1953 else
1954 {
1955 context = null_context;
1956 state = initial;
1957 }
1958 break;
1959 default:
1960 buffer[bufpos++] = c;
1961 break;
1962 }
1963 break;
1964 case squote:
1965 switch (c)
1966 {
1967 case '\'':
1968 state = wait_rbrace;
1969 break;
1970 case '\\':
1971 if (string[0] == '\'')
1972 {
1973 buffer[bufpos++] = string++[0];
1974 }
1975 else if (string[0])
1976 {
1977 buffer[bufpos++] = '\\';
1978 buffer[bufpos++] = string++[0];
1979 }
1980 else
1981 {
1982 context = null_context;
1983 state = initial;
1984 }
1985 break;
1986 default:
1987 buffer[bufpos++] = c;
1988 break;
1989 }
1990 break;
1991 case barekey:
1992 if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
1993 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
1994 {
1995 buffer[bufpos++] = c;
1996 break;
1997 }
1998 else if (is_whitespace (c))
1999 {
2000 state = wait_rbrace;
2001 break;
2002 }
2003 else if (c != '}')
2004 {
2005 context = null_context;
2006 state = initial;
2007 break;
2008 }
2009 /* Must be right brace. */
2010 /* FALLTHROUGH */
2011 case wait_rbrace:
2012 switch (c)
2013 {
2014 case_whitespace:
2015 break;
2016 case '}':
2017 buffer[bufpos] = '\0';
2018 token.string = xstrdup (buffer);
2019 extract_quotelike_pass3 (&token, EXIT_FAILURE);
2020 remember_a_message (mlp, NULL, token.string, true, false, context,
2021 &pos, NULL, savable_comment, true);
2022 /* FALLTHROUGH */
2023 default:
2024 context = null_context;
2025 state = initial;
2026 break;
2027 }
2028 break;
2029 }
2030 }
2031 }
2032
2033 /* There is an ambiguity about '/' and '?': They can start an operator
2034 (division operator '/' or '/=' or the conditional operator '?'), or they can
2035 start a regular expression. The distinction is important because inside
2036 regular expressions, '#' loses its special meaning. This function helps
2037 making the decision (a heuristic). See the documentation for details. */
2038 static bool
prefer_regexp_over_division(token_type_ty type)2039 prefer_regexp_over_division (token_type_ty type)
2040 {
2041 bool retval = true;
2042
2043 switch (type)
2044 {
2045 case token_type_eof:
2046 retval = true;
2047 break;
2048 case token_type_lparen:
2049 retval = true;
2050 break;
2051 case token_type_rparen:
2052 retval = false;
2053 break;
2054 case token_type_comma:
2055 retval = true;
2056 break;
2057 case token_type_fat_comma:
2058 retval = true;
2059 break;
2060 case token_type_dereference:
2061 retval = true;
2062 break;
2063 case token_type_semicolon:
2064 retval = true;
2065 break;
2066 case token_type_lbrace:
2067 retval = true;
2068 break;
2069 case token_type_rbrace:
2070 retval = false;
2071 break;
2072 case token_type_lbracket:
2073 retval = true;
2074 break;
2075 case token_type_rbracket:
2076 retval = false;
2077 break;
2078 case token_type_string:
2079 retval = false;
2080 break;
2081 case token_type_number:
2082 retval = false;
2083 break;
2084 case token_type_named_op:
2085 retval = true;
2086 break;
2087 case token_type_variable:
2088 retval = false;
2089 break;
2090 case token_type_object:
2091 retval = false;
2092 break;
2093 case token_type_symbol:
2094 case token_type_keyword_symbol:
2095 retval = true;
2096 break;
2097 case token_type_regex_op:
2098 retval = false;
2099 break;
2100 case token_type_dot:
2101 retval = true;
2102 break;
2103 case token_type_other:
2104 retval = true;
2105 break;
2106 }
2107
2108 #if DEBUG_PERL
2109 token_ty ty;
2110 ty.type = type;
2111 fprintf (stderr, "Prefer regexp over division after %s: %s\n",
2112 token2string (&ty), retval ? "true" : "false");
2113 #endif
2114
2115 return retval;
2116 }
2117
2118 /* Last token type seen in the stream. Important for the interpretation
2119 of slash and question mark. */
2120 static token_type_ty last_token_type;
2121
2122 /* Combine characters into tokens. Discard whitespace. */
2123
2124 static void
x_perl_prelex(message_list_ty * mlp,token_ty * tp)2125 x_perl_prelex (message_list_ty *mlp, token_ty *tp)
2126 {
2127 static char *buffer;
2128 static int bufmax;
2129 int bufpos;
2130 int c;
2131
2132 for (;;)
2133 {
2134 c = phase2_getc ();
2135 tp->line_number = line_number;
2136 tp->last_type = last_token_type;
2137
2138 switch (c)
2139 {
2140 case EOF:
2141 tp->type = token_type_eof;
2142 return;
2143
2144 case '\n':
2145 if (last_non_comment_line > last_comment_line)
2146 savable_comment_reset ();
2147 /* FALLTHROUGH */
2148 case '\t':
2149 case ' ':
2150 /* Ignore whitespace. */
2151 continue;
2152
2153 case '%':
2154 case '@':
2155 case '*':
2156 case '$':
2157 if (!extract_all)
2158 {
2159 extract_variable (mlp, tp, c);
2160 return;
2161 }
2162 break;
2163 }
2164
2165 last_non_comment_line = tp->line_number;
2166
2167 switch (c)
2168 {
2169 case '.':
2170 {
2171 int c2 = phase1_getc ();
2172 phase1_ungetc (c2);
2173 if (c2 == '.')
2174 {
2175 tp->type = token_type_other;
2176 return;
2177 }
2178 else if (!(c2 >= '0' && c2 <= '9'))
2179 {
2180 tp->type = token_type_dot;
2181 return;
2182 }
2183 }
2184 /* FALLTHROUGH */
2185 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2186 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
2187 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
2188 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
2189 case 'Y': case 'Z':
2190 case '_':
2191 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2192 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2193 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2194 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
2195 case 'y': case 'z':
2196 case '0': case '1': case '2': case '3': case '4':
2197 case '5': case '6': case '7': case '8': case '9':
2198 /* Symbol, or part of a number. */
2199 bufpos = 0;
2200 for (;;)
2201 {
2202 if (bufpos >= bufmax)
2203 {
2204 bufmax = 2 * bufmax + 10;
2205 buffer = xrealloc (buffer, bufmax);
2206 }
2207 buffer[bufpos++] = c;
2208 c = phase1_getc ();
2209 switch (c)
2210 {
2211 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2212 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
2213 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
2214 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
2215 case 'Y': case 'Z':
2216 case '_':
2217 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2218 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2219 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2220 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
2221 case 'y': case 'z':
2222 case '0': case '1': case '2': case '3': case '4':
2223 case '5': case '6': case '7': case '8': case '9':
2224 continue;
2225
2226 default:
2227 phase1_ungetc (c);
2228 break;
2229 }
2230 break;
2231 }
2232 if (bufpos >= bufmax)
2233 {
2234 bufmax = 2 * bufmax + 10;
2235 buffer = xrealloc (buffer, bufmax);
2236 }
2237 buffer[bufpos] = '\0';
2238
2239 if (strcmp (buffer, "__END__") == 0
2240 || strcmp (buffer, "__DATA__") == 0)
2241 {
2242 end_of_file = true;
2243 tp->type = token_type_eof;
2244 return;
2245 }
2246 else if (strcmp (buffer, "and") == 0
2247 || strcmp (buffer, "cmp") == 0
2248 || strcmp (buffer, "eq") == 0
2249 || strcmp (buffer, "if") == 0
2250 || strcmp (buffer, "ge") == 0
2251 || strcmp (buffer, "gt") == 0
2252 || strcmp (buffer, "le") == 0
2253 || strcmp (buffer, "lt") == 0
2254 || strcmp (buffer, "ne") == 0
2255 || strcmp (buffer, "not") == 0
2256 || strcmp (buffer, "or") == 0
2257 || strcmp (buffer, "unless") == 0
2258 || strcmp (buffer, "while") == 0
2259 || strcmp (buffer, "xor") == 0)
2260 {
2261 tp->type = token_type_named_op;
2262 tp->string = xstrdup (buffer);
2263 return;
2264 }
2265 else if (strcmp (buffer, "s") == 0
2266 || strcmp (buffer, "y") == 0
2267 || strcmp (buffer, "tr") == 0)
2268 {
2269 int delim = phase1_getc ();
2270
2271 while (is_whitespace (delim))
2272 delim = phase2_getc ();
2273
2274 if (delim == EOF)
2275 {
2276 tp->type = token_type_eof;
2277 return;
2278 }
2279 if ((delim >= '0' && delim <= '9')
2280 || (delim >= 'A' && delim <= 'Z')
2281 || (delim >= 'a' && delim <= 'z'))
2282 {
2283 /* False positive. */
2284 phase2_ungetc (delim);
2285 tp->type = token_type_symbol;
2286 tp->sub_type = symbol_type_none;
2287 tp->string = xstrdup (buffer);
2288 return;
2289 }
2290 extract_triple_quotelike (mlp, tp, delim,
2291 buffer[0] == 's' && delim != '\'');
2292
2293 /* Eat the following modifiers. */
2294 do
2295 c = phase1_getc ();
2296 while (c >= 'a' && c <= 'z');
2297 phase1_ungetc (c);
2298 return;
2299 }
2300 else if (strcmp (buffer, "m") == 0)
2301 {
2302 int delim = phase1_getc ();
2303
2304 while (is_whitespace (delim))
2305 delim = phase2_getc ();
2306
2307 if (delim == EOF)
2308 {
2309 tp->type = token_type_eof;
2310 return;
2311 }
2312 if ((delim >= '0' && delim <= '9')
2313 || (delim >= 'A' && delim <= 'Z')
2314 || (delim >= 'a' && delim <= 'z'))
2315 {
2316 /* False positive. */
2317 phase2_ungetc (delim);
2318 tp->type = token_type_symbol;
2319 tp->sub_type = symbol_type_none;
2320 tp->string = xstrdup (buffer);
2321 return;
2322 }
2323 extract_quotelike (tp, delim);
2324 if (delim != '\'')
2325 interpolate_keywords (mlp, tp->string, line_number);
2326 free (tp->string);
2327 drop_reference (tp->comment);
2328 tp->type = token_type_regex_op;
2329
2330 /* Eat the following modifiers. */
2331 do
2332 c = phase1_getc ();
2333 while (c >= 'a' && c <= 'z');
2334 phase1_ungetc (c);
2335 return;
2336 }
2337 else if (strcmp (buffer, "qq") == 0
2338 || strcmp (buffer, "q") == 0
2339 || strcmp (buffer, "qx") == 0
2340 || strcmp (buffer, "qw") == 0
2341 || strcmp (buffer, "qr") == 0)
2342 {
2343 /* The qw (...) construct is not really a string but we
2344 can treat in the same manner and then pretend it is
2345 a symbol. Rationale: Saying "qw (foo bar)" is the
2346 same as "my @list = ('foo', 'bar'); @list;". */
2347
2348 int delim = phase1_getc ();
2349
2350 while (is_whitespace (delim))
2351 delim = phase2_getc ();
2352
2353 if (delim == EOF)
2354 {
2355 tp->type = token_type_eof;
2356 return;
2357 }
2358
2359 if ((delim >= '0' && delim <= '9')
2360 || (delim >= 'A' && delim <= 'Z')
2361 || (delim >= 'a' && delim <= 'z'))
2362 {
2363 /* False positive. */
2364 phase2_ungetc (delim);
2365 tp->type = token_type_symbol;
2366 tp->sub_type = symbol_type_none;
2367 tp->string = xstrdup (buffer);
2368 return;
2369 }
2370
2371 extract_quotelike (tp, delim);
2372
2373 switch (buffer[1])
2374 {
2375 case 'q':
2376 case 'x':
2377 tp->type = token_type_string;
2378 tp->sub_type = string_type_qq;
2379 interpolate_keywords (mlp, tp->string, line_number);
2380 break;
2381 case 'r':
2382 drop_reference (tp->comment);
2383 tp->type = token_type_regex_op;
2384 break;
2385 case 'w':
2386 drop_reference (tp->comment);
2387 tp->type = token_type_symbol;
2388 tp->sub_type = symbol_type_none;
2389 break;
2390 case '\0':
2391 tp->type = token_type_string;
2392 tp->sub_type = string_type_q;
2393 break;
2394 default:
2395 abort ();
2396 }
2397 return;
2398 }
2399 else if ((buffer[0] >= '0' && buffer[0] <= '9') || buffer[0] == '.')
2400 {
2401 tp->type = token_type_number;
2402 return;
2403 }
2404 tp->type = token_type_symbol;
2405 tp->sub_type = (strcmp (buffer, "sub") == 0
2406 ? symbol_type_sub
2407 : symbol_type_none);
2408 tp->string = xstrdup (buffer);
2409 return;
2410
2411 case '"':
2412 extract_quotelike (tp, c);
2413 tp->sub_type = string_type_qq;
2414 interpolate_keywords (mlp, tp->string, line_number);
2415 return;
2416
2417 case '`':
2418 extract_quotelike (tp, c);
2419 tp->sub_type = string_type_qq;
2420 interpolate_keywords (mlp, tp->string, line_number);
2421 return;
2422
2423 case '\'':
2424 extract_quotelike (tp, c);
2425 tp->sub_type = string_type_q;
2426 return;
2427
2428 case '(':
2429 tp->type = token_type_lparen;
2430 return;
2431
2432 case ')':
2433 tp->type = token_type_rparen;
2434 return;
2435
2436 case '{':
2437 tp->type = token_type_lbrace;
2438 return;
2439
2440 case '}':
2441 tp->type = token_type_rbrace;
2442 return;
2443
2444 case '[':
2445 tp->type = token_type_lbracket;
2446 return;
2447
2448 case ']':
2449 tp->type = token_type_rbracket;
2450 return;
2451
2452 case ';':
2453 tp->type = token_type_semicolon;
2454 return;
2455
2456 case ',':
2457 tp->type = token_type_comma;
2458 return;
2459
2460 case '=':
2461 /* Check for fat comma. */
2462 c = phase1_getc ();
2463 if (c == '>')
2464 {
2465 tp->type = token_type_fat_comma;
2466 return;
2467 }
2468 else if (linepos == 2
2469 && (last_token_type == token_type_semicolon
2470 || last_token_type == token_type_rbrace)
2471 && ((c >= 'A' && c <='Z')
2472 || (c >= 'a' && c <= 'z')))
2473 {
2474 #if DEBUG_PERL
2475 fprintf (stderr, "%s:%d: start pod section\n",
2476 real_file_name, line_number);
2477 #endif
2478 skip_pod ();
2479 #if DEBUG_PERL
2480 fprintf (stderr, "%s:%d: end pod section\n",
2481 real_file_name, line_number);
2482 #endif
2483 continue;
2484 }
2485 phase1_ungetc (c);
2486 tp->type = token_type_other;
2487 return;
2488
2489 case '<':
2490 /* Check for <<EOF and friends. */
2491 c = phase1_getc ();
2492 if (c == '<')
2493 {
2494 c = phase1_getc ();
2495 if (c == '\'')
2496 {
2497 char *string;
2498 extract_quotelike (tp, c);
2499 string = get_here_document (tp->string);
2500 free (tp->string);
2501 tp->string = string;
2502 tp->type = token_type_string;
2503 tp->sub_type = string_type_verbatim;
2504 tp->line_number = line_number + 1;
2505 return;
2506 }
2507 else if (c == '"')
2508 {
2509 char *string;
2510 extract_quotelike (tp, c);
2511 string = get_here_document (tp->string);
2512 free (tp->string);
2513 tp->string = string;
2514 tp->type = token_type_string;
2515 tp->sub_type = string_type_qq;
2516 tp->line_number = line_number + 1;
2517 interpolate_keywords (mlp, tp->string, tp->line_number);
2518 return;
2519 }
2520 else if ((c >= 'A' && c <= 'Z')
2521 || (c >= 'a' && c <= 'z')
2522 || c == '_')
2523 {
2524 bufpos = 0;
2525 while ((c >= 'A' && c <= 'Z')
2526 || (c >= 'a' && c <= 'z')
2527 || (c >= '0' && c <= '9')
2528 || c == '_' || c >= 0x80)
2529 {
2530 if (bufpos >= bufmax)
2531 {
2532 bufmax = 2 * bufmax + 10;
2533 buffer = xrealloc (buffer, bufmax);
2534 }
2535 buffer[bufpos++] = c;
2536 c = phase1_getc ();
2537 }
2538 if (c == EOF)
2539 {
2540 tp->type = token_type_eof;
2541 return;
2542 }
2543 else
2544 {
2545 char *string;
2546 phase1_ungetc (c);
2547 if (bufpos >= bufmax)
2548 {
2549 bufmax = 2 * bufmax + 10;
2550 buffer = xrealloc (buffer, bufmax);
2551 }
2552 buffer[bufpos++] = '\0';
2553 string = get_here_document (buffer);
2554 tp->string = string;
2555 tp->type = token_type_string;
2556 tp->sub_type = string_type_qq;
2557 tp->comment = add_reference (savable_comment);
2558 tp->line_number = line_number + 1;
2559 interpolate_keywords (mlp, tp->string, tp->line_number);
2560 return;
2561 }
2562 }
2563 else
2564 {
2565 tp->type = token_type_other;
2566 return;
2567 }
2568 }
2569 else
2570 {
2571 phase1_ungetc (c);
2572 tp->type = token_type_other;
2573 }
2574 return; /* End of case '>'. */
2575
2576 case '-':
2577 /* Check for dereferencing operator. */
2578 c = phase1_getc ();
2579 if (c == '>')
2580 {
2581 tp->type = token_type_dereference;
2582 return;
2583 }
2584 else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
2585 {
2586 /* One of the -X (filetest) functions. We play safe
2587 and accept all alphabetical characters here. */
2588 tp->type = token_type_other;
2589 return;
2590 }
2591 phase1_ungetc (c);
2592 tp->type = token_type_other;
2593 return;
2594
2595 case '/':
2596 case '?':
2597 if (prefer_regexp_over_division (tp->last_type))
2598 {
2599 extract_quotelike (tp, c);
2600 interpolate_keywords (mlp, tp->string, line_number);
2601 free (tp->string);
2602 drop_reference (tp->comment);
2603 tp->type = token_type_regex_op;
2604 /* Eat the following modifiers. */
2605 do
2606 c = phase1_getc ();
2607 while (c >= 'a' && c <= 'z');
2608 phase1_ungetc (c);
2609 return;
2610 }
2611 /* Recognize operator '//'. */
2612 if (c == '/')
2613 {
2614 c = phase1_getc ();
2615 if (c != '/')
2616 phase1_ungetc (c);
2617 }
2618 /* FALLTHROUGH */
2619
2620 default:
2621 /* We could carefully recognize each of the 2 and 3 character
2622 operators, but it is not necessary, except for the '//' operator,
2623 as we only need to recognize gettext invocations. Don't
2624 bother. */
2625 tp->type = token_type_other;
2626 return;
2627 }
2628 }
2629 }
2630
2631
2632 /* A token stack used as a lookahead buffer. */
2633
2634 typedef struct token_stack_ty token_stack_ty;
2635 struct token_stack_ty
2636 {
2637 token_ty **items;
2638 size_t nitems;
2639 size_t nitems_max;
2640 };
2641
2642 static struct token_stack_ty token_stack;
2643
2644 #if DEBUG_PERL
2645 /* Dumps all resources allocated by stack STACK. */
2646 static int
token_stack_dump(token_stack_ty * stack)2647 token_stack_dump (token_stack_ty *stack)
2648 {
2649 size_t i;
2650
2651 fprintf (stderr, "BEGIN STACK DUMP\n");
2652 for (i = 0; i < stack->nitems; i++)
2653 {
2654 token_ty *token = stack->items[i];
2655 fprintf (stderr, " [%s]\n", token2string (token));
2656 switch (token->type)
2657 {
2658 case token_type_named_op:
2659 case token_type_string:
2660 case token_type_symbol:
2661 case token_type_variable:
2662 fprintf (stderr, " string: %s\n", token->string);
2663 break;
2664 case token_type_object:
2665 fprintf (stderr, " string: %s->\n", token->string);
2666 default:
2667 break;
2668 }
2669 }
2670 fprintf (stderr, "END STACK DUMP\n");
2671 return 0;
2672 }
2673 #endif
2674
2675 /* Pushes the token TOKEN onto the stack STACK. */
2676 static inline void
token_stack_push(token_stack_ty * stack,token_ty * token)2677 token_stack_push (token_stack_ty *stack, token_ty *token)
2678 {
2679 if (stack->nitems >= stack->nitems_max)
2680 {
2681 size_t nbytes;
2682
2683 stack->nitems_max = 2 * stack->nitems_max + 4;
2684 nbytes = stack->nitems_max * sizeof (token_ty *);
2685 stack->items = xrealloc (stack->items, nbytes);
2686 }
2687 stack->items[stack->nitems++] = token;
2688 }
2689
2690 /* Pops the most recently pushed token from the stack STACK and returns it.
2691 Returns NULL if the stack is empty. */
2692 static inline token_ty *
token_stack_pop(token_stack_ty * stack)2693 token_stack_pop (token_stack_ty *stack)
2694 {
2695 if (stack->nitems > 0)
2696 return stack->items[--(stack->nitems)];
2697 else
2698 return NULL;
2699 }
2700
2701 /* Return the top of the stack without removing it from the stack, or
2702 NULL if the stack is empty. */
2703 static inline token_ty *
token_stack_peek(const token_stack_ty * stack)2704 token_stack_peek (const token_stack_ty *stack)
2705 {
2706 if (stack->nitems > 0)
2707 return stack->items[stack->nitems - 1];
2708 else
2709 return NULL;
2710 }
2711
2712 /* Frees all resources allocated by stack STACK. */
2713 static inline void
token_stack_free(token_stack_ty * stack)2714 token_stack_free (token_stack_ty *stack)
2715 {
2716 size_t i;
2717
2718 for (i = 0; i < stack->nitems; i++)
2719 free_token (stack->items[i]);
2720 free (stack->items);
2721 }
2722
2723
2724 static token_ty *
x_perl_lex(message_list_ty * mlp)2725 x_perl_lex (message_list_ty *mlp)
2726 {
2727 #if DEBUG_PERL
2728 int dummy = token_stack_dump (&token_stack);
2729 #endif
2730 token_ty *tp = token_stack_pop (&token_stack);
2731
2732 if (!tp)
2733 {
2734 tp = XMALLOC (token_ty);
2735 x_perl_prelex (mlp, tp);
2736 tp->last_type = last_token_type;
2737 last_token_type = tp->type;
2738
2739 #if DEBUG_PERL
2740 fprintf (stderr, "%s:%d: x_perl_prelex returned %s\n",
2741 real_file_name, line_number, token2string (tp));
2742 #endif
2743
2744 /* The interpretation of a slash or question mark after a function call
2745 depends on the prototype of that function. If the function expects
2746 at least one argument, a regular expression is preferred, otherwise
2747 an operator. With our limited means, we can only guess here. If
2748 the function is a builtin that takes no arguments, we prefer an
2749 operator by silently turning the last symbol into a variable instead
2750 of a symbol.
2751
2752 Method calls without parentheses are not ambiguous. After them, an
2753 operator must follow. Due to some ideosyncrasies in this parser
2754 they are treated in two different manners. If the call is
2755 chained ($foo->bar->baz) the token left of the symbol is a
2756 dereference operator. If it is not chained ($foo->bar) the
2757 dereference operator is consumed with the extracted variable. The
2758 latter case is handled below. */
2759 if (tp->type == token_type_symbol)
2760 {
2761 if (tp->last_type == token_type_dereference)
2762 {
2763 /* Class method call or chained method call (with at least
2764 two arrow operators). */
2765 last_token_type = token_type_variable;
2766 }
2767 else if (tp->last_type == token_type_object)
2768 {
2769 /* Instance method, not chained. */
2770 last_token_type = token_type_variable;
2771 }
2772 else if (strcmp (tp->string, "wantarray") == 0
2773 || strcmp (tp->string, "fork") == 0
2774 || strcmp (tp->string, "getlogin") == 0
2775 || strcmp (tp->string, "getppid") == 0
2776 || strcmp (tp->string, "getpwent") == 0
2777 || strcmp (tp->string, "getgrent") == 0
2778 || strcmp (tp->string, "gethostent") == 0
2779 || strcmp (tp->string, "getnetent") == 0
2780 || strcmp (tp->string, "getprotoent") == 0
2781 || strcmp (tp->string, "getservent") == 0
2782 || strcmp (tp->string, "setpwent") == 0
2783 || strcmp (tp->string, "setgrent") == 0
2784 || strcmp (tp->string, "endpwent") == 0
2785 || strcmp (tp->string, "endgrent") == 0
2786 || strcmp (tp->string, "endhostent") == 0
2787 || strcmp (tp->string, "endnetent") == 0
2788 || strcmp (tp->string, "endprotoent") == 0
2789 || strcmp (tp->string, "endservent") == 0
2790 || strcmp (tp->string, "time") == 0
2791 || strcmp (tp->string, "times") == 0
2792 || strcmp (tp->string, "wait") == 0
2793 || strcmp (tp->string, "wantarray") == 0)
2794 {
2795 /* A Perl built-in function that does not accept arguments. */
2796 last_token_type = token_type_variable;
2797 }
2798 }
2799 }
2800 #if DEBUG_PERL
2801 else
2802 {
2803 fprintf (stderr, "%s:%d: %s recycled from stack\n",
2804 real_file_name, line_number, token2string (tp));
2805 }
2806 #endif
2807
2808 /* A symbol followed by a fat comma is really a single-quoted string.
2809 Function definitions or forward declarations also need a special
2810 handling because the dollars and at signs inside the parentheses
2811 must not be interpreted as the beginning of a variable ')'. */
2812 if (tp->type == token_type_symbol || tp->type == token_type_named_op)
2813 {
2814 token_ty *next = token_stack_peek (&token_stack);
2815
2816 if (!next)
2817 {
2818 #if DEBUG_PERL
2819 fprintf (stderr, "%s:%d: pre-fetching next token\n",
2820 real_file_name, line_number);
2821 #endif
2822 next = x_perl_lex (mlp);
2823 x_perl_unlex (next);
2824 #if DEBUG_PERL
2825 fprintf (stderr, "%s:%d: unshifted next token\n",
2826 real_file_name, line_number);
2827 #endif
2828 }
2829
2830 #if DEBUG_PERL
2831 fprintf (stderr, "%s:%d: next token is %s\n",
2832 real_file_name, line_number, token2string (next));
2833 #endif
2834
2835 if (next->type == token_type_fat_comma)
2836 {
2837 tp->type = token_type_string;
2838 tp->sub_type = string_type_q;
2839 tp->comment = add_reference (savable_comment);
2840 #if DEBUG_PERL
2841 fprintf (stderr,
2842 "%s:%d: token %s mutated to token_type_string\n",
2843 real_file_name, line_number, token2string (tp));
2844 #endif
2845 }
2846 else if (tp->type == token_type_symbol && tp->sub_type == symbol_type_sub
2847 && next->type == token_type_symbol)
2848 {
2849 /* Start of a function declaration or definition. Mark this
2850 symbol as a function name, so that we can later eat up
2851 possible prototype information. */
2852 #if DEBUG_PERL
2853 fprintf (stderr, "%s:%d: subroutine declaration/definition '%s'\n",
2854 real_file_name, line_number, next->string);
2855 #endif
2856 next->sub_type = symbol_type_function;
2857 }
2858 else if (tp->type == token_type_symbol
2859 && (tp->sub_type == symbol_type_sub
2860 || tp->sub_type == symbol_type_function)
2861 && next->type == token_type_lparen)
2862 {
2863 /* For simplicity we simply consume everything up to the
2864 closing parenthesis. Actually only a limited set of
2865 characters is allowed inside parentheses but we leave
2866 complaints to the interpreter and are prepared for
2867 future extensions to the Perl syntax. */
2868 int c;
2869
2870 #if DEBUG_PERL
2871 fprintf (stderr, "%s:%d: consuming prototype information\n",
2872 real_file_name, line_number);
2873 #endif
2874
2875 do
2876 {
2877 c = phase1_getc ();
2878 #if DEBUG_PERL
2879 fprintf (stderr, " consuming character '%c'\n", c);
2880 #endif
2881 }
2882 while (c != EOF && c != ')');
2883 phase1_ungetc (c);
2884 }
2885 }
2886
2887 return tp;
2888 }
2889
2890 static void
x_perl_unlex(token_ty * tp)2891 x_perl_unlex (token_ty *tp)
2892 {
2893 token_stack_push (&token_stack, tp);
2894 }
2895
2896
2897 /* ========================= Extracting strings. ========================== */
2898
2899 /* Assuming TP is a string token, this function accumulates all subsequent
2900 . string2 . string3 ... to the string. (String concatenation.) */
2901
2902 static char *
collect_message(message_list_ty * mlp,token_ty * tp,int error_level)2903 collect_message (message_list_ty *mlp, token_ty *tp, int error_level)
2904 {
2905 char *string;
2906 size_t len;
2907
2908 extract_quotelike_pass3 (tp, error_level);
2909 string = xstrdup (tp->string);
2910 len = strlen (tp->string) + 1;
2911
2912 for (;;)
2913 {
2914 int c;
2915
2916 do
2917 c = phase2_getc ();
2918 while (is_whitespace (c));
2919
2920 if (c != '.')
2921 {
2922 phase2_ungetc (c);
2923 return string;
2924 }
2925
2926 do
2927 c = phase2_getc ();
2928 while (is_whitespace (c));
2929
2930 phase2_ungetc (c);
2931
2932 if (c == '"' || c == '\'' || c == '`'
2933 || ((c == '/' || c == '?')
2934 && prefer_regexp_over_division (tp->last_type))
2935 || c == 'q')
2936 {
2937 token_ty *qstring = x_perl_lex (mlp);
2938 if (qstring->type != token_type_string)
2939 {
2940 /* assert (qstring->type == token_type_symbol) */
2941 x_perl_unlex (qstring);
2942 return string;
2943 }
2944
2945 extract_quotelike_pass3 (qstring, error_level);
2946 len += strlen (qstring->string);
2947 string = xrealloc (string, len);
2948 strcat (string, qstring->string);
2949 free_token (qstring);
2950 }
2951 }
2952 }
2953
2954 /* The file is broken into tokens. Scan the token stream, looking for
2955 a keyword, followed by a left paren, followed by a string. When we
2956 see this sequence, we have something to remember. We assume we are
2957 looking at a valid Perl program, and leave the complaints about
2958 the grammar to the compiler.
2959
2960 Normal handling: Look for
2961 keyword ( ... msgid ... )
2962 Plural handling: Look for
2963 keyword ( ... msgid ... msgid_plural ... )
2964
2965 We use recursion because the arguments before msgid or between msgid
2966 and msgid_plural can contain subexpressions of the same form.
2967
2968 In Perl, parentheses around function arguments can be omitted.
2969
2970 The general rules are:
2971 1) Functions declared with a prototype take exactly the specified number
2972 of arguments.
2973 sub one_arg ($) { ... }
2974 sub two_args ($$) { ... }
2975 2) When a function name is immediately followed by an opening parenthesis,
2976 the argument list ends at the corresponding closing parenthesis.
2977
2978 If rule 1 and rule 2 are contradictory, i.e. when the program calls a
2979 function with an explicit argument list and the wrong number of arguments,
2980 the program is invalid:
2981 sub two_args ($$) { ... }
2982 foo two_args (x), y - invalid due to rules 1 and 2
2983
2984 Ambiguities are resolved as follows:
2985 3) Some built-ins, such as 'abs', 'sqrt', 'sin', 'cos', ..., and functions
2986 declared with a prototype of exactly one argument take exactly one
2987 argument:
2988 foo sin x, y ==> foo (sin (x), y)
2989 sub one_arg ($) { ... }
2990 foo one_arg x, y, z ==> foo (one_arg (x), y, z)
2991 4) Other identifiers, if not immediately followed by an opening
2992 parenthesis, consume the entire remaining argument list:
2993 foo bar x, y ==> foo (bar (x, y))
2994 sub two_args ($$) { ... }
2995 foo two_args x, y ==> foo (two_args (x, y))
2996
2997 Other series of comma separated expressions without a function name at
2998 the beginning are comma expressions:
2999 sub two_args ($$) { ... }
3000 foo two_args x, (y, z) ==> foo (two_args (x, (y, z)))
3001 Note that the evaluation of comma expressions returns a list of values
3002 when in list context (e.g. inside the argument list of a function without
3003 prototype) but only one value when inside the argument list of a function
3004 with a prototype:
3005 sub print3 ($$$) { print @_ }
3006 print3 5, (6, 7), 8 ==> 578
3007 print 5, (6, 7), 8 ==> 5678
3008
3009 Where rule 3 or 4 contradict rule 1 or 2, the program is invalid:
3010 sin (x, y) - invalid due to rules 2 and 3
3011 sub one_arg ($) { ... }
3012 one_arg (x, y) - invalid due to rules 2 and 3
3013 sub two_args ($$) { ... }
3014 foo two_args x, y, z - invalid due to rules 1 and 4
3015 */
3016
3017 /* Extract messages until the next balanced closing parenthesis.
3018 Extracted messages are added to MLP.
3019
3020 DELIM can be either token_type_rbrace, token_type_rbracket,
3021 token_type_rparen. Additionally, if COMMA_DELIM is true, parsing
3022 stops at the next comma outside parentheses.
3023
3024 ARG is the current argument list position, starts with 1.
3025 ARGPARSER is the corresponding argument list parser.
3026
3027 Returns true for EOF, false otherwise. */
3028
3029 static bool
extract_balanced(message_list_ty * mlp,token_type_ty delim,bool eat_delim,bool comma_delim,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,int arg,struct arglist_parser * argparser)3030 extract_balanced (message_list_ty *mlp,
3031 token_type_ty delim, bool eat_delim, bool comma_delim,
3032 flag_context_ty outer_context,
3033 flag_context_list_iterator_ty context_iter,
3034 int arg, struct arglist_parser *argparser)
3035 {
3036 /* Whether to implicitly assume the next tokens are arguments even without
3037 a '('. */
3038 bool next_is_argument = false;
3039 /* Parameters of the keyword just seen. Defined only when next_is_argument
3040 is true. */
3041 const struct callshapes *next_shapes = NULL;
3042 struct arglist_parser *next_argparser = NULL;
3043
3044 /* Whether to not consider strings until the next comma. */
3045 bool skip_until_comma = false;
3046
3047 /* Context iterator that will be used if the next token is a '('. */
3048 flag_context_list_iterator_ty next_context_iter =
3049 passthrough_context_list_iterator;
3050 /* Current context. */
3051 flag_context_ty inner_context =
3052 inherited_context (outer_context,
3053 flag_context_list_iterator_advance (&context_iter));
3054
3055 #if DEBUG_PERL
3056 static int nesting_level = 0;
3057
3058 ++nesting_level;
3059 #endif
3060
3061 for (;;)
3062 {
3063 /* The current token. */
3064 token_ty *tp;
3065
3066 tp = x_perl_lex (mlp);
3067
3068 if (delim == tp->type)
3069 {
3070 arglist_parser_done (argparser, arg);
3071 if (next_argparser != NULL)
3072 free (next_argparser);
3073 #if DEBUG_PERL
3074 fprintf (stderr, "%s:%d: extract_balanced finished (%d)\n",
3075 logical_file_name, tp->line_number, --nesting_level);
3076 #endif
3077 if (eat_delim)
3078 free_token (tp);
3079 else
3080 /* Preserve the delimiter for the caller. */
3081 x_perl_unlex (tp);
3082 return false;
3083 }
3084
3085 if (comma_delim && tp->type == token_type_comma)
3086 {
3087 arglist_parser_done (argparser, arg);
3088 if (next_argparser != NULL)
3089 free (next_argparser);
3090 #if DEBUG_PERL
3091 fprintf (stderr, "%s:%d: extract_balanced finished at comma (%d)\n",
3092 logical_file_name, tp->line_number, --nesting_level);
3093 #endif
3094 x_perl_unlex (tp);
3095 return false;
3096 }
3097
3098 if (next_is_argument && tp->type != token_type_lparen)
3099 {
3100 /* An argument list starts, even though there is no '('. */
3101 bool next_comma_delim;
3102
3103 x_perl_unlex (tp);
3104
3105 if (next_shapes != NULL)
3106 /* We know something about the function being called. Assume
3107 that it consumes only one argument if no argument number or
3108 total > 1 is specified. */
3109 {
3110 size_t i;
3111
3112 next_comma_delim = true;
3113 for (i = 0; i < next_shapes->nshapes; i++)
3114 {
3115 const struct callshape *shape = &next_shapes->shapes[i];
3116
3117 if (shape->argnum1 > 1
3118 || shape->argnum2 > 1
3119 || shape->argnumc > 1
3120 || shape->argtotal > 1)
3121 next_comma_delim = false;
3122 }
3123 }
3124 else
3125 /* We know nothing about the function being called. It could be
3126 a function prototyped to take only one argument, or on the other
3127 hand it could be prototyped to take more than one argument or an
3128 arbitrary argument list or it could be unprototyped. Due to
3129 the way the parser works, assuming the first case gives the
3130 best results. */
3131 next_comma_delim = true;
3132
3133 if (extract_balanced (mlp, delim, false, next_comma_delim,
3134 inner_context, next_context_iter,
3135 1, next_argparser))
3136 {
3137 arglist_parser_done (argparser, arg);
3138 return true;
3139 }
3140
3141 next_is_argument = false;
3142 next_argparser = NULL;
3143 next_context_iter = null_context_list_iterator;
3144 continue;
3145 }
3146
3147 switch (tp->type)
3148 {
3149 case token_type_symbol:
3150 case token_type_keyword_symbol:
3151 #if DEBUG_PERL
3152 fprintf (stderr, "%s:%d: type symbol (%d) \"%s\"\n",
3153 logical_file_name, tp->line_number, nesting_level,
3154 tp->string);
3155 #endif
3156
3157 {
3158 void *keyword_value;
3159
3160 if (hash_find_entry (&keywords, tp->string, strlen (tp->string),
3161 &keyword_value) == 0)
3162 {
3163 const struct callshapes *shapes =
3164 (const struct callshapes *) keyword_value;
3165
3166 next_shapes = shapes;
3167 next_argparser = arglist_parser_alloc (mlp, shapes);
3168 }
3169 else
3170 {
3171 next_shapes = NULL;
3172 next_argparser = arglist_parser_alloc (mlp, NULL);
3173 }
3174 }
3175 next_is_argument = true;
3176 next_context_iter =
3177 flag_context_list_iterator (
3178 flag_context_list_table_lookup (
3179 flag_context_list_table,
3180 tp->string, strlen (tp->string)));
3181 break;
3182
3183 case token_type_variable:
3184 #if DEBUG_PERL
3185 fprintf (stderr, "%s:%d: type variable (%d) \"%s\"\n",
3186 logical_file_name, tp->line_number, nesting_level,
3187 tp->string);
3188 #endif
3189 next_is_argument = false;
3190 if (next_argparser != NULL)
3191 free (next_argparser);
3192 next_argparser = NULL;
3193 next_context_iter = null_context_list_iterator;
3194 break;
3195
3196 case token_type_object:
3197 #if DEBUG_PERL
3198 fprintf (stderr, "%s:%d: type object (%d) \"%s->\"\n",
3199 logical_file_name, tp->line_number, nesting_level,
3200 tp->string);
3201 #endif
3202 next_is_argument = false;
3203 if (next_argparser != NULL)
3204 free (next_argparser);
3205 next_argparser = NULL;
3206 next_context_iter = null_context_list_iterator;
3207 break;
3208
3209 case token_type_lparen:
3210 #if DEBUG_PERL
3211 fprintf (stderr, "%s:%d: type left parenthesis (%d)\n",
3212 logical_file_name, tp->line_number, nesting_level);
3213 #endif
3214 if (next_is_argument)
3215 {
3216 /* Parse the argument list of a function call. */
3217 if (extract_balanced (mlp, token_type_rparen, true, false,
3218 inner_context, next_context_iter,
3219 1, next_argparser))
3220 {
3221 arglist_parser_done (argparser, arg);
3222 return true;
3223 }
3224 next_is_argument = false;
3225 next_argparser = NULL;
3226 }
3227 else
3228 {
3229 /* Parse a parenthesized expression or comma expression. */
3230 if (extract_balanced (mlp, token_type_rparen, true, false,
3231 inner_context, next_context_iter,
3232 arg, arglist_parser_clone (argparser)))
3233 {
3234 arglist_parser_done (argparser, arg);
3235 if (next_argparser != NULL)
3236 free (next_argparser);
3237 free_token (tp);
3238 return true;
3239 }
3240 next_is_argument = false;
3241 if (next_argparser != NULL)
3242 free (next_argparser);
3243 next_argparser = NULL;
3244 }
3245 skip_until_comma = true;
3246 next_context_iter = null_context_list_iterator;
3247 break;
3248
3249 case token_type_rparen:
3250 #if DEBUG_PERL
3251 fprintf (stderr, "%s:%d: type right parenthesis (%d)\n",
3252 logical_file_name, tp->line_number, nesting_level);
3253 #endif
3254 next_is_argument = false;
3255 if (next_argparser != NULL)
3256 free (next_argparser);
3257 next_argparser = NULL;
3258 skip_until_comma = true;
3259 next_context_iter = null_context_list_iterator;
3260 break;
3261
3262 case token_type_comma:
3263 case token_type_fat_comma:
3264 #if DEBUG_PERL
3265 fprintf (stderr, "%s:%d: type comma (%d)\n",
3266 logical_file_name, tp->line_number, nesting_level);
3267 #endif
3268 if (arglist_parser_decidedp (argparser, arg))
3269 {
3270 /* We have missed the argument. */
3271 arglist_parser_done (argparser, arg);
3272 argparser = arglist_parser_alloc (mlp, NULL);
3273 arg = 0;
3274 }
3275 arg++;
3276 #if DEBUG_PERL
3277 fprintf (stderr, "%s:%d: arg: %d\n",
3278 real_file_name, tp->line_number, arg);
3279 #endif
3280 inner_context =
3281 inherited_context (outer_context,
3282 flag_context_list_iterator_advance (
3283 &context_iter));
3284 next_is_argument = false;
3285 if (next_argparser != NULL)
3286 free (next_argparser);
3287 next_argparser = NULL;
3288 skip_until_comma = false;
3289 next_context_iter = passthrough_context_list_iterator;
3290 break;
3291
3292 case token_type_string:
3293 #if DEBUG_PERL
3294 fprintf (stderr, "%s:%d: type string (%d): \"%s\"\n",
3295 logical_file_name, tp->line_number, nesting_level,
3296 tp->string);
3297 #endif
3298
3299 if (extract_all)
3300 {
3301 char *string = collect_message (mlp, tp, EXIT_SUCCESS);
3302 lex_pos_ty pos;
3303
3304 pos.file_name = logical_file_name;
3305 pos.line_number = tp->line_number;
3306 remember_a_message (mlp, NULL, string, true, false, inner_context,
3307 &pos, NULL, tp->comment, true);
3308 }
3309 else if (!skip_until_comma)
3310 {
3311 /* Need to collect the complete string, with error checking,
3312 only if the argument ARG is used in ARGPARSER. */
3313 bool must_collect = false;
3314 {
3315 size_t nalternatives = argparser->nalternatives;
3316 size_t i;
3317
3318 for (i = 0; i < nalternatives; i++)
3319 {
3320 struct partial_call *cp = &argparser->alternative[i];
3321
3322 if (arg == cp->argnumc
3323 || arg == cp->argnum1 || arg == cp->argnum2)
3324 must_collect = true;
3325 }
3326 }
3327
3328 if (must_collect)
3329 {
3330 char *string = collect_message (mlp, tp, EXIT_FAILURE);
3331 mixed_string_ty *ms =
3332 mixed_string_alloc_utf8 (string, lc_string,
3333 logical_file_name, tp->line_number);
3334 free (string);
3335 arglist_parser_remember (argparser, arg, ms, inner_context,
3336 logical_file_name, tp->line_number,
3337 tp->comment, true);
3338 }
3339 }
3340
3341 if (arglist_parser_decidedp (argparser, arg))
3342 {
3343 arglist_parser_done (argparser, arg);
3344 argparser = arglist_parser_alloc (mlp, NULL);
3345 }
3346
3347 next_is_argument = false;
3348 if (next_argparser != NULL)
3349 free (next_argparser);
3350 next_argparser = NULL;
3351 next_context_iter = null_context_list_iterator;
3352 break;
3353
3354 case token_type_number:
3355 #if DEBUG_PERL
3356 fprintf (stderr, "%s:%d: type number (%d)\n",
3357 logical_file_name, tp->line_number, nesting_level);
3358 #endif
3359 next_is_argument = false;
3360 if (next_argparser != NULL)
3361 free (next_argparser);
3362 next_argparser = NULL;
3363 next_context_iter = null_context_list_iterator;
3364 break;
3365
3366 case token_type_eof:
3367 #if DEBUG_PERL
3368 fprintf (stderr, "%s:%d: type EOF (%d)\n",
3369 logical_file_name, tp->line_number, nesting_level);
3370 #endif
3371 arglist_parser_done (argparser, arg);
3372 if (next_argparser != NULL)
3373 free (next_argparser);
3374 next_argparser = NULL;
3375 free_token (tp);
3376 return true;
3377
3378 case token_type_lbrace:
3379 #if DEBUG_PERL
3380 fprintf (stderr, "%s:%d: type lbrace (%d)\n",
3381 logical_file_name, tp->line_number, nesting_level);
3382 #endif
3383 if (extract_balanced (mlp, token_type_rbrace, true, false,
3384 null_context, null_context_list_iterator,
3385 1, arglist_parser_alloc (mlp, NULL)))
3386 {
3387 arglist_parser_done (argparser, arg);
3388 if (next_argparser != NULL)
3389 free (next_argparser);
3390 free_token (tp);
3391 return true;
3392 }
3393 next_is_argument = false;
3394 if (next_argparser != NULL)
3395 free (next_argparser);
3396 next_argparser = NULL;
3397 next_context_iter = null_context_list_iterator;
3398 break;
3399
3400 case token_type_rbrace:
3401 #if DEBUG_PERL
3402 fprintf (stderr, "%s:%d: type rbrace (%d)\n",
3403 logical_file_name, tp->line_number, nesting_level);
3404 #endif
3405 next_is_argument = false;
3406 if (next_argparser != NULL)
3407 free (next_argparser);
3408 next_argparser = NULL;
3409 next_context_iter = null_context_list_iterator;
3410 break;
3411
3412 case token_type_lbracket:
3413 #if DEBUG_PERL
3414 fprintf (stderr, "%s:%d: type lbracket (%d)\n",
3415 logical_file_name, tp->line_number, nesting_level);
3416 #endif
3417 if (extract_balanced (mlp, token_type_rbracket, true, false,
3418 null_context, null_context_list_iterator,
3419 1, arglist_parser_alloc (mlp, NULL)))
3420 {
3421 arglist_parser_done (argparser, arg);
3422 if (next_argparser != NULL)
3423 free (next_argparser);
3424 free_token (tp);
3425 return true;
3426 }
3427 next_is_argument = false;
3428 if (next_argparser != NULL)
3429 free (next_argparser);
3430 next_argparser = NULL;
3431 next_context_iter = null_context_list_iterator;
3432 break;
3433
3434 case token_type_rbracket:
3435 #if DEBUG_PERL
3436 fprintf (stderr, "%s:%d: type rbracket (%d)\n",
3437 logical_file_name, tp->line_number, nesting_level);
3438 #endif
3439 next_is_argument = false;
3440 if (next_argparser != NULL)
3441 free (next_argparser);
3442 next_argparser = NULL;
3443 next_context_iter = null_context_list_iterator;
3444 break;
3445
3446 case token_type_semicolon:
3447 #if DEBUG_PERL
3448 fprintf (stderr, "%s:%d: type semicolon (%d)\n",
3449 logical_file_name, tp->line_number, nesting_level);
3450 #endif
3451
3452 /* The ultimate sign. */
3453 arglist_parser_done (argparser, arg);
3454 argparser = arglist_parser_alloc (mlp, NULL);
3455
3456 /* FIXME: Instead of resetting outer_context here, it may be better
3457 to recurse in the next_is_argument handling above, waiting for
3458 the next semicolon or other statement terminator. */
3459 outer_context = null_context;
3460 context_iter = null_context_list_iterator;
3461 next_is_argument = false;
3462 if (next_argparser != NULL)
3463 free (next_argparser);
3464 next_argparser = NULL;
3465 next_context_iter = passthrough_context_list_iterator;
3466 inner_context =
3467 inherited_context (outer_context,
3468 flag_context_list_iterator_advance (
3469 &context_iter));
3470 break;
3471
3472 case token_type_dereference:
3473 #if DEBUG_PERL
3474 fprintf (stderr, "%s:%d: type dereference (%d)\n",
3475 logical_file_name, tp->line_number, nesting_level);
3476 #endif
3477 next_is_argument = false;
3478 if (next_argparser != NULL)
3479 free (next_argparser);
3480 next_argparser = NULL;
3481 next_context_iter = null_context_list_iterator;
3482 break;
3483
3484 case token_type_dot:
3485 #if DEBUG_PERL
3486 fprintf (stderr, "%s:%d: type dot (%d)\n",
3487 logical_file_name, tp->line_number, nesting_level);
3488 #endif
3489 next_is_argument = false;
3490 if (next_argparser != NULL)
3491 free (next_argparser);
3492 next_argparser = NULL;
3493 next_context_iter = null_context_list_iterator;
3494 break;
3495
3496 case token_type_named_op:
3497 #if DEBUG_PERL
3498 fprintf (stderr, "%s:%d: type named operator (%d): %s\n",
3499 logical_file_name, tp->line_number, nesting_level,
3500 tp->string);
3501 #endif
3502 next_is_argument = false;
3503 if (next_argparser != NULL)
3504 free (next_argparser);
3505 next_argparser = NULL;
3506 next_context_iter = null_context_list_iterator;
3507 break;
3508
3509 case token_type_regex_op:
3510 #if DEBUG_PERL
3511 fprintf (stderr, "%s:%d: type regex operator (%d)\n",
3512 logical_file_name, tp->line_number, nesting_level);
3513 #endif
3514 next_is_argument = false;
3515 if (next_argparser != NULL)
3516 free (next_argparser);
3517 next_argparser = NULL;
3518 next_context_iter = null_context_list_iterator;
3519 break;
3520
3521 case token_type_other:
3522 #if DEBUG_PERL
3523 fprintf (stderr, "%s:%d: type other (%d)\n",
3524 logical_file_name, tp->line_number, nesting_level);
3525 #endif
3526 next_is_argument = false;
3527 if (next_argparser != NULL)
3528 free (next_argparser);
3529 next_argparser = NULL;
3530 next_context_iter = null_context_list_iterator;
3531 break;
3532
3533 default:
3534 fprintf (stderr, "%s:%d: unknown token type %d\n",
3535 real_file_name, tp->line_number, tp->type);
3536 abort ();
3537 }
3538
3539 free_token (tp);
3540 }
3541 }
3542
3543 void
extract_perl(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)3544 extract_perl (FILE *f, const char *real_filename, const char *logical_filename,
3545 flag_context_list_table_ty *flag_table,
3546 msgdomain_list_ty *mdlp)
3547 {
3548 message_list_ty *mlp = mdlp->item[0]->messages;
3549
3550 fp = f;
3551 real_file_name = real_filename;
3552 logical_file_name = xstrdup (logical_filename);
3553 line_number = 0;
3554
3555 linesize = 0;
3556 linepos = 0;
3557 eaten_here = 0;
3558 end_of_file = false;
3559
3560 last_comment_line = -1;
3561 last_non_comment_line = -1;
3562
3563 flag_context_list_table = flag_table;
3564
3565 /* Safe assumption. */
3566 last_token_type = token_type_semicolon;
3567
3568 token_stack.items = NULL;
3569 token_stack.nitems = 0;
3570 token_stack.nitems_max = 0;
3571
3572 init_keywords ();
3573
3574 /* Eat tokens until eof is seen. When extract_balanced returns
3575 due to an unbalanced closing brace, just restart it. */
3576 while (!extract_balanced (mlp, token_type_rbrace, true, false,
3577 null_context, null_context_list_iterator,
3578 1, arglist_parser_alloc (mlp, NULL)))
3579 ;
3580
3581 fp = NULL;
3582 real_file_name = NULL;
3583 free (logical_file_name);
3584 logical_file_name = NULL;
3585 line_number = 0;
3586 last_token_type = token_type_semicolon;
3587 token_stack_free (&token_stack);
3588 eaten_here = 0;
3589 end_of_file = true;
3590 }
3591