1 /* xgettext Java backend.
2    Copyright (C) 2003, 2005-2009, 2018-2020 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2003.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17 
18 #ifdef HAVE_CONFIG_H
19 # include "config.h"
20 #endif
21 
22 /* Specification.  */
23 #include "x-java.h"
24 
25 #include <errno.h>
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 
31 #include "message.h"
32 #include "rc-str-list.h"
33 #include "xgettext.h"
34 #include "xg-pos.h"
35 #include "xg-encoding.h"
36 #include "xg-mixed-string.h"
37 #include "xg-arglist-context.h"
38 #include "xg-arglist-callshape.h"
39 #include "xg-arglist-parser.h"
40 #include "xg-message.h"
41 #include "error.h"
42 #include "error-progname.h"
43 #include "xalloc.h"
44 #include "mem-hash-map.h"
45 #include "po-charset.h"
46 #include "unistr.h"
47 #include "unictype.h"
48 #include "gettext.h"
49 
/* Shorthand for looking up the translation of a message.  */
#define _(s) gettext(s)

/* Number of elements of the array A.  A must be a true array, not a
   pointer.  The argument is parenthesized so that any array-valued
   expression can be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
53 
54 
55 /* The Java syntax is defined in the
56      Java Language Specification
57      (available from https://docs.oracle.com/javase/specs/),
58      chapter 3 "Lexical Structure".  */
59 
60 
61 /* ====================== Keyword set customization.  ====================== */
62 
63 /* If true extract all strings.  */
64 static bool extract_all = false;
65 
66 static hash_table keywords;
67 static bool default_keywords = true;
68 
69 
70 void
x_java_extract_all()71 x_java_extract_all ()
72 {
73   extract_all = true;
74 }
75 
76 
77 void
x_java_keyword(const char * name)78 x_java_keyword (const char *name)
79 {
80   if (name == NULL)
81     default_keywords = false;
82   else
83     {
84       const char *end;
85       struct callshape shape;
86       const char *colon;
87 
88       if (keywords.table == NULL)
89         hash_init (&keywords, 100);
90 
91       split_keywordspec (name, &end, &shape);
92 
93       /* The characters between name and end should form a valid Java
94          identifier sequence with dots.
95          A colon means an invalid parse in split_keywordspec().  */
96       colon = strchr (name, ':');
97       if (colon == NULL || colon >= end)
98         insert_keyword_callshape (&keywords, name, end - name, &shape);
99     }
100 }
101 
102 /* Finish initializing the keywords hash table.
103    Called after argument processing, before each file is processed.  */
104 static void
init_keywords()105 init_keywords ()
106 {
107   if (default_keywords)
108     {
109       /* When adding new keywords here, also update the documentation in
110          xgettext.texi!  */
111       x_java_keyword ("GettextResource.gettext:2");        /* static method */
112       x_java_keyword ("GettextResource.ngettext:2,3");     /* static method */
113       x_java_keyword ("GettextResource.pgettext:2c,3");    /* static method */
114       x_java_keyword ("GettextResource.npgettext:2c,3,4"); /* static method */
115       x_java_keyword ("gettext");
116       x_java_keyword ("ngettext:1,2");
117       x_java_keyword ("pgettext:1c,2");
118       x_java_keyword ("npgettext:1c,2,3");
119       x_java_keyword ("getString");     /* ResourceBundle.getString */
120       default_keywords = false;
121     }
122 }
123 
/* Record the format-string flags of the functions known to this backend.
   Each specification is passed verbatim to xgettext_record_flag.  */
void
init_flag_table_java (void)
{
  xgettext_record_flag ("GettextResource.gettext:2:pass-java-format");
  xgettext_record_flag ("GettextResource.gettext:2:pass-java-printf-format");
  xgettext_record_flag ("GettextResource.ngettext:2:pass-java-format");
  xgettext_record_flag ("GettextResource.ngettext:2:pass-java-printf-format");
  xgettext_record_flag ("GettextResource.ngettext:3:pass-java-format");
  xgettext_record_flag ("GettextResource.ngettext:3:pass-java-printf-format");
  xgettext_record_flag ("GettextResource.pgettext:3:pass-java-format");
  xgettext_record_flag ("GettextResource.pgettext:3:pass-java-printf-format");
  xgettext_record_flag ("GettextResource.npgettext:3:pass-java-format");
  xgettext_record_flag ("GettextResource.npgettext:3:pass-java-printf-format");
  xgettext_record_flag ("GettextResource.npgettext:4:pass-java-format");
  xgettext_record_flag ("GettextResource.npgettext:4:pass-java-printf-format");
  xgettext_record_flag ("gettext:1:pass-java-format");
  xgettext_record_flag ("gettext:1:pass-java-printf-format");
  xgettext_record_flag ("ngettext:1:pass-java-format");
  xgettext_record_flag ("ngettext:1:pass-java-printf-format");
  xgettext_record_flag ("ngettext:2:pass-java-format");
  xgettext_record_flag ("ngettext:2:pass-java-printf-format");
  xgettext_record_flag ("pgettext:2:pass-java-format");
  xgettext_record_flag ("pgettext:2:pass-java-printf-format");
  xgettext_record_flag ("npgettext:2:pass-java-format");
  xgettext_record_flag ("npgettext:2:pass-java-printf-format");
  xgettext_record_flag ("npgettext:3:pass-java-format");
  xgettext_record_flag ("npgettext:3:pass-java-printf-format");
  xgettext_record_flag ("getString:1:pass-java-format");
  xgettext_record_flag ("getString:1:pass-java-printf-format");
  xgettext_record_flag ("MessageFormat:1:java-format");
  xgettext_record_flag ("MessageFormat.format:1:java-format");
  xgettext_record_flag ("String.format:1:java-printf-format");
  xgettext_record_flag ("printf:1:java-printf-format"); /* PrintStream.printf */
}
158 
159 
160 /* ======================== Reading of characters.  ======================== */
161 
162 /* The input file stream.  */
163 static FILE *fp;
164 
165 
166 /* Fetch the next single-byte character from the input file.
167    Pushback can consist of an unlimited number of 'u' followed by up to 4
168    other characters.  */
169 
170 /* Special coding of multiple 'u's in the pushback buffer.  */
171 #define MULTIPLE_U(count) (0x1000 + (count))
172 
173 static int phase1_pushback[5];
174 static unsigned int phase1_pushback_length;
175 
176 static int
phase1_getc()177 phase1_getc ()
178 {
179   int c;
180 
181   if (phase1_pushback_length)
182     {
183       c = phase1_pushback[--phase1_pushback_length];
184       if (c >= MULTIPLE_U (0))
185         {
186           if (c > MULTIPLE_U (1))
187             phase1_pushback[phase1_pushback_length++] = c - 1;
188           return 'u';
189         }
190       else
191         return c;
192     }
193 
194   c = getc (fp);
195 
196   if (c == EOF)
197     {
198       if (ferror (fp))
199         error (EXIT_FAILURE, errno,
200                _("error while reading \"%s\""), real_file_name);
201     }
202 
203   return c;
204 }
205 
206 /* Supports any number of 'u' and up to 4 arbitrary characters of pushback.  */
207 static void
phase1_ungetc(int c)208 phase1_ungetc (int c)
209 {
210   if (c != EOF)
211     {
212       if (c == 'u')
213         {
214           if (phase1_pushback_length > 0
215               && phase1_pushback[phase1_pushback_length - 1] >= MULTIPLE_U (0))
216             phase1_pushback[phase1_pushback_length - 1]++;
217           else
218             {
219               if (phase1_pushback_length == SIZEOF (phase1_pushback))
220                 abort ();
221               phase1_pushback[phase1_pushback_length++] = MULTIPLE_U (1);
222             }
223         }
224       else
225         {
226           if (phase1_pushback_length == SIZEOF (phase1_pushback))
227             abort ();
228           phase1_pushback[phase1_pushback_length++] = c;
229         }
230     }
231 }
232 
233 
/* Fetch the next single-byte character or Unicode character from the file.
   (Here, as in the Java Language Specification, when we say "Unicode
   character", we actually mean "UTF-16 encoding unit".)  */

/* Return value of phase 2, 3, 4 when EOF is reached.  */
#define P2_EOF 0xffff

/* Convert a UTF-16 code point to a return value that can be distinguished
   from a single-byte return value.  */
#define UNICODE(code) (0x10000 + (code))

/* Test a return value of phase 2, 3, 4 whether it designates a UTF-16 code
   point.  */
#define IS_UNICODE(p2_result) ((p2_result) >= 0x10000)

/* Extract the UTF-16 code of a return value that satisfies IS_UNICODE.  */
#define UTF16_VALUE(p2_result) ((p2_result) - 0x10000)

/* Reduces a return value of phase 2, 3, 4 by unmasking the UNICODE bit,
   so that it can be more easily compared against an ASCII character.
   (RED (c) == 'x')  is equivalent to  (c == 'x' || c == UNICODE ('x')).  */
#define RED(p2_result) ((p2_result) & 0xffff)

/* Two slots: one for the character a caller gives back through
   phase2_ungetc, one for the second backslash of a backslash pair that
   phase2_getc itself keeps for the next call.  */
static int phase2_pushback[2];
static int phase2_pushback_length;

/* Return the next character with Unicode escapes decoded.
   A backslash followed by one or more 'u' and exactly four hexadecimal
   digits yields UNICODE (n).  If the digits are incomplete, everything
   after the backslash is pushed back and '\\' is returned.
   Returns P2_EOF at end of input.  */
static int
phase2_getc (void)
{
  int c;

  if (phase2_pushback_length)
    return phase2_pushback[--phase2_pushback_length];

  c = phase1_getc ();
  if (c == EOF)
    return P2_EOF;
  if (c == '\\')
    {
      c = phase1_getc ();
      if (c == 'u')
        {
          unsigned int u_count = 1;
          unsigned char buf[4];
          unsigned int n;
          int i;

          /* Any number of 'u's may precede the hexadecimal digits.  */
          for (;;)
            {
              c = phase1_getc ();
              if (c != 'u')
                break;
              u_count++;
            }
          phase1_ungetc (c);

          n = 0;
          for (i = 0; i < 4; i++)
            {
              c = phase1_getc ();

              if (c >= '0' && c <= '9')
                n = (n << 4) + (c - '0');
              else if (c >= 'A' && c <= 'F')
                n = (n << 4) + (c - 'A' + 10);
              else if (c >= 'a' && c <= 'f')
                n = (n << 4) + (c - 'a' + 10);
              else
                {
                  /* Not a Unicode escape.  Push everything back, in
                     reverse order of reading, and deliver the backslash
                     alone.  */
                  phase1_ungetc (c);
                  while (--i >= 0)
                    phase1_ungetc (buf[i]);
                  for (; u_count > 0; u_count--)
                    phase1_ungetc ('u');
                  return '\\';
                }

              buf[i] = c;
            }
          return UNICODE (n);
        }
      if (c == '\\')
        {
          /* Two backslashes in a row.  The second one is preceded by an odd
             number of backslashes and is therefore not eligible to begin a
             Unicode escape (JLS section 3.3), even if a 'u' follows it.
             Keep it in the phase 2 pushback, so that the next call returns
             it without looking at what follows.  The pushback is empty
             here, since it was tested at the top of this function.  */
          phase2_pushback[phase2_pushback_length++] = '\\';
          return '\\';
        }
      phase1_ungetc (c);
      return '\\';
    }
  return c;
}
320 
321 /* Supports only one pushback character.  */
322 static void
phase2_ungetc(int c)323 phase2_ungetc (int c)
324 {
325   if (c != P2_EOF)
326     {
327       if (phase2_pushback_length == SIZEOF (phase2_pushback))
328         abort ();
329       phase2_pushback[phase2_pushback_length++] = c;
330     }
331 }
332 
333 
334 /* Fetch the next single-byte character or Unicode character from the file.
335    With line number handling.
336    Convert line terminators to '\n' or UNICODE ('\n').  */
337 
338 static int phase3_pushback[2];
339 static int phase3_pushback_length;
340 
341 static int
phase3_getc()342 phase3_getc ()
343 {
344   int c;
345 
346   if (phase3_pushback_length)
347     {
348       c = phase3_pushback[--phase3_pushback_length];
349       if (c == '\n')
350         ++line_number;
351       return c;
352     }
353 
354   c = phase2_getc ();
355 
356   /* Handle line terminators.  */
357   if (RED (c) == '\r')
358     {
359       int c1 = phase2_getc ();
360 
361       if (RED (c1) != '\n')
362         phase2_ungetc (c1);
363 
364       /* Seen line terminator CR or CR/LF.  */
365       if (c == '\r' || c1 == '\n')
366         {
367           ++line_number;
368           return '\n';
369         }
370       else
371         return UNICODE ('\n');
372     }
373   else if (RED (c) == '\n')
374     {
375       /* Seen line terminator LF.  */
376       if (c == '\n')
377         {
378           ++line_number;
379           return '\n';
380         }
381       else
382         return UNICODE ('\n');
383     }
384 
385   return c;
386 }
387 
388 /* Supports 2 characters of pushback.  */
389 static void
phase3_ungetc(int c)390 phase3_ungetc (int c)
391 {
392   if (c != P2_EOF)
393     {
394       if (c == '\n')
395         --line_number;
396       if (phase3_pushback_length == SIZEOF (phase3_pushback))
397         abort ();
398       phase3_pushback[phase3_pushback_length++] = c;
399     }
400 }
401 
402 
403 /* ========================= Accumulating strings.  ======================== */
404 
405 /* See xg-mixed-string.h for the main API.  */
406 
/* Append the phase 2/3/4 result C to the 'struct mixed_string_buffer' BP.
   A value that satisfies IS_UNICODE is appended as a Unicode character,
   any other value as a single byte.  */
static void
mixed_string_buffer_append (struct mixed_string_buffer *bp, int c)
{
  if (!IS_UNICODE (c))
    {
      /* A single byte.  */
      mixed_string_buffer_append_char (bp, (unsigned char) c);
      return;
    }

  /* A Unicode character.  */
  mixed_string_buffer_append_unicode (bp, UTF16_VALUE (c));
}
422 
423 
424 /* ======================== Accumulating comments.  ======================== */
425 
426 
427 /* Accumulating a single comment line.  */
428 
429 static struct mixed_string_buffer comment_buffer;
430 
431 static inline void
comment_start()432 comment_start ()
433 {
434   mixed_string_buffer_init (&comment_buffer, lc_comment,
435                             logical_file_name, line_number);
436 }
437 
438 static inline bool
comment_at_start()439 comment_at_start ()
440 {
441   return mixed_string_buffer_is_empty (&comment_buffer);
442 }
443 
444 static inline void
comment_add(int c)445 comment_add (int c)
446 {
447   mixed_string_buffer_append (&comment_buffer, c);
448 }
449 
450 static inline void
comment_line_end(size_t chars_to_remove)451 comment_line_end (size_t chars_to_remove)
452 {
453   char *buffer =
454     mixed_string_contents_free1 (mixed_string_buffer_result (&comment_buffer));
455   size_t buflen = strlen (buffer);
456 
457   buflen -= chars_to_remove;
458   while (buflen >= 1
459          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
460     --buflen;
461   buffer[buflen] = '\0';
462   savable_comment_add (buffer);
463 }
464 
465 
466 /* These are for tracking whether comments count as immediately before
467    keyword.  */
468 static int last_comment_line;
469 static int last_non_comment_line;
470 
471 
472 /* Replace each comment that is not inside a character constant or string
473    literal with a space or newline character.  */
474 
475 static int
phase4_getc()476 phase4_getc ()
477 {
478   int c0;
479   int c;
480   bool last_was_star;
481 
482   c0 = phase3_getc ();
483   if (RED (c0) != '/')
484     return c0;
485   c = phase3_getc ();
486   switch (RED (c))
487     {
488     default:
489       phase3_ungetc (c);
490       return c0;
491 
492     case '*':
493       /* C style comment.  */
494       comment_start ();
495       last_was_star = false;
496       for (;;)
497         {
498           c = phase3_getc ();
499           if (c == P2_EOF)
500             break;
501           /* We skip all leading white space, but not EOLs.  */
502           if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
503             comment_add (c);
504           switch (RED (c))
505             {
506             case '\n':
507               comment_line_end (1);
508               comment_start ();
509               last_was_star = false;
510               continue;
511 
512             case '*':
513               last_was_star = true;
514               continue;
515 
516             case '/':
517               if (last_was_star)
518                 {
519                   comment_line_end (2);
520                   break;
521                 }
522               /* FALLTHROUGH */
523 
524             default:
525               last_was_star = false;
526               continue;
527             }
528           break;
529         }
530       last_comment_line = line_number;
531       return ' ';
532 
533     case '/':
534       /* C++ style comment.  */
535       last_comment_line = line_number;
536       comment_start ();
537       for (;;)
538         {
539           c = phase3_getc ();
540           if (RED (c) == '\n' || c == P2_EOF)
541             break;
542           /* We skip all leading white space, but not EOLs.  */
543           if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
544             comment_add (c);
545         }
546       phase3_ungetc (c); /* push back the newline, to decrement line_number */
547       comment_line_end (0);
548       phase3_getc (); /* read the newline again */
549       return '\n';
550     }
551 }
552 
/* Push the phase 4 result C back by handing it to phase3_ungetc.
   Supports only one pushback character.  */
static void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
559 
560 
561 /* ========================== Reading of tokens.  ========================== */
562 
563 enum token_type_ty
564 {
565   token_type_eof,
566   token_type_lparen,            /* ( */
567   token_type_rparen,            /* ) */
568   token_type_lbrace,            /* { */
569   token_type_rbrace,            /* } */
570   token_type_comma,             /* , */
571   token_type_dot,               /* . */
572   token_type_string_literal,    /* "abc", """text block""" */
573   token_type_number,            /* 1.23 */
574   token_type_symbol,            /* identifier, keyword, null */
575   token_type_plus,              /* + */
576   token_type_other              /* character literal, misc. operator */
577 };
578 typedef enum token_type_ty token_type_ty;
579 
580 typedef struct token_ty token_ty;
581 struct token_ty
582 {
583   token_type_ty type;
584   char *string;                         /* for token_type_symbol */
585   mixed_string_ty *mixed_string;        /* for token_type_string_literal */
586   refcounted_string_list_ty *comment;   /* for token_type_string_literal */
587   int line_number;
588 };
589 
590 
591 /* Free the memory pointed to by a 'struct token_ty'.  */
592 static inline void
free_token(token_ty * tp)593 free_token (token_ty *tp)
594 {
595   if (tp->type == token_type_symbol)
596     free (tp->string);
597   if (tp->type == token_type_string_literal)
598     {
599       free (tp->mixed_string);
600       drop_reference (tp->comment);
601     }
602 }
603 
604 
605 /* Read an escape sequence inside a string literal or character literal.  */
606 static inline int
do_getc_escaped()607 do_getc_escaped ()
608 {
609   int c;
610 
611   /* Use phase 3, because phase 4 elides comments.  */
612   c = phase3_getc ();
613   if (c == P2_EOF)
614     return UNICODE ('\\');
615   switch (RED (c))
616     {
617     case 'b':
618       return UNICODE (0x08);
619     case 't':
620       return UNICODE (0x09);
621     case 'n':
622       return UNICODE (0x0a);
623     case 'f':
624       return UNICODE (0x0c);
625     case 'r':
626       return UNICODE (0x0d);
627     case '"':
628       return UNICODE ('"');
629     case '\'':
630       return UNICODE ('\'');
631     case '\\':
632       return UNICODE ('\\');
633     case '0': case '1': case '2': case '3':
634     case '4': case '5': case '6': case '7':
635       {
636         int n = RED (c) - '0';
637         bool maybe3digits = (n < 4);
638 
639         c = phase3_getc ();
640         if (RED (c) >= '0' && RED (c) <= '7')
641           {
642             n = (n << 3) + (RED (c) - '0');
643             if (maybe3digits)
644               {
645                 c = phase3_getc ();
646                 if (RED (c) >= '0' && RED (c) <= '7')
647                   n = (n << 3) + (RED (c) - '0');
648                 else
649                   phase3_ungetc (c);
650               }
651           }
652         else
653           phase3_ungetc (c);
654 
655         return UNICODE (n);
656       }
657     default:
658       /* Invalid escape sequence.  */
659       phase3_ungetc (c);
660       return UNICODE ('\\');
661     }
662 }
663 
664 /* Read a string literal or character literal.  */
665 static void
accumulate_escaped(struct mixed_string_buffer * literal,int delimiter)666 accumulate_escaped (struct mixed_string_buffer *literal, int delimiter)
667 {
668   int c;
669 
670   for (;;)
671     {
672       /* Use phase 3, because phase 4 elides comments.  */
673       c = phase3_getc ();
674       if (c == P2_EOF || RED (c) == delimiter)
675         break;
676       if (RED (c) == '\n')
677         {
678           phase3_ungetc (c);
679           error_with_progname = false;
680           if (delimiter == '\'')
681             error (0, 0, _("%s:%d: warning: unterminated character constant"),
682                    logical_file_name, line_number);
683           else
684             error (0, 0, _("%s:%d: warning: unterminated string constant"),
685                    logical_file_name, line_number);
686           error_with_progname = true;
687           break;
688         }
689       if (RED (c) == '\\')
690         c = do_getc_escaped ();
691       mixed_string_buffer_append (literal, c);
692     }
693 }
694 
695 
696 /* Strip the common indentation of the non-blank lines of the given string and
697    remove all trailing whitespace of all lines.
698    Like the Java method String.stripIndent does.
699    <https://docs.oracle.com/en/java/javase/13/docs/api/java.base/java/lang/String.html#stripIndent()>  */
700 static void
strip_indent(mixed_string_ty * ms)701 strip_indent (mixed_string_ty *ms)
702 {
703   size_t nsegments = ms->nsegments;
704   size_t minimum_indentation = SIZE_MAX;
705   {
706     size_t curr_line_indentation = 0;
707     bool curr_line_blank = true;
708     size_t i;
709 
710     for (i = 0; i < nsegments; i++)
711       {
712         struct mixed_string_segment *segment = ms->segments[i];
713 
714         if (segment->type == utf8_encoded
715             || (segment->type == source_encoded
716                 && xgettext_current_source_encoding == po_charset_utf8))
717           {
718             /* Consider Unicode whitespace characters.  */
719             size_t seglength = segment->length;
720             size_t j;
721 
722             for (j = 0; j < seglength; )
723               {
724                 ucs4_t uc;
725                 int bytes =
726                   u8_mbtouc (&uc, (const uint8_t *) &segment->contents[j],
727                              seglength - j);
728                 j += bytes;
729                 if (uc == 0x000a)
730                   {
731                     /* Newline.  */
732                     if (!curr_line_blank)
733                       if (minimum_indentation > curr_line_indentation)
734                         minimum_indentation = curr_line_indentation;
735                     curr_line_indentation = 0;
736                     curr_line_blank = true;
737                   }
738                 else if (uc_is_java_whitespace (uc))
739                   {
740                     /* Whitespace character.  */
741                     if (curr_line_blank)
742                       /* Every whitespace character counts as 1, even the TAB
743                          character.  */
744                       curr_line_indentation++;
745                   }
746                 else
747                   {
748                     /* Other character.  */
749                     curr_line_blank = false;
750                   }
751               }
752           }
753         else
754           {
755             /* When the encoding is not UTF-8, consider only ASCII whitespace
756                characters.  */
757             size_t seglength = segment->length;
758             size_t j;
759 
760             for (j = 0; j < seglength; j++)
761               {
762                 char c = segment->contents[j];
763                 if (c == '\n')
764                   {
765                     /* Newline.  */
766                     if (!curr_line_blank)
767                       if (minimum_indentation > curr_line_indentation)
768                         minimum_indentation = curr_line_indentation;
769                     curr_line_indentation = 0;
770                     curr_line_blank = true;
771                   }
772                 else if (c == ' '
773                          || (c >= 0x09 && c <= 0x0d)
774                          || (c >= 0x1c && c <= 0x1f))
775                   {
776                     /* Whitespace character.  */
777                     if (curr_line_blank)
778                       /* Every whitespace character counts as 1, even the TAB
779                          character.  */
780                       curr_line_indentation++;
781                   }
782                 else
783                   {
784                     /* Other character.  */
785                     curr_line_blank = false;
786                   }
787               }
788           }
789       }
790     /* The indentation of the last line matters even if is blank.  */
791     if (minimum_indentation > curr_line_indentation)
792       minimum_indentation = curr_line_indentation;
793   }
794 
795   /* The same loop as above, but this time remove the leading
796      minimum_indentation whitespace characters and all trailing whitespace
797      characters from every line.  */
798   {
799     size_t start_of_curr_line_i = 0;
800     size_t start_of_curr_line_j = 0;
801     size_t start_of_trailing_whitespace_i = 0;
802     size_t start_of_trailing_whitespace_j = 0;
803     size_t whitespace_to_remove = minimum_indentation;
804     size_t i;
805 
806     for (i = 0; i < nsegments; i++)
807       {
808         struct mixed_string_segment *segment = ms->segments[i];
809         /* Perform a sliding copy from segment->contents[from_j] to
810            segment->contents[to_j].  0 <= to_j <= from_j.  */
811         size_t to_j;
812 
813         if (segment->type == utf8_encoded
814             || (segment->type == source_encoded
815                 && xgettext_current_source_encoding == po_charset_utf8))
816           {
817             /* Consider Unicode whitespace characters.  */
818             size_t seglength = segment->length;
819             size_t from_j;
820 
821             for (to_j = from_j = 0; from_j < seglength; )
822               {
823                 ucs4_t uc;
824                 int bytes =
825                   u8_mbtouc (&uc, (const uint8_t *) &segment->contents[from_j],
826                              seglength - from_j);
827                 if (uc == 0x000a)
828                   {
829                     /* Newline.  */
830                     if (whitespace_to_remove > 0)
831                       {
832                         /* It was a blank line with fewer than minimum_indentation
833                            whitespace characters.  Remove all this whitespace.  */
834                         if (start_of_curr_line_i < i)
835                           {
836                             size_t k;
837                             ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
838                             for (k = start_of_curr_line_i + 1; k < i; k++)
839                               ms->segments[k]->length = 0;
840                             to_j = 0;
841                           }
842                         else
843                           to_j = start_of_curr_line_j;
844                       }
845                     else
846                       {
847                         /* Remove the trailing whitespace characters from the
848                            current line.  */
849                         if (start_of_trailing_whitespace_i < i)
850                           {
851                             size_t k;
852                             ms->segments[start_of_trailing_whitespace_i]->length = start_of_trailing_whitespace_j;
853                             for (k = start_of_trailing_whitespace_i + 1; k < i; k++)
854                               ms->segments[k]->length = 0;
855                             to_j = 0;
856                           }
857                         else
858                           to_j = start_of_trailing_whitespace_j;
859                       }
860                   }
861                 if (to_j < from_j)
862                   memmove (&segment->contents[to_j], &segment->contents[from_j], bytes);
863                 from_j += bytes;
864                 to_j += bytes;
865                 if (uc == 0x000a)
866                   {
867                     /* Newline.  */
868                     start_of_curr_line_i = i;
869                     start_of_curr_line_j = to_j;
870                     start_of_trailing_whitespace_i = i;
871                     start_of_trailing_whitespace_j = to_j;
872                     whitespace_to_remove = minimum_indentation;
873                   }
874                 else if (uc_is_java_whitespace (uc))
875                   {
876                     /* Whitespace character.  */
877                     if (whitespace_to_remove > 0
878                         && --whitespace_to_remove == 0)
879                       {
880                         /* Remove the leading minimum_indentation whitespace
881                            characters from the current line.  */
882                         if (start_of_curr_line_i < i)
883                           {
884                             size_t k;
885                             ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
886                             for (k = start_of_curr_line_i + 1; k < i; k++)
887                               ms->segments[k]->length = 0;
888                             to_j = 0;
889                           }
890                         else
891                           to_j = start_of_curr_line_j;
892                       }
893                   }
894                 else
895                   {
896                     /* Other character.  */
897                     if (whitespace_to_remove > 0)
898                       abort ();
899                     start_of_trailing_whitespace_i = i;
900                     start_of_trailing_whitespace_j = to_j;
901                   }
902               }
903           }
904         else
905           {
906             /* When the encoding is not UTF-8, consider only ASCII whitespace
907                characters.  */
908             size_t seglength = segment->length;
909             size_t from_j;
910 
911             for (to_j = from_j = 0; from_j < seglength; )
912               {
913                 char c = segment->contents[from_j++];
914                 if (c == '\n')
915                   {
916                     /* Newline.  */
917                     if (whitespace_to_remove > 0)
918                       {
919                         /* It was a blank line with fewer than minimum_indentation
920                            whitespace characters.  Remove all this whitespace.  */
921                         if (start_of_curr_line_i < i)
922                           {
923                             size_t k;
924                             ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
925                             for (k = start_of_curr_line_i + 1; k < i; k++)
926                               ms->segments[k]->length = 0;
927                             to_j = 0;
928                           }
929                         else
930                           to_j = start_of_curr_line_j;
931                       }
932                     else
933                       {
934                         /* Remove the trailing whitespace characters from the
935                            current line.  */
936                         if (start_of_trailing_whitespace_i < i)
937                           {
938                             size_t k;
939                             ms->segments[start_of_trailing_whitespace_i]->length = start_of_trailing_whitespace_j;
940                             for (k = start_of_trailing_whitespace_i + 1; k < i; k++)
941                               ms->segments[k]->length = 0;
942                             to_j = 0;
943                           }
944                         else
945                           to_j = start_of_trailing_whitespace_j;
946                       }
947                   }
948                 segment->contents[to_j++] = c;
949                 if (c == '\n')
950                   {
951                     /* Newline.  */
952                     start_of_curr_line_i = i;
953                     start_of_curr_line_j = to_j;
954                     start_of_trailing_whitespace_i = i;
955                     start_of_trailing_whitespace_j = to_j;
956                     whitespace_to_remove = minimum_indentation;
957                   }
958                 else if (c == ' '
959                          || (c >= 0x09 && c <= 0x0d)
960                          || (c >= 0x1c && c <= 0x1f))
961                   {
962                     /* Whitespace character.  */
963                     if (whitespace_to_remove > 0
964                         && --whitespace_to_remove == 0)
965                       {
966                         /* Remove the leading minimum_indentation whitespace
967                            characters from the current line.  */
968                         if (start_of_curr_line_i < i)
969                           {
970                             size_t k;
971                             ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
972                             for (k = start_of_curr_line_i + 1; k < i; k++)
973                               ms->segments[k]->length = 0;
974                             to_j = 0;
975                           }
976                         else
977                           to_j = start_of_curr_line_j;
978                       }
979                   }
980                 else
981                   {
982                     /* Other character.  */
983                     if (whitespace_to_remove > 0)
984                       abort ();
985                     start_of_trailing_whitespace_i = i;
986                     start_of_trailing_whitespace_j = to_j;
987                   }
988               }
989           }
990         if (i + 1 == nsegments)
991           {
992             /* Handle the last line.  */
993             if (whitespace_to_remove > 0)
994               {
995                 /* It was a blank line with fewer than minimum_indentation
996                    whitespace characters.  Remove all this whitespace.  */
997                 if (start_of_curr_line_i < i)
998                   {
999                     size_t k;
1000                     ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
1001                     for (k = start_of_curr_line_i + 1; k < i; k++)
1002                       ms->segments[k]->length = 0;
1003                     to_j = 0;
1004                   }
1005                 else
1006                   to_j = start_of_curr_line_j;
1007               }
1008             else
1009               {
1010                 /* Remove the trailing whitespace characters from the
1011                    current line.  */
1012                 if (start_of_trailing_whitespace_i < i)
1013                   {
1014                     size_t k;
1015                     ms->segments[start_of_trailing_whitespace_i]->length = start_of_trailing_whitespace_j;
1016                     for (k = start_of_trailing_whitespace_i + 1; k < i; k++)
1017                       ms->segments[k]->length = 0;
1018                     to_j = 0;
1019                   }
1020                 else
1021                   to_j = start_of_trailing_whitespace_j;
1022               }
1023           }
1024         segment->length = to_j;
1025       }
1026   }
1027 }
1028 
1029 
1030 /* Combine characters into tokens.  Discard whitespace.  */
1031 
1032 static token_ty phase5_pushback[3];
1033 static int phase5_pushback_length;
1034 
1035 static void
phase5_get(token_ty * tp)1036 phase5_get (token_ty *tp)
1037 {
1038   int c;
1039 
1040   if (phase5_pushback_length)
1041     {
1042       *tp = phase5_pushback[--phase5_pushback_length];
1043       return;
1044     }
1045   tp->string = NULL;
1046 
1047   for (;;)
1048     {
1049       tp->line_number = line_number;
1050       c = phase4_getc ();
1051 
1052       if (c == P2_EOF)
1053         {
1054           tp->type = token_type_eof;
1055           return;
1056         }
1057 
1058       switch (RED (c))
1059         {
1060         case '\n':
1061           if (last_non_comment_line > last_comment_line)
1062             savable_comment_reset ();
1063           /* FALLTHROUGH */
1064         case ' ':
1065         case '\t':
1066         case '\f':
1067           /* Ignore whitespace and comments.  */
1068           continue;
1069         }
1070 
1071       last_non_comment_line = tp->line_number;
1072 
1073       switch (RED (c))
1074         {
1075         case '(':
1076           tp->type = token_type_lparen;
1077           return;
1078 
1079         case ')':
1080           tp->type = token_type_rparen;
1081           return;
1082 
1083         case '{':
1084           tp->type = token_type_lbrace;
1085           return;
1086 
1087         case '}':
1088           tp->type = token_type_rbrace;
1089           return;
1090 
1091         case ',':
1092           tp->type = token_type_comma;
1093           return;
1094 
1095         case '.':
1096           c = phase4_getc ();
1097           if (!(RED (c) >= '0' && RED (c) <= '9'))
1098             {
1099               phase4_ungetc (c);
1100               tp->type = token_type_dot;
1101               return;
1102             }
1103           /* FALLTHROUGH */
1104 
1105         case '0': case '1': case '2': case '3': case '4':
1106         case '5': case '6': case '7': case '8': case '9':
1107           {
1108             /* Don't need to verify the complicated syntax of integers and
1109                floating-point numbers.  We assume a valid Java input.
1110                The simplified syntax that we recognize as number is: any
1111                sequence of alphanumeric characters, additionally '+' and '-'
1112                immediately after 'e' or 'E' except in hexadecimal numbers.  */
1113             bool hexadecimal = false;
1114 
1115             for (;;)
1116               {
1117                 c = phase4_getc ();
1118                 if (RED (c) >= '0' && RED (c) <= '9')
1119                   continue;
1120                 if ((RED (c) >= 'A' && RED (c) <= 'Z')
1121                     || (RED (c) >= 'a' && RED (c) <= 'z'))
1122                   {
1123                     if (RED (c) == 'X' || RED (c) == 'x')
1124                       hexadecimal = true;
1125                     if ((RED (c) == 'E' || RED (c) == 'e') && !hexadecimal)
1126                       {
1127                         c = phase4_getc ();
1128                         if (!(RED (c) == '+' || RED (c) == '-'))
1129                           phase4_ungetc (c);
1130                       }
1131                     continue;
1132                   }
1133                 if (RED (c) == '.')
1134                   continue;
1135                 break;
1136               }
1137             phase4_ungetc (c);
1138             tp->type = token_type_number;
1139             return;
1140           }
1141 
1142         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
1143         case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
1144         case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
1145         case 'V': case 'W': case 'X': case 'Y': case 'Z':
1146         case '_':
1147         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
1148         case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
1149         case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
1150         case 'v': case 'w': case 'x': case 'y': case 'z':
1151           /* Although Java allows identifiers containing many Unicode
1152              characters, we recognize only identifiers consisting of ASCII
1153              characters.  This avoids conversion hassles w.r.t. the --keyword
1154              arguments, and shouldn't be a big problem in practice.  */
1155           {
1156             static char *buffer;
1157             static int bufmax;
1158             int bufpos = 0;
1159             for (;;)
1160               {
1161                 if (bufpos >= bufmax)
1162                   {
1163                     bufmax = 2 * bufmax + 10;
1164                     buffer = xrealloc (buffer, bufmax);
1165                   }
1166                 buffer[bufpos++] = RED (c);
1167                 c = phase4_getc ();
1168                 if (!((RED (c) >= 'A' && RED (c) <= 'Z')
1169                       || (RED (c) >= 'a' && RED (c) <= 'z')
1170                       || (RED (c) >= '0' && RED (c) <= '9')
1171                       || RED (c) == '_'))
1172                   break;
1173               }
1174             phase4_ungetc (c);
1175             if (bufpos >= bufmax)
1176               {
1177                 bufmax = 2 * bufmax + 10;
1178                 buffer = xrealloc (buffer, bufmax);
1179               }
1180             buffer[bufpos] = '\0';
1181             tp->string = xstrdup (buffer);
1182             tp->type = token_type_symbol;
1183             return;
1184           }
1185 
1186         case '"':
1187           {
1188             int c2 = phase3_getc ();
1189             if (c2 == '"')
1190               {
1191                 int c3 = phase3_getc ();
1192                 if (c3 == '"')
1193                   {
1194                     /* Text block.  Specification:
1195                        <https://docs.oracle.com/javase/specs/jls/se13/preview/text-blocks.html>  */
1196                     struct mixed_string_buffer block;
1197                     unsigned int consecutive_unescaped_doublequotes;
1198                     mixed_string_ty *block_content;
1199 
1200                     /* Parse the part up to and including the first newline.  */
1201                     for (;;)
1202                       {
1203                         int ic = phase3_getc ();
1204                         if (ic == P2_EOF)
1205                           {
1206                             error_with_progname = false;
1207                             error (0, 0, _("%s:%d: warning: unterminated text block"),
1208                                    logical_file_name, line_number);
1209                             error_with_progname = true;
1210                             tp->type = token_type_other;
1211                             return;
1212                           }
1213                         if (RED (ic) == ' ' || RED (ic) == '\t' || RED (ic) == '\f')
1214                           ;
1215                         else if (RED (ic) == '\n')
1216                           break;
1217                         else
1218                           {
1219                             error_with_progname = false;
1220                             error (0, 0, _("%s:%d: warning: invalid syntax in text block"),
1221                                    logical_file_name, line_number);
1222                             error_with_progname = true;
1223                             tp->type = token_type_other;
1224                             return;
1225                           }
1226                       }
1227 
1228                     /* Parse the part after the first newline.  */
1229                     mixed_string_buffer_init (&block, lc_string,
1230                                               logical_file_name, line_number);
1231                     consecutive_unescaped_doublequotes = 0;
1232                     for (;;)
1233                       {
1234                         int ic = phase3_getc ();
1235                         if (RED (ic) == '"')
1236                           {
1237                             consecutive_unescaped_doublequotes++;
1238                             if (consecutive_unescaped_doublequotes == 3)
1239                               break;
1240                           }
1241                         else
1242                           {
1243                             while (consecutive_unescaped_doublequotes > 0)
1244                               {
1245                                 mixed_string_buffer_append (&block, '"');
1246                                 consecutive_unescaped_doublequotes--;
1247                               }
1248                             if (ic == P2_EOF)
1249                               {
1250                                 error_with_progname = false;
1251                                 error (0, 0, _("%s:%d: warning: unterminated text block"),
1252                                        logical_file_name, block.line_number);
1253                                 error_with_progname = true;
1254                                 break;
1255                               }
1256                             if (RED (ic) == '\\')
1257                               ic = do_getc_escaped ();
1258                             mixed_string_buffer_append (&block, ic);
1259                           }
1260                       }
1261                     block_content = mixed_string_buffer_result (&block);
1262 
1263                     /* Remove the common indentation from the content.  */
1264                     strip_indent (block_content);
1265 
1266                     tp->mixed_string = block_content;
1267                     tp->comment = add_reference (savable_comment);
1268                     tp->type = token_type_string_literal;
1269                     return;
1270                   }
1271                 phase3_ungetc (c3);
1272               }
1273             phase3_ungetc (c2);
1274           }
1275           /* String literal.  */
1276           {
1277             struct mixed_string_buffer literal;
1278 
1279             mixed_string_buffer_init (&literal, lc_string,
1280                                       logical_file_name, line_number);
1281             accumulate_escaped (&literal, '"');
1282             tp->mixed_string = mixed_string_buffer_result (&literal);
1283             tp->comment = add_reference (savable_comment);
1284             tp->type = token_type_string_literal;
1285             return;
1286           }
1287 
1288         case '\'':
1289           /* Character literal.  */
1290           {
1291             struct mixed_string_buffer literal;
1292 
1293             mixed_string_buffer_init (&literal, lc_outside,
1294                                       logical_file_name, line_number);
1295             accumulate_escaped (&literal, '\'');
1296             mixed_string_buffer_destroy (&literal);
1297             tp->type = token_type_other;
1298             return;
1299           }
1300 
1301         case '+':
1302           c = phase4_getc ();
1303           if (RED (c) == '+')
1304             /* Operator ++ */
1305             tp->type = token_type_other;
1306           else if (RED (c) == '=')
1307             /* Operator += */
1308             tp->type = token_type_other;
1309           else
1310             {
1311               /* Operator + */
1312               phase4_ungetc (c);
1313               tp->type = token_type_plus;
1314             }
1315           return;
1316 
1317         default:
1318           /* Misc. operator.  */
1319           tp->type = token_type_other;
1320           return;
1321         }
1322     }
1323 }
1324 
1325 /* Supports 3 tokens of pushback.  */
1326 static void
phase5_unget(token_ty * tp)1327 phase5_unget (token_ty *tp)
1328 {
1329   if (tp->type != token_type_eof)
1330     {
1331       if (phase5_pushback_length == SIZEOF (phase5_pushback))
1332         abort ();
1333       phase5_pushback[phase5_pushback_length++] = *tp;
1334     }
1335 }
1336 
1337 
1338 /* Compile-time optimization of string literal concatenation.
1339    Combine "string1" + ... + "stringN" to the concatenated string if
1340      - the token before this expression is not ')' (because then the first
1341        string could be part of a cast expression),
1342      - the token after this expression is not '.' (because then the last
1343        string could be part of a method call expression).  */
1344 
1345 static token_ty phase6_pushback[2];
1346 static int phase6_pushback_length;
1347 
1348 static token_type_ty phase6_last;
1349 
1350 static void
phase6_get(token_ty * tp)1351 phase6_get (token_ty *tp)
1352 {
1353   if (phase6_pushback_length)
1354     {
1355       *tp = phase6_pushback[--phase6_pushback_length];
1356       return;
1357     }
1358 
1359   phase5_get (tp);
1360   if (tp->type == token_type_string_literal && phase6_last != token_type_rparen)
1361     {
1362       mixed_string_ty *sum = tp->mixed_string;
1363 
1364       for (;;)
1365         {
1366           token_ty token2;
1367 
1368           phase5_get (&token2);
1369           if (token2.type == token_type_plus)
1370             {
1371               token_ty token3;
1372 
1373               phase5_get (&token3);
1374               if (token3.type == token_type_string_literal)
1375                 {
1376                   token_ty token_after;
1377 
1378                   phase5_get (&token_after);
1379                   if (token_after.type != token_type_dot)
1380                     {
1381                       sum = mixed_string_concat_free1 (sum, token3.mixed_string);
1382 
1383                       phase5_unget (&token_after);
1384                       free_token (&token3);
1385                       free_token (&token2);
1386                       continue;
1387                     }
1388                   phase5_unget (&token_after);
1389                 }
1390               phase5_unget (&token3);
1391             }
1392           phase5_unget (&token2);
1393           break;
1394         }
1395       tp->mixed_string = sum;
1396     }
1397   phase6_last = tp->type;
1398 }
1399 
1400 /* Supports 2 tokens of pushback.  */
1401 static void
phase6_unget(token_ty * tp)1402 phase6_unget (token_ty *tp)
1403 {
1404   if (tp->type != token_type_eof)
1405     {
1406       if (phase6_pushback_length == SIZEOF (phase6_pushback))
1407         abort ();
1408       phase6_pushback[phase6_pushback_length++] = *tp;
1409     }
1410 }
1411 
1412 
1413 static void
x_java_lex(token_ty * tp)1414 x_java_lex (token_ty *tp)
1415 {
1416   phase6_get (tp);
1417 }
1418 
1419 /* Supports 2 tokens of pushback.  */
1420 static void
x_java_unlex(token_ty * tp)1421 x_java_unlex (token_ty *tp)
1422 {
1423   phase6_unget (tp);
1424 }
1425 
1426 
1427 /* ========================= Extracting strings.  ========================== */
1428 
1429 
1430 /* Context lookup table.  */
1431 static flag_context_list_table_ty *flag_context_list_table;
1432 
1433 
1434 /* The file is broken into tokens.  Scan the token stream, looking for
1435    a keyword, followed by a left paren, followed by a string.  When we
1436    see this sequence, we have something to remember.  We assume we are
1437    looking at a valid C or C++ program, and leave the complaints about
1438    the grammar to the compiler.
1439 
1440      Normal handling: Look for
1441        keyword ( ... msgid ... )
1442      Plural handling: Look for
1443        keyword ( ... msgid ... msgid_plural ... )
1444 
1445    We use recursion because the arguments before msgid or between msgid
1446    and msgid_plural can contain subexpressions of the same form.  */
1447 
1448 
1449 /* Extract messages until the next balanced closing parenthesis or brace,
1450    depending on TERMINATOR.
1451    Extracted messages are added to MLP.
1452    Return true upon eof, false upon closing parenthesis or brace.  */
1453 static bool
extract_parenthesized(message_list_ty * mlp,token_type_ty terminator,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1454 extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
1455                        flag_context_ty outer_context,
1456                        flag_context_list_iterator_ty context_iter,
1457                        struct arglist_parser *argparser)
1458 {
1459   /* Current argument number.  */
1460   int arg = 1;
1461   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1462   int state;
1463   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1464   const struct callshapes *next_shapes = NULL;
1465   /* Context iterator that will be used if the next token is a '('.  */
1466   flag_context_list_iterator_ty next_context_iter =
1467     passthrough_context_list_iterator;
1468   /* Current context.  */
1469   flag_context_ty inner_context =
1470     inherited_context (outer_context,
1471                        flag_context_list_iterator_advance (&context_iter));
1472 
1473   /* Start state is 0.  */
1474   state = 0;
1475 
1476   for (;;)
1477     {
1478       token_ty token;
1479 
1480       x_java_lex (&token);
1481       switch (token.type)
1482         {
1483         case token_type_symbol:
1484           {
1485             /* Combine symbol1 . ... . symbolN to a single strings, so that
1486                we can recognize static function calls like
1487                GettextResource.gettext.  The information present for
1488                symbolI.....symbolN has precedence over the information for
1489                symbolJ.....symbolN with J > I.  */
1490             char *sum = token.string;
1491             size_t sum_len = strlen (sum);
1492             const char *dottedname;
1493             flag_context_list_ty *context_list;
1494 
1495             for (;;)
1496               {
1497                 token_ty token2;
1498 
1499                 x_java_lex (&token2);
1500                 if (token2.type == token_type_dot)
1501                   {
1502                     token_ty token3;
1503 
1504                     x_java_lex (&token3);
1505                     if (token3.type == token_type_symbol)
1506                       {
1507                         char *addend = token3.string;
1508                         size_t addend_len = strlen (addend);
1509 
1510                         sum =
1511                           (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
1512                         sum[sum_len] = '.';
1513                         memcpy (sum + sum_len + 1, addend, addend_len + 1);
1514                         sum_len += 1 + addend_len;
1515 
1516                         free_token (&token3);
1517                         free_token (&token2);
1518                         continue;
1519                       }
1520                     x_java_unlex (&token3);
1521                   }
1522                 x_java_unlex (&token2);
1523                 break;
1524               }
1525 
1526             for (dottedname = sum;;)
1527               {
1528                 void *keyword_value;
1529 
1530                 if (hash_find_entry (&keywords, dottedname, strlen (dottedname),
1531                                      &keyword_value)
1532                     == 0)
1533                   {
1534                     next_shapes = (const struct callshapes *) keyword_value;
1535                     state = 1;
1536                     break;
1537                   }
1538 
1539                 dottedname = strchr (dottedname, '.');
1540                 if (dottedname == NULL)
1541                   {
1542                     state = 0;
1543                     break;
1544                   }
1545                 dottedname++;
1546               }
1547 
1548             for (dottedname = sum;;)
1549               {
1550                 context_list =
1551                   flag_context_list_table_lookup (
1552                     flag_context_list_table,
1553                     dottedname, strlen (dottedname));
1554                 if (context_list != NULL)
1555                   break;
1556 
1557                 dottedname = strchr (dottedname, '.');
1558                 if (dottedname == NULL)
1559                   break;
1560                 dottedname++;
1561               }
1562             next_context_iter = flag_context_list_iterator (context_list);
1563 
1564             free (sum);
1565             continue;
1566           }
1567 
1568         case token_type_lparen:
1569           if (extract_parenthesized (mlp, token_type_rparen,
1570                                      inner_context, next_context_iter,
1571                                      arglist_parser_alloc (mlp,
1572                                                            state ? next_shapes : NULL)))
1573             {
1574               arglist_parser_done (argparser, arg);
1575               return true;
1576             }
1577           next_context_iter = null_context_list_iterator;
1578           state = 0;
1579           continue;
1580 
1581         case token_type_rparen:
1582           if (terminator == token_type_rparen)
1583             {
1584               arglist_parser_done (argparser, arg);
1585               return false;
1586             }
1587           if (terminator == token_type_rbrace)
1588             {
1589               error_with_progname = false;
1590               error (0, 0,
1591                      _("%s:%d: warning: ')' found where '}' was expected"),
1592                      logical_file_name, token.line_number);
1593               error_with_progname = true;
1594             }
1595           next_context_iter = null_context_list_iterator;
1596           state = 0;
1597           continue;
1598 
1599         case token_type_lbrace:
1600           if (extract_parenthesized (mlp, token_type_rbrace,
1601                                      null_context, null_context_list_iterator,
1602                                      arglist_parser_alloc (mlp, NULL)))
1603             {
1604               arglist_parser_done (argparser, arg);
1605               return true;
1606             }
1607           next_context_iter = null_context_list_iterator;
1608           state = 0;
1609           continue;
1610 
1611         case token_type_rbrace:
1612           if (terminator == token_type_rbrace)
1613             {
1614               arglist_parser_done (argparser, arg);
1615               return false;
1616             }
1617           if (terminator == token_type_rparen)
1618             {
1619               error_with_progname = false;
1620               error (0, 0,
1621                      _("%s:%d: warning: '}' found where ')' was expected"),
1622                      logical_file_name, token.line_number);
1623               error_with_progname = true;
1624             }
1625           next_context_iter = null_context_list_iterator;
1626           state = 0;
1627           continue;
1628 
1629         case token_type_comma:
1630           arg++;
1631           inner_context =
1632             inherited_context (outer_context,
1633                                flag_context_list_iterator_advance (
1634                                  &context_iter));
1635           next_context_iter = passthrough_context_list_iterator;
1636           state = 0;
1637           continue;
1638 
1639         case token_type_string_literal:
1640           {
1641             lex_pos_ty pos;
1642 
1643             pos.file_name = logical_file_name;
1644             pos.line_number = token.line_number;
1645 
1646             if (extract_all)
1647               {
1648                 char *string = mixed_string_contents (token.mixed_string);
1649                 mixed_string_free (token.mixed_string);
1650                 remember_a_message (mlp, NULL, string, true, false,
1651                                     inner_context, &pos,
1652                                     NULL, token.comment, true);
1653               }
1654             else
1655               arglist_parser_remember (argparser, arg, token.mixed_string,
1656                                        inner_context,
1657                                        pos.file_name, pos.line_number,
1658                                        token.comment, true);
1659           }
1660           drop_reference (token.comment);
1661           next_context_iter = null_context_list_iterator;
1662           state = 0;
1663           continue;
1664 
1665         case token_type_eof:
1666           arglist_parser_done (argparser, arg);
1667           return true;
1668 
1669         case token_type_dot:
1670         case token_type_number:
1671         case token_type_plus:
1672         case token_type_other:
1673           next_context_iter = null_context_list_iterator;
1674           state = 0;
1675           continue;
1676 
1677         default:
1678           abort ();
1679         }
1680     }
1681 }
1682 
1683 
1684 void
extract_java(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1685 extract_java (FILE *f,
1686               const char *real_filename, const char *logical_filename,
1687               flag_context_list_table_ty *flag_table,
1688               msgdomain_list_ty *mdlp)
1689 {
1690   message_list_ty *mlp = mdlp->item[0]->messages;
1691 
1692   fp = f;
1693   real_file_name = real_filename;
1694   logical_file_name = xstrdup (logical_filename);
1695   line_number = 1;
1696 
1697   phase1_pushback_length = 0;
1698   phase2_pushback_length = 0;
1699   phase3_pushback_length = 0;
1700 
1701   last_comment_line = -1;
1702   last_non_comment_line = -1;
1703 
1704   phase5_pushback_length = 0;
1705   phase6_pushback_length = 0;
1706   phase6_last = token_type_eof;
1707 
1708   flag_context_list_table = flag_table;
1709 
1710   init_keywords ();
1711 
1712   /* Eat tokens until eof is seen.  When extract_parenthesized returns
1713      due to an unbalanced closing parenthesis, just restart it.  */
1714   while (!extract_parenthesized (mlp, token_type_eof,
1715                                  null_context, null_context_list_iterator,
1716                                  arglist_parser_alloc (mlp, NULL)))
1717     ;
1718 
1719   fp = NULL;
1720   real_file_name = NULL;
1721   logical_file_name = NULL;
1722   line_number = 0;
1723 }
1724