1 /* xgettext C/C++/ObjectiveC backend.
2    Copyright (C) 1995-1998, 2000-2009, 2012-2015, 2018-2020 Free Software Foundation, Inc.
3 
4    This file was written by Peter Miller <millerp@canb.auug.org.au>
5 
6    This program is free software: you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
18 
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22 
23 /* Specification.  */
24 #include "x-c.h"
25 
26 #include <assert.h>
27 #include <errno.h>
28 #include <stdbool.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 
33 #include "message.h"
34 #include "rc-str-list.h"
35 #include "xgettext.h"
36 #include "xg-pos.h"
37 #include "xg-encoding.h"
38 #include "xg-mixed-string.h"
39 #include "xg-arglist-context.h"
40 #include "xg-arglist-callshape.h"
41 #include "xg-arglist-parser.h"
42 #include "xg-message.h"
43 #include "error.h"
44 #include "error-progname.h"
45 #include "xalloc.h"
46 #include "xvasprintf.h"
47 #include "mem-hash-map.h"
48 #include "po-charset.h"
49 #include "gettext.h"
50 
51 #define _(s) gettext(s)
52 
53 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
54 
55 
56 /* The ANSI C standard defines several phases of translation:
57 
58    1. Terminate line by \n, regardless of the external representation
59       of a text line.  Stdio does this for us.
60 
61    2. Convert trigraphs to their single character equivalents.
62 
63    3. Concatenate each line ending in backslash (\) with the following
64       line.
65 
66    4. Replace each comment with a space character.
67 
   5. Parse each resulting logical line as preprocessing tokens and
      white space.
70 
71    6. Recognize and carry out directives (it also expands macros on
72       non-directive lines, which we do not do here).
73 
74    7. Replaces escape sequences within character strings with their
75       single character equivalents (we do this in step 5, because we
76       don't have to worry about the #include argument).
77 
78    8. Concatenates adjacent string literals to form single string
79       literals (because we don't expand macros, there are a few things
80       we will miss).
81 
82    9. Converts the remaining preprocessing tokens to C tokens and
83       discards any white space from the translation unit.
84 
85    This lexer implements the above, and presents the scanner (in
86    xgettext.c) with a stream of C tokens.  The comments are
87    accumulated in a buffer, and given to xgettext when asked for.  */
88 
89 
90 /* ========================= Lexer customization.  ========================= */
91 
/* True if trigraph sequences are to be converted; consulted by
   phase2_getc.  Off unless x_c_trigraphs() has been called.  */
static bool trigraphs = false;

/* Switch on trigraph conversion.  There is no way to switch it off
   again.  */
void
x_c_trigraphs ()
{
  trigraphs = true;
}
99 
100 
101 /* ====================== Keyword set customization.  ====================== */
102 
103 /* If true extract all strings.  */
104 static bool extract_all = false;
105 
106 static hash_table c_keywords;
107 static hash_table objc_keywords;
108 static bool default_keywords = true;
109 
110 
111 void
x_c_extract_all()112 x_c_extract_all ()
113 {
114   extract_all = true;
115 }
116 
117 
118 static void
add_keyword(const char * name,hash_table * keywords)119 add_keyword (const char *name, hash_table *keywords)
120 {
121   if (name == NULL)
122     default_keywords = false;
123   else
124     {
125       const char *end;
126       struct callshape shape;
127       const char *colon;
128 
129       if (keywords->table == NULL)
130         hash_init (keywords, 100);
131 
132       split_keywordspec (name, &end, &shape);
133 
134       /* The characters between name and end should form a valid C identifier.
135          A colon means an invalid parse in split_keywordspec().  */
136       colon = strchr (name, ':');
137       if (colon == NULL || colon >= end)
138         insert_keyword_callshape (keywords, name, end - name, &shape);
139     }
140 }
141 
142 void
x_c_keyword(const char * name)143 x_c_keyword (const char *name)
144 {
145   add_keyword (name, &c_keywords);
146 }
147 
148 void
x_objc_keyword(const char * name)149 x_objc_keyword (const char *name)
150 {
151   add_keyword (name, &objc_keywords);
152 }
153 
/* True if init_keywords shall also register the KDE keywords as part of
   the default C/C++ keyword set.  */
static bool additional_keywords_kde;

/* Ask for the KDE keywords to be included among the default keywords.  */
void
activate_additional_keywords_kde ()
{
  additional_keywords_kde = true;
}
161 
162 /* Finish initializing the keywords hash tables.
163    Called after argument processing, before each file is processed.  */
164 static void
init_keywords()165 init_keywords ()
166 {
167   if (default_keywords)
168     {
169       /* When adding new keywords here, also update the documentation in
170          xgettext.texi!  */
171       x_c_keyword ("gettext");
172       x_c_keyword ("dgettext:2");
173       x_c_keyword ("dcgettext:2");
174       x_c_keyword ("ngettext:1,2");
175       x_c_keyword ("dngettext:2,3");
176       x_c_keyword ("dcngettext:2,3");
177       x_c_keyword ("gettext_noop");
178       x_c_keyword ("pgettext:1c,2");
179       x_c_keyword ("dpgettext:2c,3");
180       x_c_keyword ("dcpgettext:2c,3");
181       x_c_keyword ("npgettext:1c,2,3");
182       x_c_keyword ("dnpgettext:2c,3,4");
183       x_c_keyword ("dcnpgettext:2c,3,4");
184 
185       if (additional_keywords_kde)
186         {
187           x_c_keyword ("i18n:1");
188           x_c_keyword ("i18nc:1c,2");
189           x_c_keyword ("i18np:1,2");
190           x_c_keyword ("i18ncp:1c,2,3");
191           x_c_keyword ("i18nd:2");
192           x_c_keyword ("i18ndc:2c,3");
193           x_c_keyword ("i18ndp:2,3");
194           x_c_keyword ("i18ndcp:2c,3,4");
195           x_c_keyword ("ki18n:1");
196           x_c_keyword ("ki18nc:1c,2");
197           x_c_keyword ("ki18np:1,2");
198           x_c_keyword ("ki18ncp:1c,2,3");
199           x_c_keyword ("ki18nd:2");
200           x_c_keyword ("ki18ndc:2c,3");
201           x_c_keyword ("ki18ndp:2,3");
202           x_c_keyword ("ki18ndcp:2c,3,4");
203           x_c_keyword ("I18N_NOOP:1");
204           x_c_keyword ("I18NC_NOOP:1c,2");
205           x_c_keyword ("I18N_NOOP2:1c,2");
206           x_c_keyword ("I18N_NOOP2_NOSTRIP:1c,2");
207           x_c_keyword ("xi18n:1");
208           x_c_keyword ("xi18nc:1c,2");
209           x_c_keyword ("xi18np:1,2");
210           x_c_keyword ("xi18ncp:1c,2,3");
211           x_c_keyword ("xi18nd:2");
212           x_c_keyword ("xi18ndc:2c,3");
213           x_c_keyword ("xi18ndp:2,3");
214           x_c_keyword ("xi18ndcp:2c,3,4");
215           x_c_keyword ("kxi18n:1");
216           x_c_keyword ("kxi18nc:1c,2");
217           x_c_keyword ("kxi18np:1,2");
218           x_c_keyword ("kxi18ncp:1c,2,3");
219           x_c_keyword ("kxi18nd:2");
220           x_c_keyword ("kxi18ndc:2c,3");
221           x_c_keyword ("kxi18ndp:2,3");
222           x_c_keyword ("kxi18ndcp:2c,3,4");
223           x_c_keyword ("XI18N_NOOP:1");
224           x_c_keyword ("XI18NC_NOOP:1c,2");
225           x_c_keyword ("XI18N_NOOP2:1c,2");
226           x_c_keyword ("XI18N_NOOP2_NOSTRIP:1c,2");
227         }
228 
229       x_objc_keyword ("gettext");
230       x_objc_keyword ("dgettext:2");
231       x_objc_keyword ("dcgettext:2");
232       x_objc_keyword ("ngettext:1,2");
233       x_objc_keyword ("dngettext:2,3");
234       x_objc_keyword ("dcngettext:2,3");
235       x_objc_keyword ("gettext_noop");
236       x_objc_keyword ("pgettext:1c,2");
237       x_objc_keyword ("dpgettext:2c,3");
238       x_objc_keyword ("dcpgettext:2c,3");
239       x_objc_keyword ("npgettext:1c,2,3");
240       x_objc_keyword ("dnpgettext:2c,3,4");
241       x_objc_keyword ("dcnpgettext:2c,3,4");
242       x_objc_keyword ("NSLocalizedString");       /* similar to gettext */
243       x_objc_keyword ("_");                       /* similar to gettext */
244       x_objc_keyword ("NSLocalizedStaticString"); /* similar to gettext_noop */
245       x_objc_keyword ("__");                      /* similar to gettext_noop */
246 
247       default_keywords = false;
248     }
249 }
250 
/* Record the built-in flag specifications for C and C++ by passing each
   entry of the table below, in order, to xgettext_record_flag.  */
void
init_flag_table_c ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-c-format",
      "dgettext:2:pass-c-format",
      "dcgettext:2:pass-c-format",
      "ngettext:1:pass-c-format",
      "ngettext:2:pass-c-format",
      "dngettext:2:pass-c-format",
      "dngettext:3:pass-c-format",
      "dcngettext:2:pass-c-format",
      "dcngettext:3:pass-c-format",
      "gettext_noop:1:pass-c-format",
      "pgettext:2:pass-c-format",
      "dpgettext:3:pass-c-format",
      "dcpgettext:3:pass-c-format",
      "npgettext:2:pass-c-format",
      "npgettext:3:pass-c-format",
      "dnpgettext:3:pass-c-format",
      "dnpgettext:4:pass-c-format",
      "dcnpgettext:3:pass-c-format",
      "dcnpgettext:4:pass-c-format",

      /* <stdio.h> */
      "fprintf:2:c-format",
      "vfprintf:2:c-format",
      "printf:1:c-format",
      "vprintf:1:c-format",
      "sprintf:2:c-format",
      "vsprintf:2:c-format",
      "snprintf:3:c-format",
      "vsnprintf:3:c-format",
#if 0 /* These functions are not standard.  */
      /* <stdio.h> */
      "asprintf:2:c-format",
      "vasprintf:2:c-format",
      "dprintf:2:c-format",
      "vdprintf:2:c-format",
      "obstack_printf:2:c-format",
      "obstack_vprintf:2:c-format",
      /* <error.h> */
      "error:3:c-format",
      "error_at_line:5:c-format",
      /* <argp.h> */
      "argp_error:2:c-format",
      "argp_failure:2:c-format",
#endif

      "gettext:1:pass-qt-format",
      "dgettext:2:pass-qt-format",
      "dcgettext:2:pass-qt-format",
      "ngettext:1:pass-qt-format",
      "ngettext:2:pass-qt-format",
      "dngettext:2:pass-qt-format",
      "dngettext:3:pass-qt-format",
      "dcngettext:2:pass-qt-format",
      "dcngettext:3:pass-qt-format",
      "gettext_noop:1:pass-qt-format",
      "pgettext:2:pass-qt-format",
      "dpgettext:3:pass-qt-format",
      "dcpgettext:3:pass-qt-format",
      "npgettext:2:pass-qt-format",
      "npgettext:3:pass-qt-format",
      "dnpgettext:3:pass-qt-format",
      "dnpgettext:4:pass-qt-format",
      "dcnpgettext:3:pass-qt-format",
      "dcnpgettext:4:pass-qt-format",

      "gettext:1:pass-kde-format",
      "dgettext:2:pass-kde-format",
      "dcgettext:2:pass-kde-format",
      "ngettext:1:pass-kde-format",
      "ngettext:2:pass-kde-format",
      "dngettext:2:pass-kde-format",
      "dngettext:3:pass-kde-format",
      "dcngettext:2:pass-kde-format",
      "dcngettext:3:pass-kde-format",
      "gettext_noop:1:pass-kde-format",
      "pgettext:2:pass-kde-format",
      "dpgettext:3:pass-kde-format",
      "dcpgettext:3:pass-kde-format",
      "npgettext:2:pass-kde-format",
      "npgettext:3:pass-kde-format",
      "dnpgettext:3:pass-kde-format",
      "dnpgettext:4:pass-kde-format",
      "dcnpgettext:3:pass-kde-format",
      "dcnpgettext:4:pass-kde-format",

      "gettext:1:pass-boost-format",
      "dgettext:2:pass-boost-format",
      "dcgettext:2:pass-boost-format",
      "ngettext:1:pass-boost-format",
      "ngettext:2:pass-boost-format",
      "dngettext:2:pass-boost-format",
      "dngettext:3:pass-boost-format",
      "dcngettext:2:pass-boost-format",
      "dcngettext:3:pass-boost-format",
      "gettext_noop:1:pass-boost-format",
      "pgettext:2:pass-boost-format",
      "dpgettext:3:pass-boost-format",
      "dcpgettext:3:pass-boost-format",
      "npgettext:2:pass-boost-format",
      "npgettext:3:pass-boost-format",
      "dnpgettext:3:pass-boost-format",
      "dnpgettext:4:pass-boost-format",
      "dcnpgettext:3:pass-boost-format",
      "dcnpgettext:4:pass-boost-format",

      /* <boost/format.hpp> */
      "format:1:boost-format",
    };
  size_t i;

  for (i = 0; i < sizeof flag_specs / sizeof flag_specs[0]; i++)
    xgettext_record_flag (flag_specs[i]);
}
362 
/* Record the built-in flag specifications specific to ObjectiveC by
   passing each entry of the table below, in order, to
   xgettext_record_flag.  */
void
init_flag_table_objc ()
{
  /* Since the settings done in init_flag_table_c() also have an effect for
     the ObjectiveC parser, we don't have to repeat them here.  */
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-objc-format",
      "dgettext:2:pass-objc-format",
      "dcgettext:2:pass-objc-format",
      "ngettext:1:pass-objc-format",
      "ngettext:2:pass-objc-format",
      "dngettext:2:pass-objc-format",
      "dngettext:3:pass-objc-format",
      "dcngettext:2:pass-objc-format",
      "dcngettext:3:pass-objc-format",
      "gettext_noop:1:pass-objc-format",
      "pgettext:2:pass-objc-format",
      "dpgettext:3:pass-objc-format",
      "dcpgettext:3:pass-objc-format",
      "npgettext:2:pass-objc-format",
      "npgettext:3:pass-objc-format",
      "dnpgettext:3:pass-objc-format",
      "dnpgettext:4:pass-objc-format",
      "dcnpgettext:3:pass-objc-format",
      "dcnpgettext:4:pass-objc-format",
      "NSLocalizedString:1:pass-c-format",
      "NSLocalizedString:1:pass-objc-format",
      "_:1:pass-c-format",
      "_:1:pass-objc-format",
      "stringWithFormat::1:objc-format",
      "initWithFormat::1:objc-format",
      "stringByAppendingFormat::1:objc-format",
      "localizedStringWithFormat::1:objc-format",
      "appendFormat::1:objc-format",
    };
  size_t i;

  for (i = 0; i < sizeof flag_specs / sizeof flag_specs[0]; i++)
    xgettext_record_flag (flag_specs[i]);
}
397 
/* Record the built-in flag specifications for the gcc-internal-format and
   gfc-internal-format flags by passing each active entry of the table
   below, in order, to xgettext_record_flag.  The entries inside '#if 0'
   are disabled and kept for reference only.  */
void
init_flag_table_gcc_internal ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-gcc-internal-format",
      "dgettext:2:pass-gcc-internal-format",
      "dcgettext:2:pass-gcc-internal-format",
      "ngettext:1:pass-gcc-internal-format",
      "ngettext:2:pass-gcc-internal-format",
      "dngettext:2:pass-gcc-internal-format",
      "dngettext:3:pass-gcc-internal-format",
      "dcngettext:2:pass-gcc-internal-format",
      "dcngettext:3:pass-gcc-internal-format",
      "gettext_noop:1:pass-gcc-internal-format",
      "pgettext:2:pass-gcc-internal-format",
      "dpgettext:3:pass-gcc-internal-format",
      "dcpgettext:3:pass-gcc-internal-format",
      "npgettext:2:pass-gcc-internal-format",
      "npgettext:3:pass-gcc-internal-format",
      "dnpgettext:3:pass-gcc-internal-format",
      "dnpgettext:4:pass-gcc-internal-format",
      "dcnpgettext:3:pass-gcc-internal-format",
      "dcnpgettext:4:pass-gcc-internal-format",
#if 0 /* This should better be done inside GCC.  */
      /* grepping for ATTRIBUTE_PRINTF in gcc-3.3/gcc/?*.h */
      /* c-format.c */
      "status_warning:2:gcc-internal-format",
      /* c-tree.h */
      "pedwarn_c99:1:pass-gcc-internal-format",
      /* collect2.h */
      //"error:1:c-format", // 3 different versions
      "notice:1:c-format",
      //"fatal:1:c-format", // 2 different versions
      "fatal_perror:1:c-format",
      /* cpplib.h */
      "cpp_error:3:c-format",
      "cpp_error_with_line:5:c-format",
      /* diagnostic.h */
      "diagnostic_set_info:2:pass-gcc-internal-format",
      "output_printf:2:gcc-internal-format",
      "output_verbatim:2:pass-gcc-internal-format",
      "verbatim:1:gcc-internal-format",
      "inform:1:pass-gcc-internal-format",
      /* gcc.h */
      //"fatal:1:c-format", // 2 different versions
      //"error:1:c-format", // 3 different versions
      /* genattrtab.h */
      "attr_printf:2:pass-c-format",
      /* gengtype.h */
      "error_at_line:2:pass-c-format",
      "xvasprintf:2:pass-c-format",
      "xasprintf:1:pass-c-format",
      "oprintf:2:pass-c-format",
      /* gensupport.h */
      "message_with_line:2:pass-c-format",
      /* output.h */
      "output_operand_lossage:1:c-format",
      /* ra.h */
      "ra_debug_msg:2:pass-c-format",
      /* toplev.h */
      "fnotice:2:c-format",
      "fatal_io_error:2:gcc-internal-format",
      "error_for_asm:2:pass-gcc-internal-format",
      "warning_for_asm:2:pass-gcc-internal-format",
      "error_with_file_and_line:3:pass-gcc-internal-format",
      "error_with_decl:2:pass-gcc-internal-format",
      "pedwarn:1:gcc-internal-format",
      "pedwarn_with_file_and_line:3:gcc-internal-format",
      "pedwarn_with_decl:2:gcc-internal-format",
      "sorry:1:gcc-internal-format",
      "error:1:pass-gcc-internal-format",
      "fatal_error:1:pass-gcc-internal-format",
      "internal_error:1:pass-gcc-internal-format",
      "warning:1:pass-gcc-internal-format",
      "warning_with_file_and_line:3:pass-gcc-internal-format",
      "warning_with_decl:2:pass-gcc-internal-format",
      /* f/com.h */
      "ffecom_get_invented_identifier:1:pass-c-format",
      /* f/sts.h */
      "ffests_printf:2:pass-c-format",
      /* java/java-tree.h */
      "parse_error_context:2:pass-c-format",
#endif

      "gettext:1:pass-gfc-internal-format",
      "dgettext:2:pass-gfc-internal-format",
      "dcgettext:2:pass-gfc-internal-format",
      "ngettext:1:pass-gfc-internal-format",
      "ngettext:2:pass-gfc-internal-format",
      "dngettext:2:pass-gfc-internal-format",
      "dngettext:3:pass-gfc-internal-format",
      "dcngettext:2:pass-gfc-internal-format",
      "dcngettext:3:pass-gfc-internal-format",
      "gettext_noop:1:pass-gfc-internal-format",
      "pgettext:2:pass-gfc-internal-format",
      "dpgettext:3:pass-gfc-internal-format",
      "dcpgettext:3:pass-gfc-internal-format",
      "npgettext:2:pass-gfc-internal-format",
      "npgettext:3:pass-gfc-internal-format",
      "dnpgettext:3:pass-gfc-internal-format",
      "dnpgettext:4:pass-gfc-internal-format",
      "dcnpgettext:3:pass-gfc-internal-format",
      "dcnpgettext:4:pass-gfc-internal-format",
#if 0 /* This should better be done inside GCC.  */
      /* fortran/error.c */
      "gfc_error:1:gfc-internal-format",
      "gfc_error_now:1:gfc-internal-format",
      "gfc_fatal_error:1:gfc-internal-format",
      "gfc_internal_error:1:gfc-internal-format",
      "gfc_notify_std:2:gfc-internal-format",
      "gfc_warning:1:gfc-internal-format",
      "gfc_warning_now:1:gfc-internal-format",
#endif
    };
  size_t i;

  for (i = 0; i < sizeof flag_specs / sizeof flag_specs[0]; i++)
    xgettext_record_flag (flag_specs[i]);
}
511 
/* Record the built-in flag specifications for the KDE keywords
   (kde-format and kde-kuit-format) by passing each entry of the table
   below, in order, to xgettext_record_flag.  */
void
init_flag_table_kde ()
{
  static const char *const flag_specs[] =
    {
      "i18n:1:kde-format",
      "i18nc:2:kde-format",
      "i18np:1:kde-format",
      "i18ncp:2:kde-format",
      "i18nd:2:kde-format",
      "i18ndc:3:kde-format",
      "i18ndp:2:kde-format",
      "i18ndcp:3:kde-format",
      "ki18n:1:kde-format",
      "ki18nc:2:kde-format",
      "ki18np:1:kde-format",
      "ki18ncp:2:kde-format",
      "ki18nd:2:kde-format",
      "ki18ndc:3:kde-format",
      "ki18ndp:2:kde-format",
      "ki18ndcp:3:kde-format",
      "I18N_NOOP:1:kde-format",
      "I18NC_NOOP:2:kde-format",
      "I18N_NOOP2:2:kde-format",
      "I18N_NOOP2_NOSTRIP:2:kde-format",
      "xi18n:1:kde-kuit-format",
      "xi18nc:2:kde-kuit-format",
      "xi18np:1:kde-kuit-format",
      "xi18ncp:2:kde-kuit-format",
      "xi18nd:2:kde-kuit-format",
      "xi18ndc:3:kde-kuit-format",
      "xi18ndp:2:kde-kuit-format",
      "xi18ndcp:3:kde-kuit-format",
      "kxi18n:1:kde-kuit-format",
      "kxi18nc:2:kde-kuit-format",
      "kxi18np:1:kde-kuit-format",
      "kxi18ncp:2:kde-kuit-format",
      "kxi18nd:2:kde-kuit-format",
      "kxi18ndc:3:kde-kuit-format",
      "kxi18ndp:2:kde-kuit-format",
      "kxi18ndcp:3:kde-kuit-format",
      "XI18N_NOOP:1:kde-kuit-format",
      "XI18NC_NOOP:2:kde-kuit-format",
      "XI18N_NOOP2:2:kde-kuit-format",
      "XI18N_NOOP2_NOSTRIP:2:kde-kuit-format",
    };
  size_t i;

  for (i = 0; i < sizeof flag_specs / sizeof flag_specs[0]; i++)
    xgettext_record_flag (flag_specs[i]);
}
556 
557 /* ======================== Reading of characters.  ======================== */
558 
559 /* The input file stream.  */
560 static FILE *fp;
561 
562 
563 /* 0. Terminate line by \n, regardless whether the external representation of
564    a line terminator is LF (Unix), CR (Mac) or CR/LF (DOS/Windows).
565    It is debatable whether supporting CR/LF line terminators in C sources
566    on Unix is ISO C or POSIX compliant, but since GCC 3.3 now supports it
567    unconditionally, it must be OK.
568    The so-called "text mode" in stdio on DOS/Windows translates CR/LF to \n
569    automatically, but here we also need this conversion on Unix.  As a side
570    effect, on DOS/Windows we also parse CR/CR/LF into a single \n, but this
571    is not a problem.  */
572 
573 
574 static int
phase0_getc()575 phase0_getc ()
576 {
577   int c;
578 
579   c = getc (fp);
580   if (c == EOF)
581     {
582       if (ferror (fp))
583         error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
584                real_file_name);
585       return EOF;
586     }
587 
588   if (c == '\r')
589     {
590       int c1 = getc (fp);
591 
592       if (c1 != EOF && c1 != '\n')
593         ungetc (c1, fp);
594 
595       /* Seen line terminator CR or CR/LF.  */
596       return '\n';
597     }
598 
599   return c;
600 }
601 
602 
603 /* Supports only one pushback character, and not '\n'.  */
604 static inline void
phase0_ungetc(int c)605 phase0_ungetc (int c)
606 {
607   if (c != EOF)
608     ungetc (c, fp);
609 }
610 
611 
612 /* 1. line_number handling.  Combine backslash-newline to nothing.  */
613 
614 static unsigned char phase1_pushback[2];
615 static int phase1_pushback_length;
616 
617 
618 static int
phase1_getc()619 phase1_getc ()
620 {
621   int c;
622 
623   if (phase1_pushback_length)
624     {
625       c = phase1_pushback[--phase1_pushback_length];
626       if (c == '\n')
627         ++line_number;
628       return c;
629     }
630   for (;;)
631     {
632       c = phase0_getc ();
633       switch (c)
634         {
635         case '\n':
636           ++line_number;
637           return '\n';
638 
639         case '\\':
640           c = phase0_getc ();
641           if (c != '\n')
642             {
643               phase0_ungetc (c);
644               return '\\';
645             }
646           ++line_number;
647           break;
648 
649         default:
650           return c;
651         }
652     }
653 }
654 
655 
656 /* Supports 2 characters of pushback.  */
657 static void
phase1_ungetc(int c)658 phase1_ungetc (int c)
659 {
660   switch (c)
661     {
662     case EOF:
663       break;
664 
665     case '\n':
666       --line_number;
667       /* FALLTHROUGH */
668 
669     default:
670       if (phase1_pushback_length == SIZEOF (phase1_pushback))
671         abort ();
672       phase1_pushback[phase1_pushback_length++] = c;
673       break;
674     }
675 }
676 
677 
678 /* 2. Convert trigraphs to their single character equivalents.  Most
679    sane human beings vomit copiously at the mention of trigraphs, which
680    is why they are an option.  */
681 
682 static unsigned char phase2_pushback[1];
683 static int phase2_pushback_length;
684 
685 
686 static int
phase2_getc()687 phase2_getc ()
688 {
689   int c;
690 
691   if (phase2_pushback_length)
692     return phase2_pushback[--phase2_pushback_length];
693   if (!trigraphs)
694     return phase1_getc ();
695 
696   c = phase1_getc ();
697   if (c != '?')
698     return c;
699   c = phase1_getc ();
700   if (c != '?')
701     {
702       phase1_ungetc (c);
703       return '?';
704     }
705   c = phase1_getc ();
706   switch (c)
707     {
708     case '(':
709       return '[';
710     case '/':
711       return '\\';
712     case ')':
713       return ']';
714     case '\'':
715       return '^';
716     case '<':
717       return '{';
718     case '!':
719       return '|';
720     case '>':
721       return '}';
722     case '-':
723       return '~';
724     case '#':
725       return '=';
726     }
727   phase1_ungetc (c);
728   phase1_ungetc ('?');
729   return '?';
730 }
731 
732 
733 /* Supports only one pushback character.  */
734 static void
phase2_ungetc(int c)735 phase2_ungetc (int c)
736 {
737   if (c != EOF)
738     {
739       if (phase2_pushback_length == SIZEOF (phase2_pushback))
740         abort ();
741       phase2_pushback[phase2_pushback_length++] = c;
742     }
743 }
744 
745 
/* 3. Concatenate each line ending in backslash (\) with the following
   line.  Basically, all you need to do is elide "\\\n" sequences from
   the input.  */

/* Characters pushed back by phase3_ungetc, used as a stack.  */
static unsigned char phase3_pushback[9];
static int phase3_pushback_length;


/* Return the next character from phase 2, skipping every backslash that
   is immediately followed by a newline, together with that newline.  */
static int
phase3_getc ()
{
  int c;

  if (phase3_pushback_length)
    return phase3_pushback[--phase3_pushback_length];

  do
    {
      c = phase2_getc ();
      if (c != '\\')
        return c;
      /* A backslash: it is elided only if a newline follows.  */
      c = phase2_getc ();
    }
  while (c == '\n');

  /* The backslash stands for itself; keep its successor for later.  */
  phase2_ungetc (c);
  return '\\';
}
772 
773 
774 /* Supports 9 characters of pushback.  */
775 static void
phase3_ungetc(int c)776 phase3_ungetc (int c)
777 {
778   if (c != EOF)
779     {
780       if (phase3_pushback_length == SIZEOF (phase3_pushback))
781         abort ();
782       phase3_pushback[phase3_pushback_length++] = c;
783     }
784 }
785 
786 
/* Accumulating comments.  */

/* Growable buffer holding the comment line being accumulated:
   bufmax is the allocated size, buflen the number of bytes in use.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;

/* Begin a new comment line by discarding the buffer contents.  The
   allocation is kept for reuse.  */
static inline void
comment_start ()
{
  buflen = 0;
}
798 
799 static inline void
comment_add(int c)800 comment_add (int c)
801 {
802   if (buflen >= bufmax)
803     {
804       bufmax = 2 * bufmax + 10;
805       buffer = xrealloc (buffer, bufmax);
806     }
807   buffer[buflen++] = c;
808 }
809 
810 static inline void
comment_line_end(size_t chars_to_remove)811 comment_line_end (size_t chars_to_remove)
812 {
813   buflen -= chars_to_remove;
814   while (buflen >= 1
815          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
816     --buflen;
817   if (chars_to_remove == 0 && buflen >= bufmax)
818     {
819       bufmax = 2 * bufmax + 10;
820       buffer = xrealloc (buffer, bufmax);
821     }
822   buffer[buflen] = '\0';
823   savable_comment_add (buffer);
824 }
825 
826 
827 /* These are for tracking whether comments count as immediately before
828    keyword.  */
829 static int last_comment_line;
830 static int last_non_comment_line;
831 static int newline_count;
832 
833 
834 /* 4. Replace each comment that is not inside a character constant or
835    string literal with a space character.  We need to remember the
836    comment for later, because it may be attached to a keyword string.
837    We also optionally understand C++ comments.  */
838 
839 static int
phase4_getc()840 phase4_getc ()
841 {
842   int c;
843   bool last_was_star;
844 
845   c = phase3_getc ();
846   if (c != '/')
847     return c;
848   c = phase3_getc ();
849   switch (c)
850     {
851     default:
852       phase3_ungetc (c);
853       return '/';
854 
855     case '*':
856       /* C comment.  */
857       comment_start ();
858       last_was_star = false;
859       for (;;)
860         {
861           c = phase3_getc ();
862           if (c == EOF)
863             break;
864           /* We skip all leading white space, but not EOLs.  */
865           if (!(buflen == 0 && (c == ' ' || c == '\t')))
866             comment_add (c);
867           switch (c)
868             {
869             case '\n':
870               comment_line_end (1);
871               comment_start ();
872               last_was_star = false;
873               continue;
874 
875             case '*':
876               last_was_star = true;
877               continue;
878 
879             case '/':
880               if (last_was_star)
881                 {
882                   comment_line_end (2);
883                   break;
884                 }
885               /* FALLTHROUGH */
886 
887             default:
888               last_was_star = false;
889               continue;
890             }
891           break;
892         }
893       last_comment_line = newline_count;
894       return ' ';
895 
896     case '/':
897       /* C++ or ISO C 99 comment.  */
898       comment_start ();
899       for (;;)
900         {
901           c = phase3_getc ();
902           if (c == '\n' || c == EOF)
903             break;
904           /* We skip all leading white space, but not EOLs.  */
905           if (!(buflen == 0 && (c == ' ' || c == '\t')))
906             comment_add (c);
907         }
908       comment_line_end (0);
909       last_comment_line = newline_count;
910       return '\n';
911     }
912 }
913 
914 
/* Push C back so that the next phase4_getc call reads it again.  This
   simply forwards to the phase 3 pushback.
   Supports only one pushback character.  */
static void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
921 
922 
923 /* ========================== Reading of tokens.  ========================== */
924 
925 
/* True if ObjectiveC extensions are recognized.  When set, phase 5 turns
   '@' into a token of its own instead of a plain symbol.  */
static bool objc_extensions = false;

/* True if C++ extensions are recognized.  When set, phase 5 understands
   raw string literals, and accepts underscores and digit-separating
   single-quotes inside preprocessing numbers.  */
static bool cxx_extensions = false;
931 
/* The kinds of tokens that phase 5 produces.  */
typedef enum token_type_ty
{
  token_type_character_constant,        /* 'x' */
  token_type_eof,                       /* end of input */
  token_type_eoln,                      /* end of line */
  token_type_hash,                      /* # */
  token_type_lparen,                    /* ( */
  token_type_rparen,                    /* ) */
  token_type_comma,                     /* , */
  token_type_colon,                     /* : */
  token_type_name,                      /* abc */
  token_type_number,                    /* 2.7 */
  token_type_string_literal,            /* "abc" */
  token_type_symbol,                    /* < > = etc. */
  token_type_objc_special,              /* @ */
  token_type_white_space                /* run of spaces, tabs, form feeds */
} token_type_ty;
950 
951 typedef struct token_ty token_ty;
952 struct token_ty
953 {
954   token_type_ty type;
955   char *string;                         /* for token_type_name */
956   mixed_string_ty *mixed_string;        /* for token_type_string_literal */
957   refcounted_string_list_ty *comment;   /* for token_type_string_literal,
958                                            token_type_objc_special */
959   long number;
960   int line_number;
961 };
962 
963 
/* 7. Replace escape sequences within character strings with their
   single character equivalents.  This is called from phase 5, because
   we don't have to worry about the #include argument.  There are
   pathological cases which could bite us (like the DOS directory
   separator), but just pretend it can't happen.  */

/* Return value of phase7_getc when EOF is reached.  */
#define P7_EOF (-1)
#define P7_STRING_END (-2)

/* Return values of phase7_getc for the characters that can terminate a
   string literal or character constant when they are not escaped: the
   double-quote, the single-quote, and the newline.  */
#define P7_QUOTES (-3)
#define P7_QUOTE (-4)
#define P7_NEWLINE (-5)

/* Convert an UTF-16 or UTF-32 code point to a return value that can be
   distinguished from a single-byte return value.  */
#define UNICODE(code) (0x100 + (code))

/* Test a return value of phase7_getc whether it designates an UTF-16 or
   UTF-32 code point.  */
#define IS_UNICODE(p7_result) ((p7_result) >= UNICODE (0))

/* Extract the UTF-16 or UTF-32 code of a return value that satisfies
   IS_UNICODE.  */
#define UNICODE_VALUE(p7_result) ((p7_result) - UNICODE (0))
991 
992 
993 static int
phase7_getc()994 phase7_getc ()
995 {
996   int c, n, j;
997 
998   /* Use phase 3, because phase 4 elides comments.  */
999   c = phase3_getc ();
1000 
1001   /* Return a magic newline indicator, so that we can distinguish
1002      between the user requesting a newline in the string (e.g. using
1003      "\n" or "\012") from the user failing to terminate the string or
1004      character constant.  The ANSI C standard says: 3.1.3.4 Character
1005      Constants contain "any character except single quote, backslash or
1006      newline; or an escape sequence" and 3.1.4 String Literals contain
1007      "any character except double quote, backslash or newline; or an
1008      escape sequence".
1009 
1010      Most compilers give a fatal error in this case, however gcc is
1011      stupidly silent, even though this is a very common typo.  OK, so
1012      "gcc --pedantic" will tell me, but that gripes about too much other
1013      stuff.  Could I have a "gcc -Wnewline-in-string" option, or
1014      better yet a "gcc -fno-newline-in-string" option, please?  Gcc is
1015      also inconsistent between string literals and character constants:
1016      you may not embed newlines in character constants; try it, you get
1017      a useful diagnostic.  --PMiller  */
1018   if (c == '\n')
1019     return P7_NEWLINE;
1020 
1021   if (c == '"')
1022     return P7_QUOTES;
1023   if (c == '\'')
1024     return P7_QUOTE;
1025   if (c != '\\')
1026     return c;
1027   c = phase3_getc ();
1028   switch (c)
1029     {
1030     default:
1031       /* Invalid escape sequences generate a GCC warning, and GCC transforms
1032          \c to the character c.  So let's do the same.  */
1033     case '"':
1034     case '\'':
1035     case '?':
1036     case '\\':
1037       return c;
1038 
1039     case 'a':
1040       return '\a';
1041     case 'b':
1042       return '\b';
1043 
1044       /* The \e escape is preculiar to gcc, and assumes an ASCII
1045          character set (or superset).  We don't provide support for it
1046          here.  */
1047 
1048     case 'f':
1049       return '\f';
1050     case 'n':
1051       return '\n';
1052     case 'r':
1053       return '\r';
1054     case 't':
1055       return '\t';
1056     case 'v':
1057       return '\v';
1058 
1059     case 'x':
1060       c = phase3_getc ();
1061       switch (c)
1062         {
1063         default:
1064           phase3_ungetc (c);
1065           phase3_ungetc ('x');
1066           return '\\';
1067 
1068         case '0': case '1': case '2': case '3': case '4':
1069         case '5': case '6': case '7': case '8': case '9':
1070         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1071         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1072           break;
1073         }
1074       n = 0;
1075       for (;;)
1076         {
1077           switch (c)
1078             {
1079             default:
1080               phase3_ungetc (c);
1081               return n;
1082 
1083             case '0': case '1': case '2': case '3': case '4':
1084             case '5': case '6': case '7': case '8': case '9':
1085               n = n * 16 + c - '0';
1086               break;
1087 
1088             case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1089               n = n * 16 + 10 + c - 'A';
1090               break;
1091 
1092             case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1093               n = n * 16 + 10 + c - 'a';
1094               break;
1095             }
1096           c = phase3_getc ();
1097         }
1098       return n;
1099 
1100     case '0': case '1': case '2': case '3':
1101     case '4': case '5': case '6': case '7':
1102       n = 0;
1103       for (j = 0; j < 3; ++j)
1104         {
1105           n = n * 8 + c - '0';
1106           c = phase3_getc ();
1107           switch (c)
1108             {
1109             default:
1110               break;
1111 
1112             case '0': case '1': case '2': case '3':
1113             case '4': case '5': case '6': case '7':
1114               continue;
1115             }
1116           break;
1117         }
1118       phase3_ungetc (c);
1119       return n;
1120 
1121     case 'U': case 'u':
1122       {
1123         unsigned char buf[8];
1124 
1125         n = 0;
1126         for (j = 0; j < (c == 'u' ? 4 : 8); j++)
1127           {
1128             int c1 = phase3_getc ();
1129 
1130             if (c1 >= '0' && c1 <= '9')
1131               n = (n << 4) + (c1 - '0');
1132             else if (c1 >= 'A' && c1 <= 'F')
1133               n = (n << 4) + (c1 - 'A' + 10);
1134             else if (c1 >= 'a' && c1 <= 'f')
1135               n = (n << 4) + (c1 - 'a' + 10);
1136             else
1137               {
1138                 phase3_ungetc (c1);
1139                 while (--j >= 0)
1140                   phase3_ungetc (buf[j]);
1141                 phase3_ungetc (c);
1142                 return '\\';
1143               }
1144 
1145             buf[j] = c1;
1146           }
1147 
1148         if (n < 0x110000)
1149           return UNICODE (n);
1150 
1151         error_with_progname = false;
1152         error (0, 0, _("%s:%d: warning: invalid Unicode character"),
1153                logical_file_name, line_number);
1154         error_with_progname = true;
1155 
1156         while (--j >= 0)
1157           phase3_ungetc (buf[j]);
1158         phase3_ungetc (c);
1159         return '\\';
1160       }
1161     }
1162 }
1163 
1164 
/* Push the character C back so that the next phase7_getc call reads it
   again.  phase7_getc reads from phase 3, so this forwards to the phase 3
   pushback.  */
static void
phase7_ungetc (int c)
{
  phase3_ungetc (c);
}
1170 
1171 
1172 /* Free the memory pointed to by a 'struct token_ty'.  */
1173 static inline void
free_token(token_ty * tp)1174 free_token (token_ty *tp)
1175 {
1176   if (tp->type == token_type_name)
1177     free (tp->string);
1178   if (tp->type == token_type_string_literal)
1179     mixed_string_free (tp->mixed_string);
1180   if (tp->type == token_type_string_literal
1181       || tp->type == token_type_objc_special)
1182     drop_reference (tp->comment);
1183 }
1184 
1185 
1186 /* 5. Parse each resulting logical line as preprocessing tokens and
1187    white space.  Preprocessing tokens and C tokens don't always match.  */
1188 
1189 static token_ty phase5_pushback[1];
1190 static int phase5_pushback_length;
1191 
1192 
1193 static void
phase5_get(token_ty * tp)1194 phase5_get (token_ty *tp)
1195 {
1196   static char *buffer;
1197   static int bufmax;
1198   int bufpos;
1199   int c;
1200 
1201   if (phase5_pushback_length)
1202     {
1203       *tp = phase5_pushback[--phase5_pushback_length];
1204       return;
1205     }
1206   tp->string = NULL;
1207   tp->number = 0;
1208   tp->line_number = line_number;
1209   c = phase4_getc ();
1210   switch (c)
1211     {
1212     case EOF:
1213       tp->type = token_type_eof;
1214       return;
1215 
1216     case '\n':
1217       tp->type = token_type_eoln;
1218       return;
1219 
1220     case ' ':
1221     case '\f':
1222     case '\t':
1223       for (;;)
1224         {
1225           c = phase4_getc ();
1226           switch (c)
1227             {
1228             case ' ':
1229             case '\f':
1230             case '\t':
1231               continue;
1232 
1233             default:
1234               phase4_ungetc (c);
1235               break;
1236             }
1237           break;
1238         }
1239       tp->type = token_type_white_space;
1240       return;
1241 
1242     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
1243     case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
1244     case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
1245     case 'V': case 'W': case 'X': case 'Y': case 'Z':
1246     case '_':
1247     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
1248     case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
1249     case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
1250     case 'v': case 'w': case 'x': case 'y': case 'z':
1251       bufpos = 0;
1252       for (;;)
1253         {
1254           if (bufpos >= bufmax)
1255             {
1256               bufmax = 2 * bufmax + 10;
1257               buffer = xrealloc (buffer, bufmax);
1258             }
1259           buffer[bufpos++] = c;
1260           c = phase4_getc ();
1261           switch (c)
1262             {
1263             case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1264             case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1265             case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1266             case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1267             case 'Y': case 'Z':
1268             case '_':
1269             case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1270             case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1271             case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1272             case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1273             case 'y': case 'z':
1274             case '0': case '1': case '2': case '3': case '4':
1275             case '5': case '6': case '7': case '8': case '9':
1276               continue;
1277 
1278             case '"':
1279               /* Recognize C11 / C++11 string literals.
1280                  See (for C) ISO 9899:2011 section 6.4.5
1281                  and (for C++) ISO C++ 11 section 2.14.5 [lex.string].
1282                  Note: The programmer who passes an UTF-8 encoded string to
1283                  gettext() or similar API functions will have to have called
1284                  bind_textdomain_codeset (DOMAIN, "UTF-8") first.  */
1285               if (bufpos == 2 && buffer[0] == 'u' && buffer[1] == '8')
1286                 goto string_literal;
1287               /* Recognize C++11 raw string literals.
1288                  See ISO C++ 11 section 2.14.5 [lex.string].
1289                  Here it is important to properly parse all cases according to
1290                  the standard, otherwise our parser could get confused by
1291                  double-quotes inside the raw string.
1292                  Note: The programmer who passes an UTF-8 encoded string to
1293                  gettext() or similar API functions will have to have called
1294                  bind_textdomain_codeset (DOMAIN, "UTF-8") first.  */
1295               if (cxx_extensions
1296                   && (bufpos == 1
1297                       || (bufpos == 2
1298                           && (buffer[0] == 'u' || buffer[0] == 'U'
1299                               || buffer[0] == 'L'))
1300                       || (bufpos == 3 && buffer[0] == 'u' && buffer[1] == '8'))
1301                   && buffer[bufpos - 1] == 'R')
1302                 {
1303                   /* Only R and u8R raw strings can be used as gettext()
1304                      arguments, for type reasons.  */
1305                   const bool relevant = (bufpos != 2);
1306                   int starting_line_number = line_number;
1307                   bufpos = 0;
1308                   /* Start the buffer with a closing parenthesis.  This makes the
1309                      parsing code below simpler.  */
1310                   buffer[bufpos++] = ')';
1311                   /* Parse the initial delimiter.  */
1312                   for (;;)
1313                     {
1314                       bool valid_delimiter_char;
1315 
1316                       c = phase3_getc ();
1317                       switch (c)
1318                         {
1319                         case 'A': case 'B': case 'C': case 'D': case 'E':
1320                         case 'F': case 'G': case 'H': case 'I': case 'J':
1321                         case 'K': case 'L': case 'M': case 'N': case 'O':
1322                         case 'P': case 'Q': case 'R': case 'S': case 'T':
1323                         case 'U': case 'V': case 'W': case 'X': case 'Y':
1324                         case 'Z':
1325                         case 'a': case 'b': case 'c': case 'd': case 'e':
1326                         case 'f': case 'g': case 'h': case 'i': case 'j':
1327                         case 'k': case 'l': case 'm': case 'n': case 'o':
1328                         case 'p': case 'q': case 'r': case 's': case 't':
1329                         case 'u': case 'v': case 'w': case 'x': case 'y':
1330                         case 'z':
1331                         case '0': case '1': case '2': case '3': case '4':
1332                         case '5': case '6': case '7': case '8': case '9':
1333                         case '_': case '{': case '}': case '[': case ']':
1334                         case '#': case '<': case '>': case '%': case ':':
1335                         case ';': case '.': case '?': case '*': case '+':
1336                         case '-': case '/': case '^': case '&': case '|':
1337                         case '~': case '!': case '=': case ',': case '\'':
1338                           valid_delimiter_char = true;
1339                           break;
1340                         case '"':
1341                           /* A double-quote within the delimiter! This is too
1342                              weird.  We don't support this.  */
1343                           error_with_progname = false;
1344                           error (0, 0, _("%s:%d: warning: a double-quote in the delimiter of a raw string literal is unsupported"),
1345                                  logical_file_name, starting_line_number);
1346                           error_with_progname = true;
1347                           /* FALLTHROUGH */
1348                         default:
1349                           valid_delimiter_char = false;
1350                           break;
1351                         }
1352                       if (!valid_delimiter_char)
1353                         break;
1354 
1355                       if (bufpos >= bufmax)
1356                         {
1357                           bufmax = 2 * bufmax + 10;
1358                           buffer = xrealloc (buffer, bufmax);
1359                         }
1360                       buffer[bufpos++] = c;
1361                     }
1362                   if (c == '(')
1363                     {
1364                       struct mixed_string_buffer msb;
1365                       /* The state is either 0 or
1366                          N, after a ')' and N-1 bytes of the delimiter have been
1367                          encountered.  */
1368                       int state;
1369 
1370                       /* Start accumulating the string.  */
1371                       if (relevant)
1372                         mixed_string_buffer_init (&msb, lc_string,
1373                                                   logical_file_name,
1374                                                   line_number);
1375                       state = 0;
1376 
1377                       for (;;)
1378                         {
1379                           c = phase3_getc ();
1380 
1381                           /* Keep line_number in sync.  */
1382                           if (relevant)
1383                             msb.line_number = line_number;
1384 
1385                           if (c == EOF)
1386                             break;
1387 
1388                           /* Update the state.  */
1389                           if (c == (state < bufpos ? buffer[state] : '"'))
1390                             {
1391                               if (state < bufpos)
1392                                 state++;
1393                               else /* state == bufpos && c == '"' */
1394                                 {
1395                                   /* Finished parsing the string.  */
1396                                   if (relevant)
1397                                     {
1398                                       tp->type = token_type_string_literal;
1399                                       tp->mixed_string = mixed_string_buffer_result (&msb);
1400                                       tp->comment = add_reference (savable_comment);
1401                                     }
1402                                   else
1403                                     tp->type = token_type_symbol;
1404                                   return;
1405                                 }
1406                             }
1407                           else
1408                             {
1409                               int i;
1410 
1411                               /* None of the bytes buffer[0]...buffer[state-1]
1412                                  can be ')'.  */
1413                               if (relevant)
1414                                 for (i = 0; i < state; i++)
1415                                   mixed_string_buffer_append_char (&msb, buffer[i]);
1416 
1417                               /* But c may be ')'.  */
1418                               if (c == ')')
1419                                 state = 1;
1420                               else
1421                                 {
1422                                   if (relevant)
1423                                     mixed_string_buffer_append_char (&msb, c);
1424                                   state = 0;
1425                                 }
1426                             }
1427                         }
1428                     }
1429                   if (c == EOF)
1430                     {
1431                       error_with_progname = false;
1432                       error (0, 0, _("%s:%d: warning: unterminated raw string literal"),
1433                              logical_file_name, starting_line_number);
1434                       error_with_progname = true;
1435                       tp->type = token_type_eof;
1436                       return;
1437                     }
1438                   /* The error message for c == '"' was already emitted above.  */
1439                   if (c != '"')
1440                     {
1441                       error_with_progname = false;
1442                       error (0, 0, _("%s:%d: warning: invalid raw string literal syntax"),
1443                              logical_file_name, starting_line_number);
1444                       error_with_progname = true;
1445                     }
1446                   /* To get into a sane state, read up until the next double-quote,
1447                      newline, or EOF.  */
1448                   while (!(c == EOF || c == '"' || c == '\n'))
1449                     c = phase3_getc ();
1450                   tp->type = token_type_symbol;
1451                   return;
1452                 }
1453               /* FALLTHROUGH */
1454 
1455             default:
1456               phase4_ungetc (c);
1457               break;
1458             }
1459           break;
1460         }
1461       if (bufpos >= bufmax)
1462         {
1463           bufmax = 2 * bufmax + 10;
1464           buffer = xrealloc (buffer, bufmax);
1465         }
1466       buffer[bufpos] = 0;
1467       tp->string = xstrdup (buffer);
1468       tp->type = token_type_name;
1469       return;
1470 
1471     case '.':
1472       c = phase4_getc ();
1473       phase4_ungetc (c);
1474       switch (c)
1475         {
1476         default:
1477           tp->type = token_type_symbol;
1478           return;
1479 
1480         case '0': case '1': case '2': case '3': case '4':
1481         case '5': case '6': case '7': case '8': case '9':
1482           c = '.';
1483           break;
1484         }
1485       /* FALLTHROUGH */
1486 
1487     case '0': case '1': case '2': case '3': case '4':
1488     case '5': case '6': case '7': case '8': case '9':
1489       /* The preprocessing number token is more "generous" than the C
1490          number tokens.  This is mostly due to token pasting (another
1491          thing we can ignore here).  */
1492       bufpos = 0;
1493       for (;;)
1494         {
1495           if (bufpos >= bufmax)
1496             {
1497               bufmax = 2 * bufmax + 10;
1498               buffer = xrealloc (buffer, bufmax);
1499             }
1500           buffer[bufpos++] = c;
1501           c = phase4_getc ();
1502           switch (c)
1503             {
1504             case 'p':
1505             case 'P':
1506               /* In C99 and C++17, 'p' and 'P' can be used as an exponent
1507                  marker.  */
1508               /* FALLTHROUGH */
1509             case 'e':
1510             case 'E':
1511               if (bufpos >= bufmax)
1512                 {
1513                   bufmax = 2 * bufmax + 10;
1514                   buffer = xrealloc (buffer, bufmax);
1515                 }
1516               buffer[bufpos++] = c;
1517               c = phase4_getc ();
1518               if (c != '+' && c != '-')
1519                 {
1520                   phase4_ungetc (c);
1521                   break;
1522                 }
1523               continue;
1524 
1525             case 'A': case 'B': case 'C': case 'D':           case 'F':
1526             case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1527             case 'M': case 'N': case 'O':           case 'Q': case 'R':
1528             case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1529             case 'Y': case 'Z':
1530             case 'a': case 'b': case 'c': case 'd':           case 'f':
1531             case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1532             case 'm': case 'n': case 'o':           case 'q': case 'r':
1533             case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1534             case 'y': case 'z':
1535             case '0': case '1': case '2': case '3': case '4':
1536             case '5': case '6': case '7': case '8': case '9':
1537             case '.':
1538               continue;
1539 
1540             case '_':
1541               if (cxx_extensions)
1542                 /* In C++, an underscore can be part of a preprocessing number
1543                    token.  */
1544                 continue;
1545               else
1546                 {
1547                   phase4_ungetc (c);
1548                   break;
1549                 }
1550 
1551             case '\'':
1552               if (cxx_extensions)
1553                 {
1554                   /* In C++14, a single-quote followed by a digit, ASCII letter,
1555                      or underscore can be part of a preprocessing number token.  */
1556                   int c1 = phase4_getc ();
1557                   switch (c1)
1558                     {
1559                     case '0': case '1': case '2': case '3': case '4':
1560                     case '5': case '6': case '7': case '8': case '9':
1561                     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1562                     case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1563                     case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1564                     case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1565                     case 'Y': case 'Z':
1566                     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1567                     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1568                     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1569                     case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1570                     case 'y': case 'z':
1571                     case '_':
1572                       if (bufpos >= bufmax)
1573                         {
1574                           bufmax = 2 * bufmax + 10;
1575                           buffer = xrealloc (buffer, bufmax);
1576                         }
1577                       buffer[bufpos++] = c;
1578                       c = c1;
1579                       continue;
1580                     default:
1581                       /* The two phase4_getc() calls that returned c and c1 did
1582                          nothing more than to call phase3_getc(), without any
1583                          lookahead.  Therefore 2 pushback characters are
1584                          supported in this case.  */
1585                       phase4_ungetc (c1);
1586                       break;
1587                     }
1588                 }
1589               /* FALLTHROUGH */
1590             default:
1591               phase4_ungetc (c);
1592               break;
1593             }
1594           break;
1595         }
1596       if (bufpos >= bufmax)
1597         {
1598           bufmax = 2 * bufmax + 10;
1599           buffer = xrealloc (buffer, bufmax);
1600         }
1601       buffer[bufpos] = 0;
1602       tp->type = token_type_number;
1603       tp->number = atol (buffer);
1604       return;
1605 
1606     case '\'':
1607       /* We could worry about the 'L' before wide character constants,
1608          but ignoring it has no effect unless one of the keywords is
1609          "L".  Just pretend it won't happen.  Also, we don't need to
1610          remember the character constant.  */
1611       for (;;)
1612         {
1613           c = phase7_getc ();
1614           if (c == P7_NEWLINE)
1615             {
1616               error_with_progname = false;
1617               error (0, 0, _("%s:%d: warning: unterminated character constant"),
1618                      logical_file_name, line_number - 1);
1619               error_with_progname = true;
1620               phase7_ungetc ('\n');
1621               break;
1622             }
1623           if (c == EOF || c == P7_QUOTE)
1624             break;
1625         }
1626       tp->type = token_type_character_constant;
1627       return;
1628 
1629     case '"':
1630     string_literal:
1631       /* We could worry about the 'L' or 'u' or 'U' before wide string
1632          constants, but since gettext's argument is a 'const char *', not
1633          a 'const wchar_t *' (for 'L') nor a 'const char16_t *' (for 'u')
1634          nor a 'const char32_t *' (for 'U'), the compiler would complain
1635          about the argument not matching the prototype.  Just pretend it
1636          won't happen.  */
1637       {
1638         struct mixed_string_buffer msb;
1639 
1640         /* Start accumulating the string.  */
1641         mixed_string_buffer_init (&msb, lc_string,
1642                                   logical_file_name, line_number);
1643 
1644         for (;;)
1645           {
1646             c = phase7_getc ();
1647 
1648             /* Keep line_number in sync.  */
1649             msb.line_number = line_number;
1650 
1651             if (c == P7_NEWLINE)
1652               {
1653                 error_with_progname = false;
1654                 error (0, 0, _("%s:%d: warning: unterminated string literal"),
1655                        logical_file_name, line_number - 1);
1656                 error_with_progname = true;
1657                 phase7_ungetc ('\n');
1658                 break;
1659               }
1660             if (c == EOF || c == P7_QUOTES)
1661               break;
1662             if (c == P7_QUOTE)
1663               c = '\'';
1664             if (IS_UNICODE (c))
1665               {
1666                 assert (UNICODE_VALUE (c) >= 0
1667                         && UNICODE_VALUE (c) < 0x110000);
1668                 mixed_string_buffer_append_unicode (&msb, UNICODE_VALUE (c));
1669               }
1670             else
1671               mixed_string_buffer_append_char (&msb, c);
1672           }
1673         tp->type = token_type_string_literal;
1674         tp->mixed_string = mixed_string_buffer_result (&msb);
1675         tp->comment = add_reference (savable_comment);
1676         return;
1677       }
1678 
1679     case '(':
1680       tp->type = token_type_lparen;
1681       return;
1682 
1683     case ')':
1684       tp->type = token_type_rparen;
1685       return;
1686 
1687     case ',':
1688       tp->type = token_type_comma;
1689       return;
1690 
1691     case '#':
1692       tp->type = token_type_hash;
1693       return;
1694 
1695     case ':':
1696       tp->type = token_type_colon;
1697       return;
1698 
1699     case '@':
1700       if (objc_extensions)
1701         {
1702           tp->type = token_type_objc_special;
1703           tp->comment = add_reference (savable_comment);
1704           return;
1705         }
1706       /* FALLTHROUGH */
1707 
1708     default:
1709       /* We could carefully recognize each of the 2 and 3 character
1710          operators, but it is not necessary, as we only need to recognize
1711          gettext invocations.  Don't bother.  */
1712       tp->type = token_type_symbol;
1713       return;
1714     }
1715 }
1716 
1717 
1718 /* Supports only one pushback token.  */
1719 static void
phase5_unget(token_ty * tp)1720 phase5_unget (token_ty *tp)
1721 {
1722   if (tp->type != token_type_eof)
1723     {
1724       if (phase5_pushback_length == SIZEOF (phase5_pushback))
1725         abort ();
1726       phase5_pushback[phase5_pushback_length++] = *tp;
1727     }
1728 }
1729 
1730 
1731 /* X. Recognize a leading # symbol.  Leave leading hash as a hash, but
1732    turn hash in the middle of a line into a plain symbol token.  This
1733    makes the phase 6 easier.  */
1734 
1735 static void
phaseX_get(token_ty * tp)1736 phaseX_get (token_ty *tp)
1737 {
1738   static bool middle;   /* false at the beginning of a line, true otherwise.  */
1739 
1740   phase5_get (tp);
1741 
1742   if (tp->type == token_type_eoln || tp->type == token_type_eof)
1743     middle = false;
1744   else
1745     {
1746       if (middle)
1747         {
1748           /* Turn hash in the middle of a line into a plain symbol token.  */
1749           if (tp->type == token_type_hash)
1750             tp->type = token_type_symbol;
1751         }
1752       else
1753         {
1754           /* When we see leading whitespace followed by a hash sign,
1755              discard the leading white space token.  The hash is all
1756              phase 6 is interested in.  */
1757           if (tp->type == token_type_white_space)
1758             {
1759               token_ty next;
1760 
1761               phase5_get (&next);
1762               if (next.type == token_type_hash)
1763                 *tp = next;
1764               else
1765                 phase5_unget (&next);
1766             }
1767           middle = true;
1768         }
1769     }
1770 }
1771 
1772 
/* 6. Recognize and carry out directives (it also expands macros on
   non-directive lines, which we do not do here).  The only directives
   we care about are the #line and #define directives.  We throw all the
   others away.  */
1777 
/* Stack of tokens pushed back by phase6_unget and popped again, last one
   first, by phase6_get.  phase6_pushback_length is the number of entries
   currently in use.  */
static token_ty phase6_pushback[2];
static int phase6_pushback_length;
1780 
1781 
1782 static void
phase6_get(token_ty * tp)1783 phase6_get (token_ty *tp)
1784 {
1785   static token_ty *buf;
1786   static int bufmax;
1787   int bufpos;
1788   int j;
1789 
1790   if (phase6_pushback_length)
1791     {
1792       *tp = phase6_pushback[--phase6_pushback_length];
1793       return;
1794     }
1795   for (;;)
1796     {
1797       /* Get the next token.  If it is not a '#' at the beginning of a
1798          line (ignoring whitespace), return immediately.  */
1799       phaseX_get (tp);
1800       if (tp->type != token_type_hash)
1801         return;
1802 
1803       /* Accumulate the rest of the directive in a buffer, until the
1804          "define" keyword is seen or until end of line.  */
1805       bufpos = 0;
1806       for (;;)
1807         {
1808           phaseX_get (tp);
1809           if (tp->type == token_type_eoln || tp->type == token_type_eof)
1810             break;
1811 
1812           /* Before the "define" keyword and inside other directives
1813              white space is irrelevant.  So just throw it away.  */
1814           if (tp->type != token_type_white_space)
1815             {
1816               /* If it is a #define directive, return immediately,
1817                  thus treating the body of the #define directive like
1818                  normal input.  */
1819               if (bufpos == 0
1820                   && tp->type == token_type_name
1821                   && strcmp (tp->string, "define") == 0)
1822                 return;
1823 
1824               /* Accumulate.  */
1825               if (bufpos >= bufmax)
1826                 {
1827                   bufmax = 2 * bufmax + 10;
1828                   buf = xrealloc (buf, bufmax * sizeof (buf[0]));
1829                 }
1830               buf[bufpos++] = *tp;
1831             }
1832         }
1833 
1834       /* If it is a #line directive, with no macros to expand, act on
1835          it.  Ignore all other directives.  */
1836       if (bufpos >= 3 && buf[0].type == token_type_name
1837           && strcmp (buf[0].string, "line") == 0
1838           && buf[1].type == token_type_number
1839           && buf[2].type == token_type_string_literal)
1840         {
1841           logical_file_name = mixed_string_contents (buf[2].mixed_string);
1842           line_number = buf[1].number;
1843         }
1844       if (bufpos >= 2 && buf[0].type == token_type_number
1845           && buf[1].type == token_type_string_literal)
1846         {
1847           logical_file_name = mixed_string_contents (buf[1].mixed_string);
1848           line_number = buf[0].number;
1849         }
1850 
1851       /* Release the storage held by the directive.  */
1852       for (j = 0; j < bufpos; ++j)
1853         free_token (&buf[j]);
1854 
1855       /* We must reset the selected comments.  */
1856       savable_comment_reset ();
1857     }
1858 }
1859 
1860 
1861 /* Supports 2 tokens of pushback.  */
1862 static void
phase6_unget(token_ty * tp)1863 phase6_unget (token_ty *tp)
1864 {
1865   if (tp->type != token_type_eof)
1866     {
1867       if (phase6_pushback_length == SIZEOF (phase6_pushback))
1868         abort ();
1869       phase6_pushback[phase6_pushback_length++] = *tp;
1870     }
1871 }
1872 
1873 
1874 /* 8a. Convert ISO C 99 section 7.8.1 format string directives to string
1875    literal placeholders.  */
1876 
/* Return true if NAME is the name of an ISO C 99 section 7.8.1 format
   string directive macro, false otherwise.  The accepted syntax is
     P R I { d | i | o | u | x | X }
     { { | LEAST | FAST } { 8 | 16 | 32 | 64 } | MAX | PTR }
   and the whole of NAME has to match.  */
static bool
is_inttypes_macro (const char *name)
{
  static const char *const widths[] = { "8", "16", "32", "64" };
  const char *rest;
  size_t i;

  if (strncmp (name, "PRI", 3) != 0)
    return false;

  /* The conversion letter.  The explicit test for NUL is needed because
     strchr would also find the terminator of the letter list.  */
  if (name[3] == '\0' || strchr ("diouxX", name[3]) == NULL)
    return false;
  rest = name + 4;

  if (strcmp (rest, "MAX") == 0 || strcmp (rest, "PTR") == 0)
    return true;

  /* Optional LEAST or FAST, then a mandatory width.  */
  if (strncmp (rest, "LEAST", 5) == 0)
    rest += 5;
  else if (strncmp (rest, "FAST", 4) == 0)
    rest += 4;

  for (i = 0; i < sizeof (widths) / sizeof (widths[0]); i++)
    if (strcmp (rest, widths[i]) == 0)
      return true;

  return false;
}
1915 
1916 static void
phase8a_get(token_ty * tp)1917 phase8a_get (token_ty *tp)
1918 {
1919   phase6_get (tp);
1920   if (tp->type == token_type_name && is_inttypes_macro (tp->string))
1921     {
1922       /* Turn PRIdXXX into "<PRIdXXX>".  */
1923       char *new_string = xasprintf ("<%s>", tp->string);
1924       free (tp->string);
1925       tp->mixed_string =
1926         mixed_string_alloc_utf8 (new_string, lc_string,
1927                                  logical_file_name, line_number);
1928       tp->comment = add_reference (savable_comment);
1929       tp->type = token_type_string_literal;
1930     }
1931 }
1932 
/* Push *TP back for phase 8a.  Supports 2 tokens of pushback.
   This simply forwards to phase6_unget, so the token is stored on the
   phase 6 pushback stack and is not run through the conversion of
   phase8a_get a second time.  */
static inline void
phase8a_unget (token_ty *tp)
{
  phase6_unget (tp);
}
1939 
1940 
1941 /* 8b. Drop whitespace.  */
1942 static void
phase8b_get(token_ty * tp)1943 phase8b_get (token_ty *tp)
1944 {
1945   for (;;)
1946     {
1947       phase8a_get (tp);
1948 
1949       if (tp->type == token_type_white_space)
1950         continue;
1951       if (tp->type == token_type_eoln)
1952         {
1953           /* We have to track the last occurrence of a string.  One
1954              mode of xgettext allows to group an extracted message
1955              with a comment for documentation.  The rule which states
1956              which comment is assumed to be grouped with the message
1957              says it should immediately precede it.  Our
1958              interpretation: between the last line of the comment and
1959              the line in which the keyword is found must be no line
1960              with non-white space tokens.  */
1961           ++newline_count;
1962           if (last_non_comment_line > last_comment_line)
1963             savable_comment_reset ();
1964           continue;
1965         }
1966       break;
1967     }
1968 }
1969 
/* Push *TP back for phase 8b.  Supports 2 tokens of pushback.
   This simply forwards to phase8a_unget.  */
static inline void
phase8b_unget (token_ty *tp)
{
  phase8a_unget (tp);
}
1976 
1977 
1978 /* 8c. In ObjectiveC mode, drop '@' before a literal string.  We need to
1979    do this before performing concatenation of adjacent string literals.  */
1980 static void
phase8c_get(token_ty * tp)1981 phase8c_get (token_ty *tp)
1982 {
1983   token_ty tmp;
1984 
1985   phase8b_get (tp);
1986   if (tp->type != token_type_objc_special)
1987     return;
1988   phase8b_get (&tmp);
1989   if (tmp.type != token_type_string_literal)
1990     {
1991       phase8b_unget (&tmp);
1992       return;
1993     }
1994   /* Drop the '@' token and return immediately the following string.  */
1995   drop_reference (tmp.comment);
1996   tmp.comment = tp->comment;
1997   *tp = tmp;
1998 }
1999 
/* Push *TP back for phase 8c.  Supports only one pushback token:
   this forwards to phase8b_unget, whose 2 slots are shared with
   phase8c_get, which may itself push back one lookahead token.  */
static inline void
phase8c_unget (token_ty *tp)
{
  phase8b_unget (tp);
}
2006 
2007 
2008 /* 8. Concatenate adjacent string literals to form single string
2009    literals (because we don't expand macros, there are a few things we
2010    will miss).  */
2011 
2012 static void
phase8_get(token_ty * tp)2013 phase8_get (token_ty *tp)
2014 {
2015   phase8c_get (tp);
2016   if (tp->type != token_type_string_literal)
2017     return;
2018   for (;;)
2019     {
2020       token_ty tmp;
2021 
2022       phase8c_get (&tmp);
2023       if (tmp.type != token_type_string_literal)
2024         {
2025           phase8c_unget (&tmp);
2026           return;
2027         }
2028       tp->mixed_string =
2029         mixed_string_concat_free1 (tp->mixed_string, tmp.mixed_string);
2030       free_token (&tmp);
2031     }
2032 }
2033 
2034 
2035 /* ===================== Reading of high-level tokens.  ==================== */
2036 
2037 
/* Kinds of high-level tokens that x_c_lex delivers to the extractor.  */
enum xgettext_token_type_ty
{
  xgettext_token_type_eof,              /* end of input */
  xgettext_token_type_keyword,          /* name found in the keyword table */
  xgettext_token_type_symbol,           /* any other name */
  xgettext_token_type_lparen,           /* '(' */
  xgettext_token_type_rparen,           /* ')' */
  xgettext_token_type_comma,            /* ',' */
  xgettext_token_type_colon,            /* ':' */
  xgettext_token_type_string_literal,   /* string literal */
  xgettext_token_type_other             /* every remaining kind of token */
};
typedef enum xgettext_token_type_ty xgettext_token_type_ty;
2051 
/* A high-level token, as filled in by x_c_lex.  Which of the fields after
   TYPE carry a value depends on TYPE; the others are left untouched.  */
typedef struct xgettext_token_ty xgettext_token_ty;
struct xgettext_token_ty
{
  /* Kind of token.  */
  xgettext_token_type_ty type;

  /* This field is used only for xgettext_token_type_keyword.
     It is the value found for the name in the keyword hash table.  */
  const struct callshapes *shapes;

  /* This field is used only for xgettext_token_type_keyword,
     xgettext_token_type_symbol.
     It is the name; the consumer of the token releases it with free().  */
  char *string;

  /* This field is used only for xgettext_token_type_string_literal.
     It holds the contents of the (possibly concatenated) literal.  */
  mixed_string_ty *mixed_string;

  /* This field is used only for xgettext_token_type_string_literal.
     It is a counted reference to the comment saved for the literal; the
     consumer of the token gives it up with drop_reference().  */
  refcounted_string_list_ty *comment;

  /* This field is used only for xgettext_token_type_keyword,
     xgettext_token_type_string_literal.
     It is the logical file name and line number of the token.  */
  lex_pos_ty pos;
};
2074 
2075 
2076 /* 9. Convert the remaining preprocessing tokens to C tokens and
2077    discards any white space from the translation unit.  */
2078 
2079 static void
x_c_lex(xgettext_token_ty * tp)2080 x_c_lex (xgettext_token_ty *tp)
2081 {
2082   for (;;)
2083     {
2084       token_ty token;
2085       void *keyword_value;
2086 
2087       phase8_get (&token);
2088       switch (token.type)
2089         {
2090         case token_type_eof:
2091           tp->type = xgettext_token_type_eof;
2092           return;
2093 
2094         case token_type_name:
2095           last_non_comment_line = newline_count;
2096 
2097           if (hash_find_entry (objc_extensions ? &objc_keywords : &c_keywords,
2098                                token.string, strlen (token.string),
2099                                &keyword_value)
2100               == 0)
2101             {
2102               tp->type = xgettext_token_type_keyword;
2103               tp->shapes = (const struct callshapes *) keyword_value;
2104               tp->pos.file_name = logical_file_name;
2105               tp->pos.line_number = token.line_number;
2106             }
2107           else
2108             tp->type = xgettext_token_type_symbol;
2109           tp->string = token.string;
2110           return;
2111 
2112         case token_type_lparen:
2113           last_non_comment_line = newline_count;
2114 
2115           tp->type = xgettext_token_type_lparen;
2116           return;
2117 
2118         case token_type_rparen:
2119           last_non_comment_line = newline_count;
2120 
2121           tp->type = xgettext_token_type_rparen;
2122           return;
2123 
2124         case token_type_comma:
2125           last_non_comment_line = newline_count;
2126 
2127           tp->type = xgettext_token_type_comma;
2128           return;
2129 
2130         case token_type_colon:
2131           last_non_comment_line = newline_count;
2132 
2133           tp->type = xgettext_token_type_colon;
2134           return;
2135 
2136         case token_type_string_literal:
2137           last_non_comment_line = newline_count;
2138 
2139           tp->type = xgettext_token_type_string_literal;
2140           tp->mixed_string = token.mixed_string;
2141           tp->comment = token.comment;
2142           tp->pos.file_name = logical_file_name;
2143           tp->pos.line_number = token.line_number;
2144           return;
2145 
2146         case token_type_objc_special:
2147           drop_reference (token.comment);
2148           /* FALLTHROUGH */
2149 
2150         default:
2151           last_non_comment_line = newline_count;
2152 
2153           tp->type = xgettext_token_type_other;
2154           return;
2155         }
2156     }
2157 }
2158 
2159 
2160 /* ========================= Extracting strings.  ========================== */
2161 
2162 
/* Context lookup table.  Set by extract_whole_file from its FLAG_TABLE
   argument; extract_parenthesized looks up each keyword or symbol name
   in it to obtain the context list iterator for that name.  */
static flag_context_list_table_ty *flag_context_list_table;
2165 
2166 
2167 /* The file is broken into tokens.  Scan the token stream, looking for
2168    a keyword, followed by a left paren, followed by a string.  When we
2169    see this sequence, we have something to remember.  We assume we are
2170    looking at a valid C or C++ program, and leave the complaints about
2171    the grammar to the compiler.
2172 
2173      Normal handling: Look for
2174        keyword ( ... msgid ... )
2175      Plural handling: Look for
2176        keyword ( ... msgid ... msgid_plural ... )
2177 
2178    We use recursion because the arguments before msgid or between msgid
2179    and msgid_plural can contain subexpressions of the same form.  */
2180 
2181 
/* Extract messages until the next balanced closing parenthesis.
   Extracted messages are added to MLP.
   OUTER_CONTEXT is the flag context in effect around the parenthesized
   list.  CONTEXT_ITER yields the per-argument contexts; it is advanced
   once on entry and once more at every comma.
   ARGPARSER collects the string arguments of this list; it is passed to
   arglist_parser_done, together with the current argument number, on
   every path that leaves this function.
   Return true upon eof, false upon closing parenthesis.  */
static bool
extract_parenthesized (message_list_ty *mlp,
                       flag_context_ty outer_context,
                       flag_context_list_iterator_ty context_iter,
                       struct arglist_parser *argparser)
{
  /* Current argument number.  */
  int arg = 1;
  /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
  int state;
  /* Parameters of the keyword just seen.  Defined only in state 1.  */
  const struct callshapes *next_shapes = NULL;
  /* Context iterator that will be used if the next token is a '('.  */
  flag_context_list_iterator_ty next_context_iter =
    passthrough_context_list_iterator;
  /* Context iterator that will be used if the next token is a ':'.
     (Objective C selector syntax.)  */
  flag_context_list_iterator_ty selectorcall_context_iter =
    passthrough_context_list_iterator;
  /* Current context.  */
  flag_context_ty inner_context =
    inherited_context (outer_context,
                       flag_context_list_iterator_advance (&context_iter));

  /* Start state is 0.  */
  state = 0;

  for (;;)
    {
      xgettext_token_ty token;

      x_c_lex (&token);
      switch (token.type)
        {
        case xgettext_token_type_keyword:
          /* Remember the call shapes in case a '(' follows directly.  */
          next_shapes = token.shapes;
          state = 1;
          goto keyword_or_symbol;

        case xgettext_token_type_symbol:
          state = 0;
        keyword_or_symbol:
          /* Both keywords and plain symbols may have flag contexts
             attached to their name.  */
          next_context_iter =
            flag_context_list_iterator (
              flag_context_list_table_lookup (
                flag_context_list_table,
                token.string, strlen (token.string)));
          if (objc_extensions)
            {
              /* Look the name up a second time with a ':' appended, for
                 use when a ':' token follows.  */
              size_t token_string_len = strlen (token.string);
              token.string = xrealloc (token.string, token_string_len + 2);
              token.string[token_string_len] = ':';
              token.string[token_string_len + 1] = '\0';
              selectorcall_context_iter =
                flag_context_list_iterator (
                  flag_context_list_table_lookup (
                    flag_context_list_table,
                    token.string, token_string_len + 1));
            }
          /* The name is not needed any more.  */
          free (token.string);
          continue;

        case xgettext_token_type_lparen:
          /* Handle the nested list recursively.  It gets the call shapes
             only when the '(' directly follows a keyword.  */
          if (extract_parenthesized (mlp, inner_context, next_context_iter,
                                     arglist_parser_alloc (mlp,
                                                           state ? next_shapes : NULL)))
            {
              /* End of file inside the nested list.  */
              arglist_parser_done (argparser, arg);
              return true;
            }
          next_context_iter = null_context_list_iterator;
          selectorcall_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case xgettext_token_type_rparen:
          arglist_parser_done (argparser, arg);
          return false;

        case xgettext_token_type_comma:
          /* Next argument: step to its context, derived from the outer
             context.  */
          arg++;
          inner_context =
            inherited_context (outer_context,
                               flag_context_list_iterator_advance (
                                 &context_iter));
          next_context_iter = passthrough_context_list_iterator;
          selectorcall_context_iter = passthrough_context_list_iterator;
          state = 0;
          continue;

        case xgettext_token_type_colon:
          if (objc_extensions)
            {
              /* Switch to the contexts looked up for the selector name
                 with ':' appended; the new context is derived from the
                 current inner context.  */
              context_iter = selectorcall_context_iter;
              inner_context =
                inherited_context (inner_context,
                                   flag_context_list_iterator_advance (
                                     &context_iter));
              next_context_iter = passthrough_context_list_iterator;
              selectorcall_context_iter = passthrough_context_list_iterator;
            }
          else
            {
              next_context_iter = null_context_list_iterator;
              selectorcall_context_iter = null_context_list_iterator;
            }
          state = 0;
          continue;

        case xgettext_token_type_string_literal:
          {
            if (extract_all)
              {
                /* Every string literal is recorded as a message right
                   away, regardless of keywords.  */
                char *string = mixed_string_contents (token.mixed_string);
                mixed_string_free (token.mixed_string);
                remember_a_message (mlp, NULL, string, true, false,
                                    inner_context, &token.pos,
                                    NULL, token.comment, false);
              }
            else
              /* Record the string as argument number ARG of the
                 enclosing call.  The mixed string is not freed on this
                 path.  */
              arglist_parser_remember (argparser, arg, token.mixed_string,
                                       inner_context,
                                       token.pos.file_name,
                                       token.pos.line_number,
                                       token.comment, false);
            /* Give up the comment reference held by the token.  */
            drop_reference (token.comment);
          }
          next_context_iter = null_context_list_iterator;
          selectorcall_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case xgettext_token_type_other:
          next_context_iter = null_context_list_iterator;
          selectorcall_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case xgettext_token_type_eof:
          arglist_parser_done (argparser, arg);
          return true;

        default:
          /* x_c_lex delivers no other token types.  */
          abort ();
        }
    }
}
2332 
2333 
2334 static void
extract_whole_file(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)2335 extract_whole_file (FILE *f,
2336                     const char *real_filename, const char *logical_filename,
2337                     flag_context_list_table_ty *flag_table,
2338                     msgdomain_list_ty *mdlp)
2339 {
2340   message_list_ty *mlp = mdlp->item[0]->messages;
2341 
2342   fp = f;
2343   real_file_name = real_filename;
2344   logical_file_name = xstrdup (logical_filename);
2345   line_number = 1;
2346 
2347   phase1_pushback_length = 0;
2348   phase2_pushback_length = 0;
2349   phase3_pushback_length = 0;
2350 
2351   last_comment_line = -1;
2352   last_non_comment_line = -1;
2353   newline_count = 0;
2354 
2355   phase5_pushback_length = 0;
2356   phase6_pushback_length = 0;
2357 
2358   flag_context_list_table = flag_table;
2359 
2360   init_keywords ();
2361 
2362   /* Eat tokens until eof is seen.  When extract_parenthesized returns
2363      due to an unbalanced closing parenthesis, just restart it.  */
2364   while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
2365                                  arglist_parser_alloc (mlp, NULL)))
2366     ;
2367 
2368   /* Close scanner.  */
2369   fp = NULL;
2370   real_file_name = NULL;
2371   logical_file_name = NULL;
2372   line_number = 0;
2373 }
2374 
2375 
/* Extract messages from the stream F into MDLP, with both the
   Objective C and the C++ extensions switched off.  All arguments are
   passed on unchanged to extract_whole_file.  */
void
extract_c (FILE *f,
           const char *real_filename, const char *logical_filename,
           flag_context_list_table_ty *flag_table,
           msgdomain_list_ty *mdlp)
{
  objc_extensions = false;
  cxx_extensions = false;
  extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);
}
2386 
/* Extract messages from the stream F into MDLP, with the C++ extensions
   switched on and the Objective C extensions switched off.  All
   arguments are passed on unchanged to extract_whole_file.  */
void
extract_cxx (FILE *f,
             const char *real_filename, const char *logical_filename,
             flag_context_list_table_ty *flag_table,
             msgdomain_list_ty *mdlp)
{
  objc_extensions = false;
  cxx_extensions = true;
  extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);
}
2397 
/* Extract messages from the stream F into MDLP, with the Objective C
   extensions switched on and the C++ extensions switched off.  All
   arguments are passed on unchanged to extract_whole_file.  */
void
extract_objc (FILE *f,
              const char *real_filename, const char *logical_filename,
              flag_context_list_table_ty *flag_table,
              msgdomain_list_ty *mdlp)
{
  objc_extensions = true;
  cxx_extensions = false;
  extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);
}
2408