1 /* xgettext C/C++/ObjectiveC backend.
2 Copyright (C) 1995-1998, 2000-2009, 2012-2015, 2018-2020 Free Software Foundation, Inc.
3
4 This file was written by Peter Miller <millerp@canb.auug.org.au>
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22
23 /* Specification. */
24 #include "x-c.h"
25
26 #include <assert.h>
27 #include <errno.h>
28 #include <stdbool.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32
33 #include "message.h"
34 #include "rc-str-list.h"
35 #include "xgettext.h"
36 #include "xg-pos.h"
37 #include "xg-encoding.h"
38 #include "xg-mixed-string.h"
39 #include "xg-arglist-context.h"
40 #include "xg-arglist-callshape.h"
41 #include "xg-arglist-parser.h"
42 #include "xg-message.h"
43 #include "error.h"
44 #include "error-progname.h"
45 #include "xalloc.h"
46 #include "xvasprintf.h"
47 #include "mem-hash-map.h"
48 #include "po-charset.h"
49 #include "gettext.h"
50
/* Shorthand: _(S) expands to gettext (S).  */
#define _(s) gettext(s)

/* Number of elements of the array A.  A must be an array object (not a
   pointer).  The argument is parenthesized in both places, so that an
   arbitrary array-valued expression can be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
54
55
56 /* The ANSI C standard defines several phases of translation:
57
58 1. Terminate line by \n, regardless of the external representation
59 of a text line. Stdio does this for us.
60
61 2. Convert trigraphs to their single character equivalents.
62
63 3. Concatenate each line ending in backslash (\) with the following
64 line.
65
66 4. Replace each comment with a space character.
67
   5. Parse each resulting logical line as preprocessing tokens and
      white space.
70
71 6. Recognize and carry out directives (it also expands macros on
72 non-directive lines, which we do not do here).
73
74 7. Replaces escape sequences within character strings with their
75 single character equivalents (we do this in step 5, because we
76 don't have to worry about the #include argument).
77
78 8. Concatenates adjacent string literals to form single string
79 literals (because we don't expand macros, there are a few things
80 we will miss).
81
82 9. Converts the remaining preprocessing tokens to C tokens and
83 discards any white space from the translation unit.
84
85 This lexer implements the above, and presents the scanner (in
86 xgettext.c) with a stream of C tokens. The comments are
87 accumulated in a buffer, and given to xgettext when asked for. */
88
89
90 /* ========================= Lexer customization. ========================= */
91
/* Whether phase 2 of the lexer translates trigraph sequences.
   Off until x_c_trigraphs () is called.  */
static bool trigraphs = false;

/* Enable trigraph translation.  Calling it more than once is harmless.  */
void
x_c_trigraphs (void)
{
  trigraphs = true;
}
99
100
101 /* ====================== Keyword set customization. ====================== */
102
103 /* If true extract all strings. */
104 static bool extract_all = false;
105
106 static hash_table c_keywords;
107 static hash_table objc_keywords;
108 static bool default_keywords = true;
109
110
111 void
x_c_extract_all()112 x_c_extract_all ()
113 {
114 extract_all = true;
115 }
116
117
118 static void
add_keyword(const char * name,hash_table * keywords)119 add_keyword (const char *name, hash_table *keywords)
120 {
121 if (name == NULL)
122 default_keywords = false;
123 else
124 {
125 const char *end;
126 struct callshape shape;
127 const char *colon;
128
129 if (keywords->table == NULL)
130 hash_init (keywords, 100);
131
132 split_keywordspec (name, &end, &shape);
133
134 /* The characters between name and end should form a valid C identifier.
135 A colon means an invalid parse in split_keywordspec(). */
136 colon = strchr (name, ':');
137 if (colon == NULL || colon >= end)
138 insert_keyword_callshape (keywords, name, end - name, &shape);
139 }
140 }
141
142 void
x_c_keyword(const char * name)143 x_c_keyword (const char *name)
144 {
145 add_keyword (name, &c_keywords);
146 }
147
148 void
x_objc_keyword(const char * name)149 x_objc_keyword (const char *name)
150 {
151 add_keyword (name, &objc_keywords);
152 }
153
/* When true, init_keywords () also installs the KDE keywords as part of
   the built-in C/C++ keywords.  */
static bool additional_keywords_kde;

/* Request the additional KDE keywords.  Only has an effect if called
   before init_keywords () installs the default keywords.  */
void
activate_additional_keywords_kde (void)
{
  additional_keywords_kde = true;
}
161
162 /* Finish initializing the keywords hash tables.
163 Called after argument processing, before each file is processed. */
164 static void
init_keywords()165 init_keywords ()
166 {
167 if (default_keywords)
168 {
169 /* When adding new keywords here, also update the documentation in
170 xgettext.texi! */
171 x_c_keyword ("gettext");
172 x_c_keyword ("dgettext:2");
173 x_c_keyword ("dcgettext:2");
174 x_c_keyword ("ngettext:1,2");
175 x_c_keyword ("dngettext:2,3");
176 x_c_keyword ("dcngettext:2,3");
177 x_c_keyword ("gettext_noop");
178 x_c_keyword ("pgettext:1c,2");
179 x_c_keyword ("dpgettext:2c,3");
180 x_c_keyword ("dcpgettext:2c,3");
181 x_c_keyword ("npgettext:1c,2,3");
182 x_c_keyword ("dnpgettext:2c,3,4");
183 x_c_keyword ("dcnpgettext:2c,3,4");
184
185 if (additional_keywords_kde)
186 {
187 x_c_keyword ("i18n:1");
188 x_c_keyword ("i18nc:1c,2");
189 x_c_keyword ("i18np:1,2");
190 x_c_keyword ("i18ncp:1c,2,3");
191 x_c_keyword ("i18nd:2");
192 x_c_keyword ("i18ndc:2c,3");
193 x_c_keyword ("i18ndp:2,3");
194 x_c_keyword ("i18ndcp:2c,3,4");
195 x_c_keyword ("ki18n:1");
196 x_c_keyword ("ki18nc:1c,2");
197 x_c_keyword ("ki18np:1,2");
198 x_c_keyword ("ki18ncp:1c,2,3");
199 x_c_keyword ("ki18nd:2");
200 x_c_keyword ("ki18ndc:2c,3");
201 x_c_keyword ("ki18ndp:2,3");
202 x_c_keyword ("ki18ndcp:2c,3,4");
203 x_c_keyword ("I18N_NOOP:1");
204 x_c_keyword ("I18NC_NOOP:1c,2");
205 x_c_keyword ("I18N_NOOP2:1c,2");
206 x_c_keyword ("I18N_NOOP2_NOSTRIP:1c,2");
207 x_c_keyword ("xi18n:1");
208 x_c_keyword ("xi18nc:1c,2");
209 x_c_keyword ("xi18np:1,2");
210 x_c_keyword ("xi18ncp:1c,2,3");
211 x_c_keyword ("xi18nd:2");
212 x_c_keyword ("xi18ndc:2c,3");
213 x_c_keyword ("xi18ndp:2,3");
214 x_c_keyword ("xi18ndcp:2c,3,4");
215 x_c_keyword ("kxi18n:1");
216 x_c_keyword ("kxi18nc:1c,2");
217 x_c_keyword ("kxi18np:1,2");
218 x_c_keyword ("kxi18ncp:1c,2,3");
219 x_c_keyword ("kxi18nd:2");
220 x_c_keyword ("kxi18ndc:2c,3");
221 x_c_keyword ("kxi18ndp:2,3");
222 x_c_keyword ("kxi18ndcp:2c,3,4");
223 x_c_keyword ("XI18N_NOOP:1");
224 x_c_keyword ("XI18NC_NOOP:1c,2");
225 x_c_keyword ("XI18N_NOOP2:1c,2");
226 x_c_keyword ("XI18N_NOOP2_NOSTRIP:1c,2");
227 }
228
229 x_objc_keyword ("gettext");
230 x_objc_keyword ("dgettext:2");
231 x_objc_keyword ("dcgettext:2");
232 x_objc_keyword ("ngettext:1,2");
233 x_objc_keyword ("dngettext:2,3");
234 x_objc_keyword ("dcngettext:2,3");
235 x_objc_keyword ("gettext_noop");
236 x_objc_keyword ("pgettext:1c,2");
237 x_objc_keyword ("dpgettext:2c,3");
238 x_objc_keyword ("dcpgettext:2c,3");
239 x_objc_keyword ("npgettext:1c,2,3");
240 x_objc_keyword ("dnpgettext:2c,3,4");
241 x_objc_keyword ("dcnpgettext:2c,3,4");
242 x_objc_keyword ("NSLocalizedString"); /* similar to gettext */
243 x_objc_keyword ("_"); /* similar to gettext */
244 x_objc_keyword ("NSLocalizedStaticString"); /* similar to gettext_noop */
245 x_objc_keyword ("__"); /* similar to gettext_noop */
246
247 default_keywords = false;
248 }
249 }
250
/* Hand the built-in flag specifications of the C/C++ backend to
   xgettext_record_flag (), one call per specification, in the order
   listed.  Each specification is a string of the shape
   FUNCTION:ARGNUM:FLAG.  The group inside "#if 0" is compiled out.  */
void
init_flag_table_c (void)
{
  xgettext_record_flag ("gettext:1:pass-c-format");
  xgettext_record_flag ("dgettext:2:pass-c-format");
  xgettext_record_flag ("dcgettext:2:pass-c-format");
  xgettext_record_flag ("ngettext:1:pass-c-format");
  xgettext_record_flag ("ngettext:2:pass-c-format");
  xgettext_record_flag ("dngettext:2:pass-c-format");
  xgettext_record_flag ("dngettext:3:pass-c-format");
  xgettext_record_flag ("dcngettext:2:pass-c-format");
  xgettext_record_flag ("dcngettext:3:pass-c-format");
  xgettext_record_flag ("gettext_noop:1:pass-c-format");
  xgettext_record_flag ("pgettext:2:pass-c-format");
  xgettext_record_flag ("dpgettext:3:pass-c-format");
  xgettext_record_flag ("dcpgettext:3:pass-c-format");
  xgettext_record_flag ("npgettext:2:pass-c-format");
  xgettext_record_flag ("npgettext:3:pass-c-format");
  xgettext_record_flag ("dnpgettext:3:pass-c-format");
  xgettext_record_flag ("dnpgettext:4:pass-c-format");
  xgettext_record_flag ("dcnpgettext:3:pass-c-format");
  xgettext_record_flag ("dcnpgettext:4:pass-c-format");

  /* <stdio.h> */
  xgettext_record_flag ("fprintf:2:c-format");
  xgettext_record_flag ("vfprintf:2:c-format");
  xgettext_record_flag ("printf:1:c-format");
  xgettext_record_flag ("vprintf:1:c-format");
  xgettext_record_flag ("sprintf:2:c-format");
  xgettext_record_flag ("vsprintf:2:c-format");
  xgettext_record_flag ("snprintf:3:c-format");
  xgettext_record_flag ("vsnprintf:3:c-format");
#if 0 /* These functions are not standard.  */
  /* <stdio.h> */
  xgettext_record_flag ("asprintf:2:c-format");
  xgettext_record_flag ("vasprintf:2:c-format");
  xgettext_record_flag ("dprintf:2:c-format");
  xgettext_record_flag ("vdprintf:2:c-format");
  xgettext_record_flag ("obstack_printf:2:c-format");
  xgettext_record_flag ("obstack_vprintf:2:c-format");
  /* <error.h> */
  xgettext_record_flag ("error:3:c-format");
  xgettext_record_flag ("error_at_line:5:c-format");
  /* <argp.h> */
  xgettext_record_flag ("argp_error:2:c-format");
  xgettext_record_flag ("argp_failure:2:c-format");
#endif

  xgettext_record_flag ("gettext:1:pass-qt-format");
  xgettext_record_flag ("dgettext:2:pass-qt-format");
  xgettext_record_flag ("dcgettext:2:pass-qt-format");
  xgettext_record_flag ("ngettext:1:pass-qt-format");
  xgettext_record_flag ("ngettext:2:pass-qt-format");
  xgettext_record_flag ("dngettext:2:pass-qt-format");
  xgettext_record_flag ("dngettext:3:pass-qt-format");
  xgettext_record_flag ("dcngettext:2:pass-qt-format");
  xgettext_record_flag ("dcngettext:3:pass-qt-format");
  xgettext_record_flag ("gettext_noop:1:pass-qt-format");
  xgettext_record_flag ("pgettext:2:pass-qt-format");
  xgettext_record_flag ("dpgettext:3:pass-qt-format");
  xgettext_record_flag ("dcpgettext:3:pass-qt-format");
  xgettext_record_flag ("npgettext:2:pass-qt-format");
  xgettext_record_flag ("npgettext:3:pass-qt-format");
  xgettext_record_flag ("dnpgettext:3:pass-qt-format");
  xgettext_record_flag ("dnpgettext:4:pass-qt-format");
  xgettext_record_flag ("dcnpgettext:3:pass-qt-format");
  xgettext_record_flag ("dcnpgettext:4:pass-qt-format");

  xgettext_record_flag ("gettext:1:pass-kde-format");
  xgettext_record_flag ("dgettext:2:pass-kde-format");
  xgettext_record_flag ("dcgettext:2:pass-kde-format");
  xgettext_record_flag ("ngettext:1:pass-kde-format");
  xgettext_record_flag ("ngettext:2:pass-kde-format");
  xgettext_record_flag ("dngettext:2:pass-kde-format");
  xgettext_record_flag ("dngettext:3:pass-kde-format");
  xgettext_record_flag ("dcngettext:2:pass-kde-format");
  xgettext_record_flag ("dcngettext:3:pass-kde-format");
  xgettext_record_flag ("gettext_noop:1:pass-kde-format");
  xgettext_record_flag ("pgettext:2:pass-kde-format");
  xgettext_record_flag ("dpgettext:3:pass-kde-format");
  xgettext_record_flag ("dcpgettext:3:pass-kde-format");
  xgettext_record_flag ("npgettext:2:pass-kde-format");
  xgettext_record_flag ("npgettext:3:pass-kde-format");
  xgettext_record_flag ("dnpgettext:3:pass-kde-format");
  xgettext_record_flag ("dnpgettext:4:pass-kde-format");
  xgettext_record_flag ("dcnpgettext:3:pass-kde-format");
  xgettext_record_flag ("dcnpgettext:4:pass-kde-format");

  xgettext_record_flag ("gettext:1:pass-boost-format");
  xgettext_record_flag ("dgettext:2:pass-boost-format");
  xgettext_record_flag ("dcgettext:2:pass-boost-format");
  xgettext_record_flag ("ngettext:1:pass-boost-format");
  xgettext_record_flag ("ngettext:2:pass-boost-format");
  xgettext_record_flag ("dngettext:2:pass-boost-format");
  xgettext_record_flag ("dngettext:3:pass-boost-format");
  xgettext_record_flag ("dcngettext:2:pass-boost-format");
  xgettext_record_flag ("dcngettext:3:pass-boost-format");
  xgettext_record_flag ("gettext_noop:1:pass-boost-format");
  xgettext_record_flag ("pgettext:2:pass-boost-format");
  xgettext_record_flag ("dpgettext:3:pass-boost-format");
  xgettext_record_flag ("dcpgettext:3:pass-boost-format");
  xgettext_record_flag ("npgettext:2:pass-boost-format");
  xgettext_record_flag ("npgettext:3:pass-boost-format");
  xgettext_record_flag ("dnpgettext:3:pass-boost-format");
  xgettext_record_flag ("dnpgettext:4:pass-boost-format");
  xgettext_record_flag ("dcnpgettext:3:pass-boost-format");
  xgettext_record_flag ("dcnpgettext:4:pass-boost-format");

  /* <boost/format.hpp> */
  xgettext_record_flag ("format:1:boost-format");
}
362
/* Hand the built-in flag specifications that are specific to the
   ObjectiveC backend to xgettext_record_flag (), one call per
   specification, in the order listed.  The last group names ObjectiveC
   methods; there the method name itself ends in a colon, hence the "::".  */
void
init_flag_table_objc (void)
{
  /* Since the settings done in init_flag_table_c() also have an effect for
     the ObjectiveC parser, we don't have to repeat them here.  */
  xgettext_record_flag ("gettext:1:pass-objc-format");
  xgettext_record_flag ("dgettext:2:pass-objc-format");
  xgettext_record_flag ("dcgettext:2:pass-objc-format");
  xgettext_record_flag ("ngettext:1:pass-objc-format");
  xgettext_record_flag ("ngettext:2:pass-objc-format");
  xgettext_record_flag ("dngettext:2:pass-objc-format");
  xgettext_record_flag ("dngettext:3:pass-objc-format");
  xgettext_record_flag ("dcngettext:2:pass-objc-format");
  xgettext_record_flag ("dcngettext:3:pass-objc-format");
  xgettext_record_flag ("gettext_noop:1:pass-objc-format");
  xgettext_record_flag ("pgettext:2:pass-objc-format");
  xgettext_record_flag ("dpgettext:3:pass-objc-format");
  xgettext_record_flag ("dcpgettext:3:pass-objc-format");
  xgettext_record_flag ("npgettext:2:pass-objc-format");
  xgettext_record_flag ("npgettext:3:pass-objc-format");
  xgettext_record_flag ("dnpgettext:3:pass-objc-format");
  xgettext_record_flag ("dnpgettext:4:pass-objc-format");
  xgettext_record_flag ("dcnpgettext:3:pass-objc-format");
  xgettext_record_flag ("dcnpgettext:4:pass-objc-format");
  xgettext_record_flag ("NSLocalizedString:1:pass-c-format");
  xgettext_record_flag ("NSLocalizedString:1:pass-objc-format");
  xgettext_record_flag ("_:1:pass-c-format");
  xgettext_record_flag ("_:1:pass-objc-format");
  xgettext_record_flag ("stringWithFormat::1:objc-format");
  xgettext_record_flag ("initWithFormat::1:objc-format");
  xgettext_record_flag ("stringByAppendingFormat::1:objc-format");
  xgettext_record_flag ("localizedStringWithFormat::1:objc-format");
  xgettext_record_flag ("appendFormat::1:objc-format");
}
397
/* Hand the built-in flag specifications for the gcc-internal-format and
   gfc-internal-format flags to xgettext_record_flag (), one call per
   specification, in the order listed.  The two groups inside "#if 0" are
   compiled out; they are kept as a record of candidate specifications.  */
void
init_flag_table_gcc_internal (void)
{
  xgettext_record_flag ("gettext:1:pass-gcc-internal-format");
  xgettext_record_flag ("dgettext:2:pass-gcc-internal-format");
  xgettext_record_flag ("dcgettext:2:pass-gcc-internal-format");
  xgettext_record_flag ("ngettext:1:pass-gcc-internal-format");
  xgettext_record_flag ("ngettext:2:pass-gcc-internal-format");
  xgettext_record_flag ("dngettext:2:pass-gcc-internal-format");
  xgettext_record_flag ("dngettext:3:pass-gcc-internal-format");
  xgettext_record_flag ("dcngettext:2:pass-gcc-internal-format");
  xgettext_record_flag ("dcngettext:3:pass-gcc-internal-format");
  xgettext_record_flag ("gettext_noop:1:pass-gcc-internal-format");
  xgettext_record_flag ("pgettext:2:pass-gcc-internal-format");
  xgettext_record_flag ("dpgettext:3:pass-gcc-internal-format");
  xgettext_record_flag ("dcpgettext:3:pass-gcc-internal-format");
  xgettext_record_flag ("npgettext:2:pass-gcc-internal-format");
  xgettext_record_flag ("npgettext:3:pass-gcc-internal-format");
  xgettext_record_flag ("dnpgettext:3:pass-gcc-internal-format");
  xgettext_record_flag ("dnpgettext:4:pass-gcc-internal-format");
  xgettext_record_flag ("dcnpgettext:3:pass-gcc-internal-format");
  xgettext_record_flag ("dcnpgettext:4:pass-gcc-internal-format");
#if 0 /* This should better be done inside GCC.  */
  /* grepping for ATTRIBUTE_PRINTF in gcc-3.3/gcc/?*.h */
  /* c-format.c */
  xgettext_record_flag ("status_warning:2:gcc-internal-format");
  /* c-tree.h */
  xgettext_record_flag ("pedwarn_c99:1:pass-gcc-internal-format");
  /* collect2.h */
  //xgettext_record_flag ("error:1:c-format"); // 3 different versions
  xgettext_record_flag ("notice:1:c-format");
  //xgettext_record_flag ("fatal:1:c-format"); // 2 different versions
  xgettext_record_flag ("fatal_perror:1:c-format");
  /* cpplib.h */
  xgettext_record_flag ("cpp_error:3:c-format");
  xgettext_record_flag ("cpp_error_with_line:5:c-format");
  /* diagnostic.h */
  xgettext_record_flag ("diagnostic_set_info:2:pass-gcc-internal-format");
  xgettext_record_flag ("output_printf:2:gcc-internal-format");
  xgettext_record_flag ("output_verbatim:2:pass-gcc-internal-format");
  xgettext_record_flag ("verbatim:1:gcc-internal-format");
  xgettext_record_flag ("inform:1:pass-gcc-internal-format");
  /* gcc.h */
  //xgettext_record_flag ("fatal:1:c-format"); // 2 different versions
  //xgettext_record_flag ("error:1:c-format"); // 3 different versions
  /* genattrtab.h */
  xgettext_record_flag ("attr_printf:2:pass-c-format");
  /* gengtype.h */
  xgettext_record_flag ("error_at_line:2:pass-c-format");
  xgettext_record_flag ("xvasprintf:2:pass-c-format");
  xgettext_record_flag ("xasprintf:1:pass-c-format");
  xgettext_record_flag ("oprintf:2:pass-c-format");
  /* gensupport.h */
  xgettext_record_flag ("message_with_line:2:pass-c-format");
  /* output.h */
  xgettext_record_flag ("output_operand_lossage:1:c-format");
  /* ra.h */
  xgettext_record_flag ("ra_debug_msg:2:pass-c-format");
  /* toplev.h */
  xgettext_record_flag ("fnotice:2:c-format");
  xgettext_record_flag ("fatal_io_error:2:gcc-internal-format");
  xgettext_record_flag ("error_for_asm:2:pass-gcc-internal-format");
  xgettext_record_flag ("warning_for_asm:2:pass-gcc-internal-format");
  xgettext_record_flag ("error_with_file_and_line:3:pass-gcc-internal-format");
  xgettext_record_flag ("error_with_decl:2:pass-gcc-internal-format");
  xgettext_record_flag ("pedwarn:1:gcc-internal-format");
  xgettext_record_flag ("pedwarn_with_file_and_line:3:gcc-internal-format");
  xgettext_record_flag ("pedwarn_with_decl:2:gcc-internal-format");
  xgettext_record_flag ("sorry:1:gcc-internal-format");
  xgettext_record_flag ("error:1:pass-gcc-internal-format");
  xgettext_record_flag ("fatal_error:1:pass-gcc-internal-format");
  xgettext_record_flag ("internal_error:1:pass-gcc-internal-format");
  xgettext_record_flag ("warning:1:pass-gcc-internal-format");
  xgettext_record_flag ("warning_with_file_and_line:3:pass-gcc-internal-format");
  xgettext_record_flag ("warning_with_decl:2:pass-gcc-internal-format");
  /* f/com.h */
  xgettext_record_flag ("ffecom_get_invented_identifier:1:pass-c-format");
  /* f/sts.h */
  xgettext_record_flag ("ffests_printf:2:pass-c-format");
  /* java/java-tree.h */
  xgettext_record_flag ("parse_error_context:2:pass-c-format");
#endif

  xgettext_record_flag ("gettext:1:pass-gfc-internal-format");
  xgettext_record_flag ("dgettext:2:pass-gfc-internal-format");
  xgettext_record_flag ("dcgettext:2:pass-gfc-internal-format");
  xgettext_record_flag ("ngettext:1:pass-gfc-internal-format");
  xgettext_record_flag ("ngettext:2:pass-gfc-internal-format");
  xgettext_record_flag ("dngettext:2:pass-gfc-internal-format");
  xgettext_record_flag ("dngettext:3:pass-gfc-internal-format");
  xgettext_record_flag ("dcngettext:2:pass-gfc-internal-format");
  xgettext_record_flag ("dcngettext:3:pass-gfc-internal-format");
  xgettext_record_flag ("gettext_noop:1:pass-gfc-internal-format");
  xgettext_record_flag ("pgettext:2:pass-gfc-internal-format");
  xgettext_record_flag ("dpgettext:3:pass-gfc-internal-format");
  xgettext_record_flag ("dcpgettext:3:pass-gfc-internal-format");
  xgettext_record_flag ("npgettext:2:pass-gfc-internal-format");
  xgettext_record_flag ("npgettext:3:pass-gfc-internal-format");
  xgettext_record_flag ("dnpgettext:3:pass-gfc-internal-format");
  xgettext_record_flag ("dnpgettext:4:pass-gfc-internal-format");
  xgettext_record_flag ("dcnpgettext:3:pass-gfc-internal-format");
  xgettext_record_flag ("dcnpgettext:4:pass-gfc-internal-format");
#if 0 /* This should better be done inside GCC.  */
  /* fortran/error.c */
  xgettext_record_flag ("gfc_error:1:gfc-internal-format");
  xgettext_record_flag ("gfc_error_now:1:gfc-internal-format");
  xgettext_record_flag ("gfc_fatal_error:1:gfc-internal-format");
  xgettext_record_flag ("gfc_internal_error:1:gfc-internal-format");
  xgettext_record_flag ("gfc_notify_std:2:gfc-internal-format");
  xgettext_record_flag ("gfc_warning:1:gfc-internal-format");
  xgettext_record_flag ("gfc_warning_now:1:gfc-internal-format");
#endif
}
511
/* Hand the built-in flag specifications for the KDE functions and macros
   to xgettext_record_flag (), one call per specification, in the order
   listed.  The names without a leading 'x'/'X' get the kde-format flag,
   those with it get the kde-kuit-format flag.  */
void
init_flag_table_kde (void)
{
  xgettext_record_flag ("i18n:1:kde-format");
  xgettext_record_flag ("i18nc:2:kde-format");
  xgettext_record_flag ("i18np:1:kde-format");
  xgettext_record_flag ("i18ncp:2:kde-format");
  xgettext_record_flag ("i18nd:2:kde-format");
  xgettext_record_flag ("i18ndc:3:kde-format");
  xgettext_record_flag ("i18ndp:2:kde-format");
  xgettext_record_flag ("i18ndcp:3:kde-format");
  xgettext_record_flag ("ki18n:1:kde-format");
  xgettext_record_flag ("ki18nc:2:kde-format");
  xgettext_record_flag ("ki18np:1:kde-format");
  xgettext_record_flag ("ki18ncp:2:kde-format");
  xgettext_record_flag ("ki18nd:2:kde-format");
  xgettext_record_flag ("ki18ndc:3:kde-format");
  xgettext_record_flag ("ki18ndp:2:kde-format");
  xgettext_record_flag ("ki18ndcp:3:kde-format");
  xgettext_record_flag ("I18N_NOOP:1:kde-format");
  xgettext_record_flag ("I18NC_NOOP:2:kde-format");
  xgettext_record_flag ("I18N_NOOP2:2:kde-format");
  xgettext_record_flag ("I18N_NOOP2_NOSTRIP:2:kde-format");
  xgettext_record_flag ("xi18n:1:kde-kuit-format");
  xgettext_record_flag ("xi18nc:2:kde-kuit-format");
  xgettext_record_flag ("xi18np:1:kde-kuit-format");
  xgettext_record_flag ("xi18ncp:2:kde-kuit-format");
  xgettext_record_flag ("xi18nd:2:kde-kuit-format");
  xgettext_record_flag ("xi18ndc:3:kde-kuit-format");
  xgettext_record_flag ("xi18ndp:2:kde-kuit-format");
  xgettext_record_flag ("xi18ndcp:3:kde-kuit-format");
  xgettext_record_flag ("kxi18n:1:kde-kuit-format");
  xgettext_record_flag ("kxi18nc:2:kde-kuit-format");
  xgettext_record_flag ("kxi18np:1:kde-kuit-format");
  xgettext_record_flag ("kxi18ncp:2:kde-kuit-format");
  xgettext_record_flag ("kxi18nd:2:kde-kuit-format");
  xgettext_record_flag ("kxi18ndc:3:kde-kuit-format");
  xgettext_record_flag ("kxi18ndp:2:kde-kuit-format");
  xgettext_record_flag ("kxi18ndcp:3:kde-kuit-format");
  xgettext_record_flag ("XI18N_NOOP:1:kde-kuit-format");
  xgettext_record_flag ("XI18NC_NOOP:2:kde-kuit-format");
  xgettext_record_flag ("XI18N_NOOP2:2:kde-kuit-format");
  xgettext_record_flag ("XI18N_NOOP2_NOSTRIP:2:kde-kuit-format");
}
556
557 /* ======================== Reading of characters. ======================== */
558
/* The stream from which phase 0 reads the source file.  */
static FILE *fp;
561
562
563 /* 0. Terminate line by \n, regardless whether the external representation of
564 a line terminator is LF (Unix), CR (Mac) or CR/LF (DOS/Windows).
565 It is debatable whether supporting CR/LF line terminators in C sources
566 on Unix is ISO C or POSIX compliant, but since GCC 3.3 now supports it
567 unconditionally, it must be OK.
568 The so-called "text mode" in stdio on DOS/Windows translates CR/LF to \n
569 automatically, but here we also need this conversion on Unix. As a side
570 effect, on DOS/Windows we also parse CR/CR/LF into a single \n, but this
571 is not a problem. */
572
573
574 static int
phase0_getc()575 phase0_getc ()
576 {
577 int c;
578
579 c = getc (fp);
580 if (c == EOF)
581 {
582 if (ferror (fp))
583 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
584 real_file_name);
585 return EOF;
586 }
587
588 if (c == '\r')
589 {
590 int c1 = getc (fp);
591
592 if (c1 != EOF && c1 != '\n')
593 ungetc (c1, fp);
594
595 /* Seen line terminator CR or CR/LF. */
596 return '\n';
597 }
598
599 return c;
600 }
601
602
603 /* Supports only one pushback character, and not '\n'. */
604 static inline void
phase0_ungetc(int c)605 phase0_ungetc (int c)
606 {
607 if (c != EOF)
608 ungetc (c, fp);
609 }
610
611
612 /* 1. line_number handling. Combine backslash-newline to nothing. */
613
614 static unsigned char phase1_pushback[2];
615 static int phase1_pushback_length;
616
617
618 static int
phase1_getc()619 phase1_getc ()
620 {
621 int c;
622
623 if (phase1_pushback_length)
624 {
625 c = phase1_pushback[--phase1_pushback_length];
626 if (c == '\n')
627 ++line_number;
628 return c;
629 }
630 for (;;)
631 {
632 c = phase0_getc ();
633 switch (c)
634 {
635 case '\n':
636 ++line_number;
637 return '\n';
638
639 case '\\':
640 c = phase0_getc ();
641 if (c != '\n')
642 {
643 phase0_ungetc (c);
644 return '\\';
645 }
646 ++line_number;
647 break;
648
649 default:
650 return c;
651 }
652 }
653 }
654
655
656 /* Supports 2 characters of pushback. */
657 static void
phase1_ungetc(int c)658 phase1_ungetc (int c)
659 {
660 switch (c)
661 {
662 case EOF:
663 break;
664
665 case '\n':
666 --line_number;
667 /* FALLTHROUGH */
668
669 default:
670 if (phase1_pushback_length == SIZEOF (phase1_pushback))
671 abort ();
672 phase1_pushback[phase1_pushback_length++] = c;
673 break;
674 }
675 }
676
677
678 /* 2. Convert trigraphs to their single character equivalents. Most
679 sane human beings vomit copiously at the mention of trigraphs, which
680 is why they are an option. */
681
682 static unsigned char phase2_pushback[1];
683 static int phase2_pushback_length;
684
685
686 static int
phase2_getc()687 phase2_getc ()
688 {
689 int c;
690
691 if (phase2_pushback_length)
692 return phase2_pushback[--phase2_pushback_length];
693 if (!trigraphs)
694 return phase1_getc ();
695
696 c = phase1_getc ();
697 if (c != '?')
698 return c;
699 c = phase1_getc ();
700 if (c != '?')
701 {
702 phase1_ungetc (c);
703 return '?';
704 }
705 c = phase1_getc ();
706 switch (c)
707 {
708 case '(':
709 return '[';
710 case '/':
711 return '\\';
712 case ')':
713 return ']';
714 case '\'':
715 return '^';
716 case '<':
717 return '{';
718 case '!':
719 return '|';
720 case '>':
721 return '}';
722 case '-':
723 return '~';
724 case '#':
725 return '=';
726 }
727 phase1_ungetc (c);
728 phase1_ungetc ('?');
729 return '?';
730 }
731
732
733 /* Supports only one pushback character. */
734 static void
phase2_ungetc(int c)735 phase2_ungetc (int c)
736 {
737 if (c != EOF)
738 {
739 if (phase2_pushback_length == SIZEOF (phase2_pushback))
740 abort ();
741 phase2_pushback[phase2_pushback_length++] = c;
742 }
743 }
744
745
/* 3. Concatenate each line ending in backslash (\) with the following
   line.  Basically, all you need to do is elide "\\\n" sequences from
   the input.  */

/* Pushback stack of phase 3 (nine characters) and its fill level.  */
static unsigned char phase3_pushback[9];
static int phase3_pushback_length;


/* Return the next character of phase 3, or EOF.
   Pushed-back characters are returned first, most recently pushed first.
   Otherwise every backslash that phase 2 delivers immediately before a
   newline is dropped together with that newline.  */
static int
phase3_getc (void)
{
  if (phase3_pushback_length)
    return phase3_pushback[--phase3_pushback_length];
  for (;;)
    {
      int c = phase2_getc ();
      if (c != '\\')
        return c;
      c = phase2_getc ();
      if (c != '\n')
        {
          /* An ordinary backslash.  */
          phase2_ungetc (c);
          return '\\';
        }
      /* Backslash-newline: drop both and read on.  */
    }
}
772
773
774 /* Supports 9 characters of pushback. */
775 static void
phase3_ungetc(int c)776 phase3_ungetc (int c)
777 {
778 if (c != EOF)
779 {
780 if (phase3_pushback_length == SIZEOF (phase3_pushback))
781 abort ();
782 phase3_pushback[phase3_pushback_length++] = c;
783 }
784 }
785
786
/* Accumulating comments.  */

/* The text of the comment line being collected: BUFFER holds BUFLEN
   characters and has room for BUFMAX.  The storage is allocated and grown
   on demand and is reused from one comment line to the next.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;

/* Begin a new comment line: forget the characters collected so far, but
   keep the allocated storage.  */
static inline void
comment_start (void)
{
  buflen = 0;
}
798
799 static inline void
comment_add(int c)800 comment_add (int c)
801 {
802 if (buflen >= bufmax)
803 {
804 bufmax = 2 * bufmax + 10;
805 buffer = xrealloc (buffer, bufmax);
806 }
807 buffer[buflen++] = c;
808 }
809
810 static inline void
comment_line_end(size_t chars_to_remove)811 comment_line_end (size_t chars_to_remove)
812 {
813 buflen -= chars_to_remove;
814 while (buflen >= 1
815 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
816 --buflen;
817 if (chars_to_remove == 0 && buflen >= bufmax)
818 {
819 bufmax = 2 * bufmax + 10;
820 buffer = xrealloc (buffer, bufmax);
821 }
822 buffer[buflen] = '\0';
823 savable_comment_add (buffer);
824 }
825
826
/* Bookkeeping used to decide whether a comment counts as standing
   immediately before a keyword.  phase4_getc () stores the current
   newline_count in last_comment_line after each comment.  */
static int last_comment_line;
static int last_non_comment_line;
static int newline_count;
832
833
834 /* 4. Replace each comment that is not inside a character constant or
835 string literal with a space character. We need to remember the
836 comment for later, because it may be attached to a keyword string.
837 We also optionally understand C++ comments. */
838
839 static int
phase4_getc()840 phase4_getc ()
841 {
842 int c;
843 bool last_was_star;
844
845 c = phase3_getc ();
846 if (c != '/')
847 return c;
848 c = phase3_getc ();
849 switch (c)
850 {
851 default:
852 phase3_ungetc (c);
853 return '/';
854
855 case '*':
856 /* C comment. */
857 comment_start ();
858 last_was_star = false;
859 for (;;)
860 {
861 c = phase3_getc ();
862 if (c == EOF)
863 break;
864 /* We skip all leading white space, but not EOLs. */
865 if (!(buflen == 0 && (c == ' ' || c == '\t')))
866 comment_add (c);
867 switch (c)
868 {
869 case '\n':
870 comment_line_end (1);
871 comment_start ();
872 last_was_star = false;
873 continue;
874
875 case '*':
876 last_was_star = true;
877 continue;
878
879 case '/':
880 if (last_was_star)
881 {
882 comment_line_end (2);
883 break;
884 }
885 /* FALLTHROUGH */
886
887 default:
888 last_was_star = false;
889 continue;
890 }
891 break;
892 }
893 last_comment_line = newline_count;
894 return ' ';
895
896 case '/':
897 /* C++ or ISO C 99 comment. */
898 comment_start ();
899 for (;;)
900 {
901 c = phase3_getc ();
902 if (c == '\n' || c == EOF)
903 break;
904 /* We skip all leading white space, but not EOLs. */
905 if (!(buflen == 0 && (c == ' ' || c == '\t')))
906 comment_add (c);
907 }
908 comment_line_end (0);
909 last_comment_line = newline_count;
910 return '\n';
911 }
912 }
913
914
/* Push C back, so that phase4_getc () sees it again.  The character is
   simply handed to the pushback stack of phase 3.
   Supports only one pushback character.  */
static void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
921
922
923 /* ========================== Reading of tokens. ========================== */
924
925
/* Language selection for the tokenizer.  */

/* True if ObjectiveC extensions are recognized.  */
static bool objc_extensions;

/* True if C++ extensions are recognized.  */
static bool cxx_extensions;
931
/* The kinds of tokens.  */
typedef enum token_type_ty
{
  token_type_character_constant,        /* 'x' */
  token_type_eof,
  token_type_eoln,
  token_type_hash,                      /* # */
  token_type_lparen,                    /* ( */
  token_type_rparen,                    /* ) */
  token_type_comma,                     /* , */
  token_type_colon,                     /* : */
  token_type_name,                      /* abc */
  token_type_number,                    /* 2.7 */
  token_type_string_literal,            /* "abc" */
  token_type_symbol,                    /* < > = etc. */
  token_type_objc_special,              /* @ */
  token_type_white_space
} token_type_ty;
950
951 typedef struct token_ty token_ty;
952 struct token_ty
953 {
954 token_type_ty type;
955 char *string; /* for token_type_name */
956 mixed_string_ty *mixed_string; /* for token_type_string_literal */
957 refcounted_string_list_ty *comment; /* for token_type_string_literal,
958 token_type_objc_special */
959 long number;
960 int line_number;
961 };
962
963
/* 7. Replace escape sequences within character strings with their
   single character equivalents.  This is called from phase 5, because
   we don't have to worry about the #include argument.  There are
   pathological cases which could bite us (like the DOS directory
   separator), but just pretend it can't happen.  */

/* Special return values of phase7_getc.  All of them are negative, so
   they can be told apart from single bytes and from UNICODE () values.  */
enum
{
  /* EOF has been reached.  */
  P7_EOF = -1,
  /* NOTE(review): not returned by the phase7_getc shown below; confirm
     whether it is still needed.  */
  P7_STRING_END = -2,
  /* An unescaped double-quote was read.  */
  P7_QUOTES = -3,
  /* An unescaped single-quote was read.  */
  P7_QUOTE = -4,
  /* An unescaped newline was read, i.e. the string literal or character
     constant was not terminated on its line.  */
  P7_NEWLINE = -5
};

/* Convert an UTF-16 or UTF-32 code point to a return value that can be
   distinguished from a single-byte return value.  */
#define UNICODE(code) (0x100 + (code))

/* Test whether a return value of phase7_getc designates an UTF-16 or
   UTF-32 code point.  */
#define IS_UNICODE(p7_result) ((p7_result) >= 0x100)

/* Extract the UTF-16 or UTF-32 code of a return value that satisfies
   IS_UNICODE.  */
#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
991
992
993 static int
phase7_getc()994 phase7_getc ()
995 {
996 int c, n, j;
997
998 /* Use phase 3, because phase 4 elides comments. */
999 c = phase3_getc ();
1000
1001 /* Return a magic newline indicator, so that we can distinguish
1002 between the user requesting a newline in the string (e.g. using
1003 "\n" or "\012") from the user failing to terminate the string or
1004 character constant. The ANSI C standard says: 3.1.3.4 Character
1005 Constants contain "any character except single quote, backslash or
1006 newline; or an escape sequence" and 3.1.4 String Literals contain
1007 "any character except double quote, backslash or newline; or an
1008 escape sequence".
1009
1010 Most compilers give a fatal error in this case, however gcc is
1011 stupidly silent, even though this is a very common typo. OK, so
1012 "gcc --pedantic" will tell me, but that gripes about too much other
1013 stuff. Could I have a "gcc -Wnewline-in-string" option, or
1014 better yet a "gcc -fno-newline-in-string" option, please? Gcc is
1015 also inconsistent between string literals and character constants:
1016 you may not embed newlines in character constants; try it, you get
1017 a useful diagnostic. --PMiller */
1018 if (c == '\n')
1019 return P7_NEWLINE;
1020
1021 if (c == '"')
1022 return P7_QUOTES;
1023 if (c == '\'')
1024 return P7_QUOTE;
1025 if (c != '\\')
1026 return c;
1027 c = phase3_getc ();
1028 switch (c)
1029 {
1030 default:
1031 /* Invalid escape sequences generate a GCC warning, and GCC transforms
1032 \c to the character c. So let's do the same. */
1033 case '"':
1034 case '\'':
1035 case '?':
1036 case '\\':
1037 return c;
1038
1039 case 'a':
1040 return '\a';
1041 case 'b':
1042 return '\b';
1043
1044 /* The \e escape is preculiar to gcc, and assumes an ASCII
1045 character set (or superset). We don't provide support for it
1046 here. */
1047
1048 case 'f':
1049 return '\f';
1050 case 'n':
1051 return '\n';
1052 case 'r':
1053 return '\r';
1054 case 't':
1055 return '\t';
1056 case 'v':
1057 return '\v';
1058
1059 case 'x':
1060 c = phase3_getc ();
1061 switch (c)
1062 {
1063 default:
1064 phase3_ungetc (c);
1065 phase3_ungetc ('x');
1066 return '\\';
1067
1068 case '0': case '1': case '2': case '3': case '4':
1069 case '5': case '6': case '7': case '8': case '9':
1070 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1071 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1072 break;
1073 }
1074 n = 0;
1075 for (;;)
1076 {
1077 switch (c)
1078 {
1079 default:
1080 phase3_ungetc (c);
1081 return n;
1082
1083 case '0': case '1': case '2': case '3': case '4':
1084 case '5': case '6': case '7': case '8': case '9':
1085 n = n * 16 + c - '0';
1086 break;
1087
1088 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1089 n = n * 16 + 10 + c - 'A';
1090 break;
1091
1092 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1093 n = n * 16 + 10 + c - 'a';
1094 break;
1095 }
1096 c = phase3_getc ();
1097 }
1098 return n;
1099
1100 case '0': case '1': case '2': case '3':
1101 case '4': case '5': case '6': case '7':
1102 n = 0;
1103 for (j = 0; j < 3; ++j)
1104 {
1105 n = n * 8 + c - '0';
1106 c = phase3_getc ();
1107 switch (c)
1108 {
1109 default:
1110 break;
1111
1112 case '0': case '1': case '2': case '3':
1113 case '4': case '5': case '6': case '7':
1114 continue;
1115 }
1116 break;
1117 }
1118 phase3_ungetc (c);
1119 return n;
1120
1121 case 'U': case 'u':
1122 {
1123 unsigned char buf[8];
1124
1125 n = 0;
1126 for (j = 0; j < (c == 'u' ? 4 : 8); j++)
1127 {
1128 int c1 = phase3_getc ();
1129
1130 if (c1 >= '0' && c1 <= '9')
1131 n = (n << 4) + (c1 - '0');
1132 else if (c1 >= 'A' && c1 <= 'F')
1133 n = (n << 4) + (c1 - 'A' + 10);
1134 else if (c1 >= 'a' && c1 <= 'f')
1135 n = (n << 4) + (c1 - 'a' + 10);
1136 else
1137 {
1138 phase3_ungetc (c1);
1139 while (--j >= 0)
1140 phase3_ungetc (buf[j]);
1141 phase3_ungetc (c);
1142 return '\\';
1143 }
1144
1145 buf[j] = c1;
1146 }
1147
1148 if (n < 0x110000)
1149 return UNICODE (n);
1150
1151 error_with_progname = false;
1152 error (0, 0, _("%s:%d: warning: invalid Unicode character"),
1153 logical_file_name, line_number);
1154 error_with_progname = true;
1155
1156 while (--j >= 0)
1157 phase3_ungetc (buf[j]);
1158 phase3_ungetc (c);
1159 return '\\';
1160 }
1161 }
1162 }
1163
1164
/* Push the character C back, so that the next phase7_getc call reads it
   again.  Phase 7 reads through phase 3, hence the pushback goes to
   phase 3 as well.  */
static void
phase7_ungetc (int c)
{
  phase3_ungetc (c);
}
1170
1171
1172 /* Free the memory pointed to by a 'struct token_ty'. */
1173 static inline void
free_token(token_ty * tp)1174 free_token (token_ty *tp)
1175 {
1176 if (tp->type == token_type_name)
1177 free (tp->string);
1178 if (tp->type == token_type_string_literal)
1179 mixed_string_free (tp->mixed_string);
1180 if (tp->type == token_type_string_literal
1181 || tp->type == token_type_objc_special)
1182 drop_reference (tp->comment);
1183 }
1184
1185
1186 /* 5. Parse each resulting logical line as preprocessing tokens and
1187 white space. Preprocessing tokens and C tokens don't always match. */
1188
1189 static token_ty phase5_pushback[1];
1190 static int phase5_pushback_length;
1191
1192
1193 static void
phase5_get(token_ty * tp)1194 phase5_get (token_ty *tp)
1195 {
1196 static char *buffer;
1197 static int bufmax;
1198 int bufpos;
1199 int c;
1200
1201 if (phase5_pushback_length)
1202 {
1203 *tp = phase5_pushback[--phase5_pushback_length];
1204 return;
1205 }
1206 tp->string = NULL;
1207 tp->number = 0;
1208 tp->line_number = line_number;
1209 c = phase4_getc ();
1210 switch (c)
1211 {
1212 case EOF:
1213 tp->type = token_type_eof;
1214 return;
1215
1216 case '\n':
1217 tp->type = token_type_eoln;
1218 return;
1219
1220 case ' ':
1221 case '\f':
1222 case '\t':
1223 for (;;)
1224 {
1225 c = phase4_getc ();
1226 switch (c)
1227 {
1228 case ' ':
1229 case '\f':
1230 case '\t':
1231 continue;
1232
1233 default:
1234 phase4_ungetc (c);
1235 break;
1236 }
1237 break;
1238 }
1239 tp->type = token_type_white_space;
1240 return;
1241
1242 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
1243 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
1244 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
1245 case 'V': case 'W': case 'X': case 'Y': case 'Z':
1246 case '_':
1247 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
1248 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
1249 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
1250 case 'v': case 'w': case 'x': case 'y': case 'z':
1251 bufpos = 0;
1252 for (;;)
1253 {
1254 if (bufpos >= bufmax)
1255 {
1256 bufmax = 2 * bufmax + 10;
1257 buffer = xrealloc (buffer, bufmax);
1258 }
1259 buffer[bufpos++] = c;
1260 c = phase4_getc ();
1261 switch (c)
1262 {
1263 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1264 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1265 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1266 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1267 case 'Y': case 'Z':
1268 case '_':
1269 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1270 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1271 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1272 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1273 case 'y': case 'z':
1274 case '0': case '1': case '2': case '3': case '4':
1275 case '5': case '6': case '7': case '8': case '9':
1276 continue;
1277
1278 case '"':
1279 /* Recognize C11 / C++11 string literals.
1280 See (for C) ISO 9899:2011 section 6.4.5
1281 and (for C++) ISO C++ 11 section 2.14.5 [lex.string].
1282 Note: The programmer who passes an UTF-8 encoded string to
1283 gettext() or similar API functions will have to have called
1284 bind_textdomain_codeset (DOMAIN, "UTF-8") first. */
1285 if (bufpos == 2 && buffer[0] == 'u' && buffer[1] == '8')
1286 goto string_literal;
1287 /* Recognize C++11 raw string literals.
1288 See ISO C++ 11 section 2.14.5 [lex.string].
1289 Here it is important to properly parse all cases according to
1290 the standard, otherwise our parser could get confused by
1291 double-quotes inside the raw string.
1292 Note: The programmer who passes an UTF-8 encoded string to
1293 gettext() or similar API functions will have to have called
1294 bind_textdomain_codeset (DOMAIN, "UTF-8") first. */
1295 if (cxx_extensions
1296 && (bufpos == 1
1297 || (bufpos == 2
1298 && (buffer[0] == 'u' || buffer[0] == 'U'
1299 || buffer[0] == 'L'))
1300 || (bufpos == 3 && buffer[0] == 'u' && buffer[1] == '8'))
1301 && buffer[bufpos - 1] == 'R')
1302 {
1303 /* Only R and u8R raw strings can be used as gettext()
1304 arguments, for type reasons. */
1305 const bool relevant = (bufpos != 2);
1306 int starting_line_number = line_number;
1307 bufpos = 0;
1308 /* Start the buffer with a closing parenthesis. This makes the
1309 parsing code below simpler. */
1310 buffer[bufpos++] = ')';
1311 /* Parse the initial delimiter. */
1312 for (;;)
1313 {
1314 bool valid_delimiter_char;
1315
1316 c = phase3_getc ();
1317 switch (c)
1318 {
1319 case 'A': case 'B': case 'C': case 'D': case 'E':
1320 case 'F': case 'G': case 'H': case 'I': case 'J':
1321 case 'K': case 'L': case 'M': case 'N': case 'O':
1322 case 'P': case 'Q': case 'R': case 'S': case 'T':
1323 case 'U': case 'V': case 'W': case 'X': case 'Y':
1324 case 'Z':
1325 case 'a': case 'b': case 'c': case 'd': case 'e':
1326 case 'f': case 'g': case 'h': case 'i': case 'j':
1327 case 'k': case 'l': case 'm': case 'n': case 'o':
1328 case 'p': case 'q': case 'r': case 's': case 't':
1329 case 'u': case 'v': case 'w': case 'x': case 'y':
1330 case 'z':
1331 case '0': case '1': case '2': case '3': case '4':
1332 case '5': case '6': case '7': case '8': case '9':
1333 case '_': case '{': case '}': case '[': case ']':
1334 case '#': case '<': case '>': case '%': case ':':
1335 case ';': case '.': case '?': case '*': case '+':
1336 case '-': case '/': case '^': case '&': case '|':
1337 case '~': case '!': case '=': case ',': case '\'':
1338 valid_delimiter_char = true;
1339 break;
1340 case '"':
1341 /* A double-quote within the delimiter! This is too
1342 weird. We don't support this. */
1343 error_with_progname = false;
1344 error (0, 0, _("%s:%d: warning: a double-quote in the delimiter of a raw string literal is unsupported"),
1345 logical_file_name, starting_line_number);
1346 error_with_progname = true;
1347 /* FALLTHROUGH */
1348 default:
1349 valid_delimiter_char = false;
1350 break;
1351 }
1352 if (!valid_delimiter_char)
1353 break;
1354
1355 if (bufpos >= bufmax)
1356 {
1357 bufmax = 2 * bufmax + 10;
1358 buffer = xrealloc (buffer, bufmax);
1359 }
1360 buffer[bufpos++] = c;
1361 }
1362 if (c == '(')
1363 {
1364 struct mixed_string_buffer msb;
1365 /* The state is either 0 or
1366 N, after a ')' and N-1 bytes of the delimiter have been
1367 encountered. */
1368 int state;
1369
1370 /* Start accumulating the string. */
1371 if (relevant)
1372 mixed_string_buffer_init (&msb, lc_string,
1373 logical_file_name,
1374 line_number);
1375 state = 0;
1376
1377 for (;;)
1378 {
1379 c = phase3_getc ();
1380
1381 /* Keep line_number in sync. */
1382 if (relevant)
1383 msb.line_number = line_number;
1384
1385 if (c == EOF)
1386 break;
1387
1388 /* Update the state. */
1389 if (c == (state < bufpos ? buffer[state] : '"'))
1390 {
1391 if (state < bufpos)
1392 state++;
1393 else /* state == bufpos && c == '"' */
1394 {
1395 /* Finished parsing the string. */
1396 if (relevant)
1397 {
1398 tp->type = token_type_string_literal;
1399 tp->mixed_string = mixed_string_buffer_result (&msb);
1400 tp->comment = add_reference (savable_comment);
1401 }
1402 else
1403 tp->type = token_type_symbol;
1404 return;
1405 }
1406 }
1407 else
1408 {
1409 int i;
1410
1411 /* None of the bytes buffer[0]...buffer[state-1]
1412 can be ')'. */
1413 if (relevant)
1414 for (i = 0; i < state; i++)
1415 mixed_string_buffer_append_char (&msb, buffer[i]);
1416
1417 /* But c may be ')'. */
1418 if (c == ')')
1419 state = 1;
1420 else
1421 {
1422 if (relevant)
1423 mixed_string_buffer_append_char (&msb, c);
1424 state = 0;
1425 }
1426 }
1427 }
1428 }
1429 if (c == EOF)
1430 {
1431 error_with_progname = false;
1432 error (0, 0, _("%s:%d: warning: unterminated raw string literal"),
1433 logical_file_name, starting_line_number);
1434 error_with_progname = true;
1435 tp->type = token_type_eof;
1436 return;
1437 }
1438 /* The error message for c == '"' was already emitted above. */
1439 if (c != '"')
1440 {
1441 error_with_progname = false;
1442 error (0, 0, _("%s:%d: warning: invalid raw string literal syntax"),
1443 logical_file_name, starting_line_number);
1444 error_with_progname = true;
1445 }
1446 /* To get into a sane state, read up until the next double-quote,
1447 newline, or EOF. */
1448 while (!(c == EOF || c == '"' || c == '\n'))
1449 c = phase3_getc ();
1450 tp->type = token_type_symbol;
1451 return;
1452 }
1453 /* FALLTHROUGH */
1454
1455 default:
1456 phase4_ungetc (c);
1457 break;
1458 }
1459 break;
1460 }
1461 if (bufpos >= bufmax)
1462 {
1463 bufmax = 2 * bufmax + 10;
1464 buffer = xrealloc (buffer, bufmax);
1465 }
1466 buffer[bufpos] = 0;
1467 tp->string = xstrdup (buffer);
1468 tp->type = token_type_name;
1469 return;
1470
1471 case '.':
1472 c = phase4_getc ();
1473 phase4_ungetc (c);
1474 switch (c)
1475 {
1476 default:
1477 tp->type = token_type_symbol;
1478 return;
1479
1480 case '0': case '1': case '2': case '3': case '4':
1481 case '5': case '6': case '7': case '8': case '9':
1482 c = '.';
1483 break;
1484 }
1485 /* FALLTHROUGH */
1486
1487 case '0': case '1': case '2': case '3': case '4':
1488 case '5': case '6': case '7': case '8': case '9':
1489 /* The preprocessing number token is more "generous" than the C
1490 number tokens. This is mostly due to token pasting (another
1491 thing we can ignore here). */
1492 bufpos = 0;
1493 for (;;)
1494 {
1495 if (bufpos >= bufmax)
1496 {
1497 bufmax = 2 * bufmax + 10;
1498 buffer = xrealloc (buffer, bufmax);
1499 }
1500 buffer[bufpos++] = c;
1501 c = phase4_getc ();
1502 switch (c)
1503 {
1504 case 'p':
1505 case 'P':
1506 /* In C99 and C++17, 'p' and 'P' can be used as an exponent
1507 marker. */
1508 /* FALLTHROUGH */
1509 case 'e':
1510 case 'E':
1511 if (bufpos >= bufmax)
1512 {
1513 bufmax = 2 * bufmax + 10;
1514 buffer = xrealloc (buffer, bufmax);
1515 }
1516 buffer[bufpos++] = c;
1517 c = phase4_getc ();
1518 if (c != '+' && c != '-')
1519 {
1520 phase4_ungetc (c);
1521 break;
1522 }
1523 continue;
1524
1525 case 'A': case 'B': case 'C': case 'D': case 'F':
1526 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1527 case 'M': case 'N': case 'O': case 'Q': case 'R':
1528 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1529 case 'Y': case 'Z':
1530 case 'a': case 'b': case 'c': case 'd': case 'f':
1531 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1532 case 'm': case 'n': case 'o': case 'q': case 'r':
1533 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1534 case 'y': case 'z':
1535 case '0': case '1': case '2': case '3': case '4':
1536 case '5': case '6': case '7': case '8': case '9':
1537 case '.':
1538 continue;
1539
1540 case '_':
1541 if (cxx_extensions)
1542 /* In C++, an underscore can be part of a preprocessing number
1543 token. */
1544 continue;
1545 else
1546 {
1547 phase4_ungetc (c);
1548 break;
1549 }
1550
1551 case '\'':
1552 if (cxx_extensions)
1553 {
1554 /* In C++14, a single-quote followed by a digit, ASCII letter,
1555 or underscore can be part of a preprocessing number token. */
1556 int c1 = phase4_getc ();
1557 switch (c1)
1558 {
1559 case '0': case '1': case '2': case '3': case '4':
1560 case '5': case '6': case '7': case '8': case '9':
1561 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1562 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1563 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1564 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1565 case 'Y': case 'Z':
1566 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1567 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1568 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1569 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1570 case 'y': case 'z':
1571 case '_':
1572 if (bufpos >= bufmax)
1573 {
1574 bufmax = 2 * bufmax + 10;
1575 buffer = xrealloc (buffer, bufmax);
1576 }
1577 buffer[bufpos++] = c;
1578 c = c1;
1579 continue;
1580 default:
1581 /* The two phase4_getc() calls that returned c and c1 did
1582 nothing more than to call phase3_getc(), without any
1583 lookahead. Therefore 2 pushback characters are
1584 supported in this case. */
1585 phase4_ungetc (c1);
1586 break;
1587 }
1588 }
1589 /* FALLTHROUGH */
1590 default:
1591 phase4_ungetc (c);
1592 break;
1593 }
1594 break;
1595 }
1596 if (bufpos >= bufmax)
1597 {
1598 bufmax = 2 * bufmax + 10;
1599 buffer = xrealloc (buffer, bufmax);
1600 }
1601 buffer[bufpos] = 0;
1602 tp->type = token_type_number;
1603 tp->number = atol (buffer);
1604 return;
1605
1606 case '\'':
1607 /* We could worry about the 'L' before wide character constants,
1608 but ignoring it has no effect unless one of the keywords is
1609 "L". Just pretend it won't happen. Also, we don't need to
1610 remember the character constant. */
1611 for (;;)
1612 {
1613 c = phase7_getc ();
1614 if (c == P7_NEWLINE)
1615 {
1616 error_with_progname = false;
1617 error (0, 0, _("%s:%d: warning: unterminated character constant"),
1618 logical_file_name, line_number - 1);
1619 error_with_progname = true;
1620 phase7_ungetc ('\n');
1621 break;
1622 }
1623 if (c == EOF || c == P7_QUOTE)
1624 break;
1625 }
1626 tp->type = token_type_character_constant;
1627 return;
1628
1629 case '"':
1630 string_literal:
1631 /* We could worry about the 'L' or 'u' or 'U' before wide string
1632 constants, but since gettext's argument is a 'const char *', not
1633 a 'const wchar_t *' (for 'L') nor a 'const char16_t *' (for 'u')
1634 nor a 'const char32_t *' (for 'U'), the compiler would complain
1635 about the argument not matching the prototype. Just pretend it
1636 won't happen. */
1637 {
1638 struct mixed_string_buffer msb;
1639
1640 /* Start accumulating the string. */
1641 mixed_string_buffer_init (&msb, lc_string,
1642 logical_file_name, line_number);
1643
1644 for (;;)
1645 {
1646 c = phase7_getc ();
1647
1648 /* Keep line_number in sync. */
1649 msb.line_number = line_number;
1650
1651 if (c == P7_NEWLINE)
1652 {
1653 error_with_progname = false;
1654 error (0, 0, _("%s:%d: warning: unterminated string literal"),
1655 logical_file_name, line_number - 1);
1656 error_with_progname = true;
1657 phase7_ungetc ('\n');
1658 break;
1659 }
1660 if (c == EOF || c == P7_QUOTES)
1661 break;
1662 if (c == P7_QUOTE)
1663 c = '\'';
1664 if (IS_UNICODE (c))
1665 {
1666 assert (UNICODE_VALUE (c) >= 0
1667 && UNICODE_VALUE (c) < 0x110000);
1668 mixed_string_buffer_append_unicode (&msb, UNICODE_VALUE (c));
1669 }
1670 else
1671 mixed_string_buffer_append_char (&msb, c);
1672 }
1673 tp->type = token_type_string_literal;
1674 tp->mixed_string = mixed_string_buffer_result (&msb);
1675 tp->comment = add_reference (savable_comment);
1676 return;
1677 }
1678
1679 case '(':
1680 tp->type = token_type_lparen;
1681 return;
1682
1683 case ')':
1684 tp->type = token_type_rparen;
1685 return;
1686
1687 case ',':
1688 tp->type = token_type_comma;
1689 return;
1690
1691 case '#':
1692 tp->type = token_type_hash;
1693 return;
1694
1695 case ':':
1696 tp->type = token_type_colon;
1697 return;
1698
1699 case '@':
1700 if (objc_extensions)
1701 {
1702 tp->type = token_type_objc_special;
1703 tp->comment = add_reference (savable_comment);
1704 return;
1705 }
1706 /* FALLTHROUGH */
1707
1708 default:
1709 /* We could carefully recognize each of the 2 and 3 character
1710 operators, but it is not necessary, as we only need to recognize
1711 gettext invocations. Don't bother. */
1712 tp->type = token_type_symbol;
1713 return;
1714 }
1715 }
1716
1717
1718 /* Supports only one pushback token. */
1719 static void
phase5_unget(token_ty * tp)1720 phase5_unget (token_ty *tp)
1721 {
1722 if (tp->type != token_type_eof)
1723 {
1724 if (phase5_pushback_length == SIZEOF (phase5_pushback))
1725 abort ();
1726 phase5_pushback[phase5_pushback_length++] = *tp;
1727 }
1728 }
1729
1730
1731 /* X. Recognize a leading # symbol. Leave leading hash as a hash, but
1732 turn hash in the middle of a line into a plain symbol token. This
1733 makes the phase 6 easier. */
1734
1735 static void
phaseX_get(token_ty * tp)1736 phaseX_get (token_ty *tp)
1737 {
1738 static bool middle; /* false at the beginning of a line, true otherwise. */
1739
1740 phase5_get (tp);
1741
1742 if (tp->type == token_type_eoln || tp->type == token_type_eof)
1743 middle = false;
1744 else
1745 {
1746 if (middle)
1747 {
1748 /* Turn hash in the middle of a line into a plain symbol token. */
1749 if (tp->type == token_type_hash)
1750 tp->type = token_type_symbol;
1751 }
1752 else
1753 {
1754 /* When we see leading whitespace followed by a hash sign,
1755 discard the leading white space token. The hash is all
1756 phase 6 is interested in. */
1757 if (tp->type == token_type_white_space)
1758 {
1759 token_ty next;
1760
1761 phase5_get (&next);
1762 if (next.type == token_type_hash)
1763 *tp = next;
1764 else
1765 phase5_unget (&next);
1766 }
1767 middle = true;
1768 }
1769 }
1770 }
1771
1772
1773 /* 6. Recognize and carry out directives (it also expands macros on
1774 non-directive lines, which we do not do here). The only directive
1775 we care about are the #line and #define directive. We throw all the
1776 others away. */
1777
1778 static token_ty phase6_pushback[2];
1779 static int phase6_pushback_length;
1780
1781
1782 static void
phase6_get(token_ty * tp)1783 phase6_get (token_ty *tp)
1784 {
1785 static token_ty *buf;
1786 static int bufmax;
1787 int bufpos;
1788 int j;
1789
1790 if (phase6_pushback_length)
1791 {
1792 *tp = phase6_pushback[--phase6_pushback_length];
1793 return;
1794 }
1795 for (;;)
1796 {
1797 /* Get the next token. If it is not a '#' at the beginning of a
1798 line (ignoring whitespace), return immediately. */
1799 phaseX_get (tp);
1800 if (tp->type != token_type_hash)
1801 return;
1802
1803 /* Accumulate the rest of the directive in a buffer, until the
1804 "define" keyword is seen or until end of line. */
1805 bufpos = 0;
1806 for (;;)
1807 {
1808 phaseX_get (tp);
1809 if (tp->type == token_type_eoln || tp->type == token_type_eof)
1810 break;
1811
1812 /* Before the "define" keyword and inside other directives
1813 white space is irrelevant. So just throw it away. */
1814 if (tp->type != token_type_white_space)
1815 {
1816 /* If it is a #define directive, return immediately,
1817 thus treating the body of the #define directive like
1818 normal input. */
1819 if (bufpos == 0
1820 && tp->type == token_type_name
1821 && strcmp (tp->string, "define") == 0)
1822 return;
1823
1824 /* Accumulate. */
1825 if (bufpos >= bufmax)
1826 {
1827 bufmax = 2 * bufmax + 10;
1828 buf = xrealloc (buf, bufmax * sizeof (buf[0]));
1829 }
1830 buf[bufpos++] = *tp;
1831 }
1832 }
1833
1834 /* If it is a #line directive, with no macros to expand, act on
1835 it. Ignore all other directives. */
1836 if (bufpos >= 3 && buf[0].type == token_type_name
1837 && strcmp (buf[0].string, "line") == 0
1838 && buf[1].type == token_type_number
1839 && buf[2].type == token_type_string_literal)
1840 {
1841 logical_file_name = mixed_string_contents (buf[2].mixed_string);
1842 line_number = buf[1].number;
1843 }
1844 if (bufpos >= 2 && buf[0].type == token_type_number
1845 && buf[1].type == token_type_string_literal)
1846 {
1847 logical_file_name = mixed_string_contents (buf[1].mixed_string);
1848 line_number = buf[0].number;
1849 }
1850
1851 /* Release the storage held by the directive. */
1852 for (j = 0; j < bufpos; ++j)
1853 free_token (&buf[j]);
1854
1855 /* We must reset the selected comments. */
1856 savable_comment_reset ();
1857 }
1858 }
1859
1860
1861 /* Supports 2 tokens of pushback. */
1862 static void
phase6_unget(token_ty * tp)1863 phase6_unget (token_ty *tp)
1864 {
1865 if (tp->type != token_type_eof)
1866 {
1867 if (phase6_pushback_length == SIZEOF (phase6_pushback))
1868 abort ();
1869 phase6_pushback[phase6_pushback_length++] = *tp;
1870 }
1871 }
1872
1873
/* 8a. Convert ISO C 99 section 7.8.1 format string directives to string
   literal placeholders.  */

/* Return true if NAME is the name of an ISO C 99 section 7.8.1 format
   string directive macro for the printf family, that is, if it has the
   form
     P R I { d | i | o | u | x | X }
     { { | LEAST | FAST } { 8 | 16 | 32 | 64 } | MAX | PTR }  */
static bool
is_inttypes_macro (const char *name)
{
  static const char *const widths[] = { "8", "16", "32", "64" };
  const char *rest = name;
  size_t i;

  /* The common prefix.  */
  if (strncmp (rest, "PRI", 3) != 0)
    return false;
  rest += 3;

  /* The conversion specifier.  strchr would also find the terminating
     NUL, therefore exclude the end of the name explicitly.  */
  if (*rest == '\0' || strchr ("diouxX", *rest) == NULL)
    return false;
  rest++;

  /* The two suffixes that take no width.  */
  if (strcmp (rest, "MAX") == 0 || strcmp (rest, "PTR") == 0)
    return true;

  /* An optional LEAST or FAST, followed by a mandatory width.  */
  if (strncmp (rest, "LEAST", 5) == 0)
    rest += 5;
  else if (strncmp (rest, "FAST", 4) == 0)
    rest += 4;

  for (i = 0; i < sizeof (widths) / sizeof (widths[0]); i++)
    if (strcmp (rest, widths[i]) == 0)
      return true;

  return false;
}
1915
1916 static void
phase8a_get(token_ty * tp)1917 phase8a_get (token_ty *tp)
1918 {
1919 phase6_get (tp);
1920 if (tp->type == token_type_name && is_inttypes_macro (tp->string))
1921 {
1922 /* Turn PRIdXXX into "<PRIdXXX>". */
1923 char *new_string = xasprintf ("<%s>", tp->string);
1924 free (tp->string);
1925 tp->mixed_string =
1926 mixed_string_alloc_utf8 (new_string, lc_string,
1927 logical_file_name, line_number);
1928 tp->comment = add_reference (savable_comment);
1929 tp->type = token_type_string_literal;
1930 }
1931 }
1932
1933 /* Supports 2 tokens of pushback. */
1934 static inline void
phase8a_unget(token_ty * tp)1935 phase8a_unget (token_ty *tp)
1936 {
1937 phase6_unget (tp);
1938 }
1939
1940
1941 /* 8b. Drop whitespace. */
1942 static void
phase8b_get(token_ty * tp)1943 phase8b_get (token_ty *tp)
1944 {
1945 for (;;)
1946 {
1947 phase8a_get (tp);
1948
1949 if (tp->type == token_type_white_space)
1950 continue;
1951 if (tp->type == token_type_eoln)
1952 {
1953 /* We have to track the last occurrence of a string. One
1954 mode of xgettext allows to group an extracted message
1955 with a comment for documentation. The rule which states
1956 which comment is assumed to be grouped with the message
1957 says it should immediately precede it. Our
1958 interpretation: between the last line of the comment and
1959 the line in which the keyword is found must be no line
1960 with non-white space tokens. */
1961 ++newline_count;
1962 if (last_non_comment_line > last_comment_line)
1963 savable_comment_reset ();
1964 continue;
1965 }
1966 break;
1967 }
1968 }
1969
1970 /* Supports 2 tokens of pushback. */
1971 static inline void
phase8b_unget(token_ty * tp)1972 phase8b_unget (token_ty *tp)
1973 {
1974 phase8a_unget (tp);
1975 }
1976
1977
1978 /* 8c. In ObjectiveC mode, drop '@' before a literal string. We need to
1979 do this before performing concatenation of adjacent string literals. */
1980 static void
phase8c_get(token_ty * tp)1981 phase8c_get (token_ty *tp)
1982 {
1983 token_ty tmp;
1984
1985 phase8b_get (tp);
1986 if (tp->type != token_type_objc_special)
1987 return;
1988 phase8b_get (&tmp);
1989 if (tmp.type != token_type_string_literal)
1990 {
1991 phase8b_unget (&tmp);
1992 return;
1993 }
1994 /* Drop the '@' token and return immediately the following string. */
1995 drop_reference (tmp.comment);
1996 tmp.comment = tp->comment;
1997 *tp = tmp;
1998 }
1999
2000 /* Supports only one pushback token. */
2001 static inline void
phase8c_unget(token_ty * tp)2002 phase8c_unget (token_ty *tp)
2003 {
2004 phase8b_unget (tp);
2005 }
2006
2007
2008 /* 8. Concatenate adjacent string literals to form single string
2009 literals (because we don't expand macros, there are a few things we
2010 will miss). */
2011
2012 static void
phase8_get(token_ty * tp)2013 phase8_get (token_ty *tp)
2014 {
2015 phase8c_get (tp);
2016 if (tp->type != token_type_string_literal)
2017 return;
2018 for (;;)
2019 {
2020 token_ty tmp;
2021
2022 phase8c_get (&tmp);
2023 if (tmp.type != token_type_string_literal)
2024 {
2025 phase8c_unget (&tmp);
2026 return;
2027 }
2028 tp->mixed_string =
2029 mixed_string_concat_free1 (tp->mixed_string, tmp.mixed_string);
2030 free_token (&tmp);
2031 }
2032 }
2033
2034
2035 /* ===================== Reading of high-level tokens. ==================== */
2036
2037
/* The kinds of high-level tokens delivered to the extractor.  */
typedef enum xgettext_token_type_ty
{
  xgettext_token_type_eof,              /* end of input */
  xgettext_token_type_keyword,          /* a name found in the keyword table */
  xgettext_token_type_symbol,           /* any other name */
  xgettext_token_type_lparen,           /* ( */
  xgettext_token_type_rparen,           /* ) */
  xgettext_token_type_comma,            /* , */
  xgettext_token_type_colon,            /* : */
  xgettext_token_type_string_literal,   /* "abc" */
  xgettext_token_type_other             /* anything else */
} xgettext_token_type_ty;
2051
2052 typedef struct xgettext_token_ty xgettext_token_ty;
2053 struct xgettext_token_ty
2054 {
2055 xgettext_token_type_ty type;
2056
2057 /* This field is used only for xgettext_token_type_keyword. */
2058 const struct callshapes *shapes;
2059
2060 /* This field is used only for xgettext_token_type_keyword,
2061 xgettext_token_type_symbol. */
2062 char *string;
2063
2064 /* This field is used only for xgettext_token_type_string_literal. */
2065 mixed_string_ty *mixed_string;
2066
2067 /* This field is used only for xgettext_token_type_string_literal. */
2068 refcounted_string_list_ty *comment;
2069
2070 /* This field is used only for xgettext_token_type_keyword,
2071 xgettext_token_type_string_literal. */
2072 lex_pos_ty pos;
2073 };
2074
2075
/* 9. Convert the remaining preprocessing tokens to C tokens and
   discard any white space from the translation unit.  */
2078
2079 static void
x_c_lex(xgettext_token_ty * tp)2080 x_c_lex (xgettext_token_ty *tp)
2081 {
2082 for (;;)
2083 {
2084 token_ty token;
2085 void *keyword_value;
2086
2087 phase8_get (&token);
2088 switch (token.type)
2089 {
2090 case token_type_eof:
2091 tp->type = xgettext_token_type_eof;
2092 return;
2093
2094 case token_type_name:
2095 last_non_comment_line = newline_count;
2096
2097 if (hash_find_entry (objc_extensions ? &objc_keywords : &c_keywords,
2098 token.string, strlen (token.string),
2099 &keyword_value)
2100 == 0)
2101 {
2102 tp->type = xgettext_token_type_keyword;
2103 tp->shapes = (const struct callshapes *) keyword_value;
2104 tp->pos.file_name = logical_file_name;
2105 tp->pos.line_number = token.line_number;
2106 }
2107 else
2108 tp->type = xgettext_token_type_symbol;
2109 tp->string = token.string;
2110 return;
2111
2112 case token_type_lparen:
2113 last_non_comment_line = newline_count;
2114
2115 tp->type = xgettext_token_type_lparen;
2116 return;
2117
2118 case token_type_rparen:
2119 last_non_comment_line = newline_count;
2120
2121 tp->type = xgettext_token_type_rparen;
2122 return;
2123
2124 case token_type_comma:
2125 last_non_comment_line = newline_count;
2126
2127 tp->type = xgettext_token_type_comma;
2128 return;
2129
2130 case token_type_colon:
2131 last_non_comment_line = newline_count;
2132
2133 tp->type = xgettext_token_type_colon;
2134 return;
2135
2136 case token_type_string_literal:
2137 last_non_comment_line = newline_count;
2138
2139 tp->type = xgettext_token_type_string_literal;
2140 tp->mixed_string = token.mixed_string;
2141 tp->comment = token.comment;
2142 tp->pos.file_name = logical_file_name;
2143 tp->pos.line_number = token.line_number;
2144 return;
2145
2146 case token_type_objc_special:
2147 drop_reference (token.comment);
2148 /* FALLTHROUGH */
2149
2150 default:
2151 last_non_comment_line = newline_count;
2152
2153 tp->type = xgettext_token_type_other;
2154 return;
2155 }
2156 }
2157 }
2158
2159
2160 /* ========================= Extracting strings. ========================== */
2161
2162
/* Context lookup table.
   Set by extract_whole_file from its FLAG_TABLE argument before scanning
   starts; extract_parenthesized looks up every keyword and symbol name
   in it through flag_context_list_table_lookup.  */
static flag_context_list_table_ty *flag_context_list_table;
2165
2166
2167 /* The file is broken into tokens. Scan the token stream, looking for
2168 a keyword, followed by a left paren, followed by a string. When we
2169 see this sequence, we have something to remember. We assume we are
2170 looking at a valid C or C++ program, and leave the complaints about
2171 the grammar to the compiler.
2172
2173 Normal handling: Look for
2174 keyword ( ... msgid ... )
2175 Plural handling: Look for
2176 keyword ( ... msgid ... msgid_plural ... )
2177
2178 We use recursion because the arguments before msgid or between msgid
2179 and msgid_plural can contain subexpressions of the same form. */
2180
2181
2182 /* Extract messages until the next balanced closing parenthesis.
2183 Extracted messages are added to MLP.
2184 Return true upon eof, false upon closing parenthesis. */
2185 static bool
extract_parenthesized(message_list_ty * mlp,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)2186 extract_parenthesized (message_list_ty *mlp,
2187 flag_context_ty outer_context,
2188 flag_context_list_iterator_ty context_iter,
2189 struct arglist_parser *argparser)
2190 {
2191 /* Current argument number. */
2192 int arg = 1;
2193 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
2194 int state;
2195 /* Parameters of the keyword just seen. Defined only in state 1. */
2196 const struct callshapes *next_shapes = NULL;
2197 /* Context iterator that will be used if the next token is a '('. */
2198 flag_context_list_iterator_ty next_context_iter =
2199 passthrough_context_list_iterator;
2200 /* Context iterator that will be used if the next token is a ':'.
2201 (Objective C selector syntax.) */
2202 flag_context_list_iterator_ty selectorcall_context_iter =
2203 passthrough_context_list_iterator;
2204 /* Current context. */
2205 flag_context_ty inner_context =
2206 inherited_context (outer_context,
2207 flag_context_list_iterator_advance (&context_iter));
2208
2209 /* Start state is 0. */
2210 state = 0;
2211
2212 for (;;)
2213 {
2214 xgettext_token_ty token;
2215
2216 x_c_lex (&token);
2217 switch (token.type)
2218 {
2219 case xgettext_token_type_keyword:
2220 next_shapes = token.shapes;
2221 state = 1;
2222 goto keyword_or_symbol;
2223
2224 case xgettext_token_type_symbol:
2225 state = 0;
2226 keyword_or_symbol:
2227 next_context_iter =
2228 flag_context_list_iterator (
2229 flag_context_list_table_lookup (
2230 flag_context_list_table,
2231 token.string, strlen (token.string)));
2232 if (objc_extensions)
2233 {
2234 size_t token_string_len = strlen (token.string);
2235 token.string = xrealloc (token.string, token_string_len + 2);
2236 token.string[token_string_len] = ':';
2237 token.string[token_string_len + 1] = '\0';
2238 selectorcall_context_iter =
2239 flag_context_list_iterator (
2240 flag_context_list_table_lookup (
2241 flag_context_list_table,
2242 token.string, token_string_len + 1));
2243 }
2244 free (token.string);
2245 continue;
2246
2247 case xgettext_token_type_lparen:
2248 if (extract_parenthesized (mlp, inner_context, next_context_iter,
2249 arglist_parser_alloc (mlp,
2250 state ? next_shapes : NULL)))
2251 {
2252 arglist_parser_done (argparser, arg);
2253 return true;
2254 }
2255 next_context_iter = null_context_list_iterator;
2256 selectorcall_context_iter = null_context_list_iterator;
2257 state = 0;
2258 continue;
2259
2260 case xgettext_token_type_rparen:
2261 arglist_parser_done (argparser, arg);
2262 return false;
2263
2264 case xgettext_token_type_comma:
2265 arg++;
2266 inner_context =
2267 inherited_context (outer_context,
2268 flag_context_list_iterator_advance (
2269 &context_iter));
2270 next_context_iter = passthrough_context_list_iterator;
2271 selectorcall_context_iter = passthrough_context_list_iterator;
2272 state = 0;
2273 continue;
2274
2275 case xgettext_token_type_colon:
2276 if (objc_extensions)
2277 {
2278 context_iter = selectorcall_context_iter;
2279 inner_context =
2280 inherited_context (inner_context,
2281 flag_context_list_iterator_advance (
2282 &context_iter));
2283 next_context_iter = passthrough_context_list_iterator;
2284 selectorcall_context_iter = passthrough_context_list_iterator;
2285 }
2286 else
2287 {
2288 next_context_iter = null_context_list_iterator;
2289 selectorcall_context_iter = null_context_list_iterator;
2290 }
2291 state = 0;
2292 continue;
2293
2294 case xgettext_token_type_string_literal:
2295 {
2296 if (extract_all)
2297 {
2298 char *string = mixed_string_contents (token.mixed_string);
2299 mixed_string_free (token.mixed_string);
2300 remember_a_message (mlp, NULL, string, true, false,
2301 inner_context, &token.pos,
2302 NULL, token.comment, false);
2303 }
2304 else
2305 arglist_parser_remember (argparser, arg, token.mixed_string,
2306 inner_context,
2307 token.pos.file_name,
2308 token.pos.line_number,
2309 token.comment, false);
2310 drop_reference (token.comment);
2311 }
2312 next_context_iter = null_context_list_iterator;
2313 selectorcall_context_iter = null_context_list_iterator;
2314 state = 0;
2315 continue;
2316
2317 case xgettext_token_type_other:
2318 next_context_iter = null_context_list_iterator;
2319 selectorcall_context_iter = null_context_list_iterator;
2320 state = 0;
2321 continue;
2322
2323 case xgettext_token_type_eof:
2324 arglist_parser_done (argparser, arg);
2325 return true;
2326
2327 default:
2328 abort ();
2329 }
2330 }
2331 }
2332
2333
2334 static void
extract_whole_file(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)2335 extract_whole_file (FILE *f,
2336 const char *real_filename, const char *logical_filename,
2337 flag_context_list_table_ty *flag_table,
2338 msgdomain_list_ty *mdlp)
2339 {
2340 message_list_ty *mlp = mdlp->item[0]->messages;
2341
2342 fp = f;
2343 real_file_name = real_filename;
2344 logical_file_name = xstrdup (logical_filename);
2345 line_number = 1;
2346
2347 phase1_pushback_length = 0;
2348 phase2_pushback_length = 0;
2349 phase3_pushback_length = 0;
2350
2351 last_comment_line = -1;
2352 last_non_comment_line = -1;
2353 newline_count = 0;
2354
2355 phase5_pushback_length = 0;
2356 phase6_pushback_length = 0;
2357
2358 flag_context_list_table = flag_table;
2359
2360 init_keywords ();
2361
2362 /* Eat tokens until eof is seen. When extract_parenthesized returns
2363 due to an unbalanced closing parenthesis, just restart it. */
2364 while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
2365 arglist_parser_alloc (mlp, NULL)))
2366 ;
2367
2368 /* Close scanner. */
2369 fp = NULL;
2370 real_file_name = NULL;
2371 logical_file_name = NULL;
2372 line_number = 0;
2373 }
2374
2375
2376 void
extract_c(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)2377 extract_c (FILE *f,
2378 const char *real_filename, const char *logical_filename,
2379 flag_context_list_table_ty *flag_table,
2380 msgdomain_list_ty *mdlp)
2381 {
2382 objc_extensions = false;
2383 cxx_extensions = false;
2384 extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);
2385 }
2386
2387 void
extract_cxx(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)2388 extract_cxx (FILE *f,
2389 const char *real_filename, const char *logical_filename,
2390 flag_context_list_table_ty *flag_table,
2391 msgdomain_list_ty *mdlp)
2392 {
2393 objc_extensions = false;
2394 cxx_extensions = true;
2395 extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);
2396 }
2397
2398 void
extract_objc(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)2399 extract_objc (FILE *f,
2400 const char *real_filename, const char *logical_filename,
2401 flag_context_list_table_ty *flag_table,
2402 msgdomain_list_ty *mdlp)
2403 {
2404 objc_extensions = true;
2405 cxx_extensions = false;
2406 extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);
2407 }
2408