1 /* GNU gettext - internationalization aids
2 Copyright (C) 1995-1999, 2000-2006 Free Software Foundation, Inc.
3
4 This file was written by Peter Miller <millerp@canb.auug.org.au>.
5 Multibyte character handling by Bruno Haible <haible@clisp.cons.org>.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software Foundation,
19 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20
21
22 #ifdef HAVE_CONFIG_H
23 # include "config.h"
24 #endif
25
26 /* Specification. */
27 #include "po-lex.h"
28
29 #include <errno.h>
30 #include <limits.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <stdarg.h>
35
36 #if HAVE_ICONV
37 # include <iconv.h>
38 #endif
39
40 #include "c-ctype.h"
41 #include "linebreak.h"
42 #include "vasprintf.h"
43 #include "gettext.h"
44 #include "po-charset.h"
45 #include "xalloc.h"
46 #include "exit.h"
47 #include "error.h"
48 #include "error-progname.h"
49 #include "xvasprintf.h"
50 #include "po-error.h"
51 #include "po-xerror.h"
52 #include "pos.h"
53 #include "message.h"
54 #include "str-list.h"
55 #include "po-gram-gen2.h"
56
57 #define _(str) gettext(str)
58
59 #if HAVE_ICONV
60 # include "utf8-ucs4.h"
61 #endif
62
63 #if HAVE_DECL_GETC_UNLOCKED
64 # undef getc
65 # define getc getc_unlocked
66 #endif
67
68
/* Current position within the PO file.
   gram_pos carries the file name and line number used in error messages;
   gram_pos_column is the column within the current line.  It is kept
   zero-based here: po_gram_error adds 1 when reporting it.  Both are
   reset by lex_start and maintained by lex_getc / lex_ungetc.  */
lex_pos_ty gram_pos;
int gram_pos_column;
72
73
74 /* Error handling during the parsing of a PO file.
75 These functions can access gram_pos and gram_pos_column. */
76
77 /* VARARGS1 */
78 void
po_gram_error(const char * fmt,...)79 po_gram_error (const char *fmt, ...)
80 {
81 va_list ap;
82 char *buffer;
83
84 va_start (ap, fmt);
85 if (vasprintf (&buffer, fmt, ap) < 0)
86 error (EXIT_FAILURE, 0, _("memory exhausted"));
87 va_end (ap);
88 po_xerror (PO_SEVERITY_ERROR, NULL, gram_pos.file_name, gram_pos.line_number,
89 gram_pos_column + 1, false, buffer);
90 free (buffer);
91
92 if (error_message_count >= gram_max_allowed_errors)
93 po_error (EXIT_FAILURE, 0, _("too many errors, aborting"));
94 }
95
96 /* VARARGS2 */
97 void
po_gram_error_at_line(const lex_pos_ty * pp,const char * fmt,...)98 po_gram_error_at_line (const lex_pos_ty *pp, const char *fmt, ...)
99 {
100 va_list ap;
101 char *buffer;
102
103 va_start (ap, fmt);
104 if (vasprintf (&buffer, fmt, ap) < 0)
105 error (EXIT_FAILURE, 0, _("memory exhausted"));
106 va_end (ap);
107 po_xerror (PO_SEVERITY_ERROR, NULL, pp->file_name, pp->line_number,
108 (size_t)(-1), false, buffer);
109 free (buffer);
110
111 if (error_message_count >= gram_max_allowed_errors)
112 po_error (EXIT_FAILURE, 0, _("too many errors, aborting"));
113 }
114
115
116 /* The lowest level of PO file parsing converts bytes to multibyte characters.
117 This is needed
118 1. for C compatibility: ISO C 99 section 5.1.1.2 says that the first
119 translation phase maps bytes to characters.
120 2. to keep track of the current column, for the sake of precise error
121 location. Emacs compile.el interprets the column in error messages
122 by default as a screen column number, not as character number.
123 3. to avoid skipping backslash-newline in the midst of a multibyte
124 character. If XY is a multibyte character, X \ newline Y is invalid.
125 */
126
/* Multibyte character data type.  */
/* Note this depends on po_lex_charset and po_lex_iconv, which get set
   while the file is being parsed.  */

/* Capacity, in bytes, of the buffer that holds one multibyte character
   (also the size of the read-ahead buffer in struct mbfile).  */
#define MBCHAR_BUF_SIZE 24

/* One character as read from the input: its raw bytes and, when iconv is
   available and the conversion succeeded, its Unicode code point.  */
struct mbchar
{
  size_t bytes;		/* number of bytes of current character, > 0;
			   the value 0 is reserved to represent EOF */
#if HAVE_ICONV
  bool uc_valid;	/* true if uc is a valid Unicode character */
  unsigned int uc;	/* if uc_valid: the current character */
#endif
  char buf[MBCHAR_BUF_SIZE]; /* room for the bytes */
};

/* We want to pass multibyte characters by reference automatically,
   therefore we use an array type.  */
typedef struct mbchar mbchar_t[1];
146
/* Copy N bytes from SRC to DST, one byte at a time.
   Intended for the very short copies (typically n <= 1) done per
   character by the lexer, where calling memcpy is not worthwhile.
   Bytes are copied in ascending address order.  */
static inline void
memcpy_small (void *dst, const void *src, size_t n)
{
  char *out = (char *) dst;
  const char *in = (const char *) src;

  while (n > 0)
    {
      *out++ = *in++;
      n--;
    }
}
161
162 /* EOF (not a real character) is represented with bytes = 0 and
163 uc_valid = false. */
164 static inline bool
mb_iseof(const mbchar_t mbc)165 mb_iseof (const mbchar_t mbc)
166 {
167 return (mbc->bytes == 0);
168 }
169
170 /* Access the current character. */
171 static inline const char *
mb_ptr(const mbchar_t mbc)172 mb_ptr (const mbchar_t mbc)
173 {
174 return mbc->buf;
175 }
176 static inline size_t
mb_len(const mbchar_t mbc)177 mb_len (const mbchar_t mbc)
178 {
179 return mbc->bytes;
180 }
181
182 /* Comparison of characters. */
183
184 static inline bool
mb_iseq(const mbchar_t mbc,char sc)185 mb_iseq (const mbchar_t mbc, char sc)
186 {
187 /* Note: It is wrong to compare only mbc->uc, because when the encoding is
188 SHIFT_JIS, mbc->buf[0] == '\\' corresponds to mbc->uc == 0x00A5, but we
189 want to treat it as an escape character, although it looks like a Yen
190 sign. */
191 #if HAVE_ICONV && 0
192 if (mbc->uc_valid)
193 return (mbc->uc == sc); /* wrong! */
194 else
195 #endif
196 return (mbc->bytes == 1 && mbc->buf[0] == sc);
197 }
198
199 static inline bool
mb_isnul(const mbchar_t mbc)200 mb_isnul (const mbchar_t mbc)
201 {
202 #if HAVE_ICONV
203 if (mbc->uc_valid)
204 return (mbc->uc == 0);
205 else
206 #endif
207 return (mbc->bytes == 1 && mbc->buf[0] == 0);
208 }
209
210 static inline int
mb_cmp(const mbchar_t mbc1,const mbchar_t mbc2)211 mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2)
212 {
213 #if HAVE_ICONV
214 if (mbc1->uc_valid && mbc2->uc_valid)
215 return (int) mbc1->uc - (int) mbc2->uc;
216 else
217 #endif
218 return (mbc1->bytes == mbc2->bytes
219 ? memcmp (mbc1->buf, mbc2->buf, mbc1->bytes)
220 : mbc1->bytes < mbc2->bytes
221 ? (memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) > 0 ? 1 : -1)
222 : (memcmp (mbc1->buf, mbc2->buf, mbc2->bytes) >= 0 ? 1 : -1));
223 }
224
225 static inline bool
mb_equal(const mbchar_t mbc1,const mbchar_t mbc2)226 mb_equal (const mbchar_t mbc1, const mbchar_t mbc2)
227 {
228 #if HAVE_ICONV
229 if (mbc1->uc_valid && mbc2->uc_valid)
230 return mbc1->uc == mbc2->uc;
231 else
232 #endif
233 return (mbc1->bytes == mbc2->bytes
234 && memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) == 0);
235 }
236
237 /* <ctype.h>, <wctype.h> classification. */
238
239 static inline bool
mb_isascii(const mbchar_t mbc)240 mb_isascii (const mbchar_t mbc)
241 {
242 #if HAVE_ICONV
243 if (mbc->uc_valid)
244 return (mbc->uc >= 0x0000 && mbc->uc <= 0x007F);
245 else
246 #endif
247 return mbc->bytes == 1 && (mbc->buf[0] & 0x80) == 0;
248 }
249
250 /* Extra <wchar.h> function. */
251
252 /* Unprintable characters appear as a small box of width 1. */
253 #define MB_UNPRINTABLE_WIDTH 1
254
255 static int
mb_width(const mbchar_t mbc)256 mb_width (const mbchar_t mbc)
257 {
258 #if HAVE_ICONV
259 if (mbc->uc_valid)
260 {
261 unsigned int uc = mbc->uc;
262 const char *encoding =
263 (po_lex_iconv != (iconv_t)(-1) ? po_lex_charset : "");
264 int w = uc_width (uc, encoding);
265 /* For unprintable characters, arbitrarily return 0 for control
266 characters (except tab) and MB_UNPRINTABLE_WIDTH otherwise. */
267 if (w >= 0)
268 return w;
269 if (uc >= 0x0000 && uc <= 0x001F)
270 {
271 if (uc == 0x0009)
272 return 8 - (gram_pos_column & 7);
273 return 0;
274 }
275 if ((uc >= 0x007F && uc <= 0x009F) || (uc >= 0x2028 && uc <= 0x2029))
276 return 0;
277 return MB_UNPRINTABLE_WIDTH;
278 }
279 else
280 #endif
281 {
282 if (mbc->bytes == 1)
283 {
284 if (
285 #if CHAR_MIN < 0x00 /* to avoid gcc warning */
286 mbc->buf[0] >= 0x00 &&
287 #endif
288 mbc->buf[0] <= 0x1F)
289 {
290 if (mbc->buf[0] == 0x09)
291 return 8 - (gram_pos_column & 7);
292 return 0;
293 }
294 if (mbc->buf[0] == 0x7F)
295 return 0;
296 }
297 return MB_UNPRINTABLE_WIDTH;
298 }
299 }
300
301 /* Output. */
302 static inline void
mb_putc(const mbchar_t mbc,FILE * stream)303 mb_putc (const mbchar_t mbc, FILE *stream)
304 {
305 fwrite (mbc->buf, 1, mbc->bytes, stream);
306 }
307
308 /* Assignment. */
309 static inline void
mb_setascii(mbchar_t mbc,char sc)310 mb_setascii (mbchar_t mbc, char sc)
311 {
312 mbc->bytes = 1;
313 #if HAVE_ICONV
314 mbc->uc_valid = 1;
315 mbc->uc = sc;
316 #endif
317 mbc->buf[0] = sc;
318 }
319
320 /* Copying a character. */
321 static inline void
mb_copy(mbchar_t new,const mbchar_t old)322 mb_copy (mbchar_t new, const mbchar_t old)
323 {
324 memcpy_small (&new->buf[0], &old->buf[0], old->bytes);
325 new->bytes = old->bytes;
326 #if HAVE_ICONV
327 if ((new->uc_valid = old->uc_valid))
328 new->uc = old->uc;
329 #endif
330 }
331
332
333 /* Multibyte character input. */
334
/* Number of characters that can be pushed back.
   We need 1 for lex_getc, plus 1 for lex_ungetc.  */
#define NPUSHBACK 2

/* Data type of a multibyte character input stream.  */
struct mbfile
{
  FILE *fp;			/* underlying byte stream */
  bool eof_seen;		/* set once getc on fp has returned EOF */
  int have_pushback;		/* number of valid entries in pushback[] */
  unsigned int bufcount;	/* number of bytes currently held in buf[] */
  char buf[MBCHAR_BUF_SIZE];	/* bytes read from fp but not yet returned */
  struct mbchar pushback[NPUSHBACK]; /* pushed-back characters, used as a
					stack: the last one pushed is the
					first one returned */
};

/* We want to pass multibyte streams by reference automatically,
   therefore we use an array type.  */
typedef struct mbfile mbfile_t[1];

/* Whether invalid multibyte sequences in the input shall be signalled
   or silently tolerated.  lex_start turns this on; po_gram_lex turns it
   off while it reads a comment.  */
static bool signal_eilseq;
357
358 static inline void
mbfile_init(mbfile_t mbf,FILE * stream)359 mbfile_init (mbfile_t mbf, FILE *stream)
360 {
361 mbf->fp = stream;
362 mbf->eof_seen = false;
363 mbf->have_pushback = 0;
364 mbf->bufcount = 0;
365 }
366
367 /* Read the next multibyte character from mbf and put it into mbc.
368 If a read error occurs, errno is set and ferror (mbf->fp) becomes true. */
369 static void
mbfile_getc(mbchar_t mbc,mbfile_t mbf)370 mbfile_getc (mbchar_t mbc, mbfile_t mbf)
371 {
372 size_t bytes;
373
374 /* If EOF has already been seen, don't use getc. This matters if
375 mbf->fp is connected to an interactive tty. */
376 if (mbf->eof_seen)
377 goto eof;
378
379 /* Return character pushed back, if there is one. */
380 if (mbf->have_pushback > 0)
381 {
382 mbf->have_pushback--;
383 mb_copy (mbc, &mbf->pushback[mbf->have_pushback]);
384 return;
385 }
386
387 /* Before using iconv, we need at least one byte. */
388 if (mbf->bufcount == 0)
389 {
390 int c = getc (mbf->fp);
391 if (c == EOF)
392 {
393 mbf->eof_seen = true;
394 goto eof;
395 }
396 mbf->buf[0] = (unsigned char) c;
397 mbf->bufcount++;
398 }
399
400 #if HAVE_ICONV
401 if (po_lex_iconv != (iconv_t)(-1))
402 {
403 /* Use iconv on an increasing number of bytes. Read only as many
404 bytes from mbf->fp as needed. This is needed to give reasonable
405 interactive behaviour when mbf->fp is connected to an interactive
406 tty. */
407 for (;;)
408 {
409 unsigned char scratchbuf[64];
410 const char *inptr = &mbf->buf[0];
411 size_t insize = mbf->bufcount;
412 char *outptr = (char *) &scratchbuf[0];
413 size_t outsize = sizeof (scratchbuf);
414
415 size_t res = iconv (po_lex_iconv,
416 (ICONV_CONST char **) &inptr, &insize,
417 &outptr, &outsize);
418 /* We expect that a character has been produced if and only if
419 some input bytes have been consumed. */
420 if ((insize < mbf->bufcount) != (outsize < sizeof (scratchbuf)))
421 abort ();
422 if (outsize == sizeof (scratchbuf))
423 {
424 /* No character has been produced. Must be an error. */
425 if (res != (size_t)(-1))
426 abort ();
427
428 if (errno == EILSEQ)
429 {
430 /* An invalid multibyte sequence was encountered. */
431 /* Return a single byte. */
432 if (signal_eilseq)
433 po_gram_error (_("invalid multibyte sequence"));
434 bytes = 1;
435 mbc->uc_valid = false;
436 break;
437 }
438 else if (errno == EINVAL)
439 {
440 /* An incomplete multibyte character. */
441 int c;
442
443 if (mbf->bufcount == MBCHAR_BUF_SIZE)
444 {
445 /* An overlong incomplete multibyte sequence was
446 encountered. */
447 /* Return a single byte. */
448 bytes = 1;
449 mbc->uc_valid = false;
450 break;
451 }
452
453 /* Read one more byte and retry iconv. */
454 c = getc (mbf->fp);
455 if (c == EOF)
456 {
457 mbf->eof_seen = true;
458 if (ferror (mbf->fp))
459 goto eof;
460 if (signal_eilseq)
461 po_gram_error (_("\
462 incomplete multibyte sequence at end of file"));
463 bytes = mbf->bufcount;
464 mbc->uc_valid = false;
465 break;
466 }
467 mbf->buf[mbf->bufcount++] = (unsigned char) c;
468 if (c == '\n')
469 {
470 if (signal_eilseq)
471 po_gram_error (_("\
472 incomplete multibyte sequence at end of line"));
473 bytes = mbf->bufcount - 1;
474 mbc->uc_valid = false;
475 break;
476 }
477 }
478 else
479 {
480 const char *errno_description = strerror (errno);
481 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
482 xasprintf ("%s: %s",
483 _("iconv failure"),
484 errno_description));
485 }
486 }
487 else
488 {
489 size_t outbytes = sizeof (scratchbuf) - outsize;
490 bytes = mbf->bufcount - insize;
491
492 /* We expect that one character has been produced. */
493 if (bytes == 0)
494 abort ();
495 if (outbytes == 0)
496 abort ();
497 /* Convert it from UTF-8 to UCS-4. */
498 if (u8_mbtouc (&mbc->uc, scratchbuf, outbytes) < outbytes)
499 {
500 /* scratchbuf contains an out-of-range Unicode character
501 (> 0x10ffff). */
502 if (signal_eilseq)
503 po_gram_error (_("invalid multibyte sequence"));
504 mbc->uc_valid = false;
505 break;
506 }
507 mbc->uc_valid = true;
508 break;
509 }
510 }
511 }
512 else
513 #endif
514 {
515 if (po_lex_weird_cjk
516 /* Special handling of encodings with CJK structure. */
517 && (unsigned char) mbf->buf[0] >= 0x80)
518 {
519 if (mbf->bufcount == 1)
520 {
521 /* Read one more byte. */
522 int c = getc (mbf->fp);
523 if (c == EOF)
524 {
525 if (ferror (mbf->fp))
526 {
527 mbf->eof_seen = true;
528 goto eof;
529 }
530 }
531 else
532 {
533 mbf->buf[1] = (unsigned char) c;
534 mbf->bufcount++;
535 }
536 }
537 if (mbf->bufcount >= 2 && (unsigned char) mbf->buf[1] >= 0x30)
538 /* Return a double byte. */
539 bytes = 2;
540 else
541 /* Return a single byte. */
542 bytes = 1;
543 }
544 else
545 {
546 /* Return a single byte. */
547 bytes = 1;
548 }
549 #if HAVE_ICONV
550 mbc->uc_valid = false;
551 #endif
552 }
553
554 /* Return the multibyte sequence mbf->buf[0..bytes-1]. */
555 memcpy_small (&mbc->buf[0], &mbf->buf[0], bytes);
556 mbc->bytes = bytes;
557
558 mbf->bufcount -= bytes;
559 if (mbf->bufcount > 0)
560 {
561 /* It's not worth calling memmove() for so few bytes. */
562 unsigned int count = mbf->bufcount;
563 char *p = &mbf->buf[0];
564
565 do
566 {
567 *p = *(p + bytes);
568 p++;
569 }
570 while (--count > 0);
571 }
572 return;
573
574 eof:
575 /* An mbchar_t with bytes == 0 is used to indicate EOF. */
576 mbc->bytes = 0;
577 #if HAVE_ICONV
578 mbc->uc_valid = false;
579 #endif
580 return;
581 }
582
583 static void
mbfile_ungetc(const mbchar_t mbc,mbfile_t mbf)584 mbfile_ungetc (const mbchar_t mbc, mbfile_t mbf)
585 {
586 if (mbf->have_pushback >= NPUSHBACK)
587 abort ();
588 mb_copy (&mbf->pushback[mbf->have_pushback], mbc);
589 mbf->have_pushback++;
590 }
591
592
/* Lexer variables.  */

/* The input stream being tokenized; initialized by lex_start.  */
static mbfile_t mbf;
/* Once error_message_count reaches this value, po_gram_error and
   po_gram_error_at_line report "too many errors".  */
unsigned int gram_max_allowed_errors = 20;
/* True after a "#~" prefix has been seen; cleared at end of line.  */
static bool po_lex_obsolete;
/* True after a "#|" or "#~|" prefix has been seen; cleared at end of
   line.  Changes the meaning of keywords and strings.  */
static bool po_lex_previous;
/* Whether po_gram_lex returns comments as COMMENT tokens; set by
   po_lex_pass_comments.  */
static bool pass_comments = false;
/* Set by po_lex_pass_obsolete_entries.  Not read in this file, so
   presumably consulted by the parser -- TODO confirm.  */
bool pass_obsolete_entries = false;
601
602
603 /* Prepare lexical analysis. */
604 void
lex_start(FILE * fp,const char * real_filename,const char * logical_filename)605 lex_start (FILE *fp, const char *real_filename, const char *logical_filename)
606 {
607 /* Ignore the logical_filename, because PO file entries already have
608 their file names attached. But use real_filename for error messages. */
609 gram_pos.file_name = xstrdup (real_filename);
610
611 mbfile_init (mbf, fp);
612
613 gram_pos.line_number = 1;
614 gram_pos_column = 0;
615 signal_eilseq = true;
616 po_lex_obsolete = false;
617 po_lex_previous = false;
618 po_lex_charset_init ();
619 }
620
621 /* Terminate lexical analysis. */
622 void
lex_end()623 lex_end ()
624 {
625 mbf->fp = NULL;
626 gram_pos.file_name = NULL;
627 gram_pos.line_number = 0;
628 gram_pos_column = 0;
629 signal_eilseq = false;
630 po_lex_obsolete = false;
631 po_lex_previous = false;
632 po_lex_charset_close ();
633 }
634
635
636 /* Read a single character, dealing with backslash-newline.
637 Also keep track of the current line number and column number. */
638 static void
lex_getc(mbchar_t mbc)639 lex_getc (mbchar_t mbc)
640 {
641 for (;;)
642 {
643 mbfile_getc (mbc, mbf);
644
645 if (mb_iseof (mbc))
646 {
647 if (ferror (mbf->fp))
648 bomb:
649 {
650 const char *errno_description = strerror (errno);
651 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
652 xasprintf ("%s: %s",
653 xasprintf (_("error while reading \"%s\""),
654 gram_pos.file_name),
655 errno_description));
656 }
657 break;
658 }
659
660 if (mb_iseq (mbc, '\n'))
661 {
662 gram_pos.line_number++;
663 gram_pos_column = 0;
664 break;
665 }
666
667 gram_pos_column += mb_width (mbc);
668
669 if (mb_iseq (mbc, '\\'))
670 {
671 mbchar_t mbc2;
672
673 mbfile_getc (mbc2, mbf);
674
675 if (mb_iseof (mbc2))
676 {
677 if (ferror (mbf->fp))
678 goto bomb;
679 break;
680 }
681
682 if (!mb_iseq (mbc2, '\n'))
683 {
684 mbfile_ungetc (mbc2, mbf);
685 break;
686 }
687
688 gram_pos.line_number++;
689 gram_pos_column = 0;
690 }
691 else
692 break;
693 }
694 }
695
696
697 static void
lex_ungetc(const mbchar_t mbc)698 lex_ungetc (const mbchar_t mbc)
699 {
700 if (!mb_iseof (mbc))
701 {
702 if (mb_iseq (mbc, '\n'))
703 /* Decrement the line number, but don't care about the column. */
704 gram_pos.line_number--;
705 else
706 /* Decrement the column number. Also works well enough for tabs. */
707 gram_pos_column -= mb_width (mbc);
708
709 mbfile_ungetc (mbc, mbf);
710 }
711 }
712
713
714 static int
keyword_p(const char * s)715 keyword_p (const char *s)
716 {
717 if (!po_lex_previous)
718 {
719 if (!strcmp (s, "domain"))
720 return DOMAIN;
721 if (!strcmp (s, "msgid"))
722 return MSGID;
723 if (!strcmp (s, "msgid_plural"))
724 return MSGID_PLURAL;
725 if (!strcmp (s, "msgstr"))
726 return MSGSTR;
727 if (!strcmp (s, "msgctxt"))
728 return MSGCTXT;
729 }
730 else
731 {
732 /* Inside a "#|" context, the keywords have a different meaning. */
733 if (!strcmp (s, "msgid"))
734 return PREV_MSGID;
735 if (!strcmp (s, "msgid_plural"))
736 return PREV_MSGID_PLURAL;
737 if (!strcmp (s, "msgctxt"))
738 return PREV_MSGCTXT;
739 }
740 po_gram_error_at_line (&gram_pos, _("keyword \"%s\" unknown"), s);
741 return NAME;
742 }
743
744
745 static int
control_sequence()746 control_sequence ()
747 {
748 mbchar_t mbc;
749 int val;
750 int max;
751
752 lex_getc (mbc);
753 if (mb_len (mbc) == 1)
754 switch (mb_ptr (mbc) [0])
755 {
756 case 'n':
757 return '\n';
758
759 case 't':
760 return '\t';
761
762 case 'b':
763 return '\b';
764
765 case 'r':
766 return '\r';
767
768 case 'f':
769 return '\f';
770
771 case 'v':
772 return '\v';
773
774 case 'a':
775 return '\a';
776
777 case '\\':
778 case '"':
779 return mb_ptr (mbc) [0];
780
781 case '0': case '1': case '2': case '3':
782 case '4': case '5': case '6': case '7':
783 val = 0;
784 max = 0;
785 for (;;)
786 {
787 char c = mb_ptr (mbc) [0];
788 /* Warning: not portable, can't depend on '0'..'7' ordering. */
789 val = val * 8 + (c - '0');
790 if (++max == 3)
791 break;
792 lex_getc (mbc);
793 if (mb_len (mbc) == 1)
794 switch (mb_ptr (mbc) [0])
795 {
796 case '0': case '1': case '2': case '3':
797 case '4': case '5': case '6': case '7':
798 continue;
799
800 default:
801 break;
802 }
803 lex_ungetc (mbc);
804 break;
805 }
806 return val;
807
808 case 'x':
809 lex_getc (mbc);
810 if (mb_iseof (mbc) || mb_len (mbc) != 1
811 || !c_isxdigit (mb_ptr (mbc) [0]))
812 break;
813
814 val = 0;
815 for (;;)
816 {
817 char c = mb_ptr (mbc) [0];
818 val *= 16;
819 if (c_isdigit (c))
820 /* Warning: not portable, can't depend on '0'..'9' ordering */
821 val += c - '0';
822 else if (c_isupper (c))
823 /* Warning: not portable, can't depend on 'A'..'F' ordering */
824 val += c - 'A' + 10;
825 else
826 /* Warning: not portable, can't depend on 'a'..'f' ordering */
827 val += c - 'a' + 10;
828
829 lex_getc (mbc);
830 if (mb_len (mbc) == 1)
831 switch (mb_ptr (mbc) [0])
832 {
833 case '0': case '1': case '2': case '3': case '4':
834 case '5': case '6': case '7': case '8': case '9':
835 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
836 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
837 continue;
838
839 default:
840 break;
841 }
842 lex_ungetc (mbc);
843 break;
844 }
845 return val;
846
847 /* FIXME: \u and \U are not handled. */
848 }
849 lex_ungetc (mbc);
850 po_gram_error (_("invalid control sequence"));
851 return ' ';
852 }
853
854
855 /* Return the next token in the PO file. The return codes are defined
856 in "po-gram-gen2.h". Associated data is put in 'po_gram_lval'. */
857 int
po_gram_lex()858 po_gram_lex ()
859 {
860 static char *buf;
861 static size_t bufmax;
862 mbchar_t mbc;
863 size_t bufpos;
864
865 for (;;)
866 {
867 lex_getc (mbc);
868
869 if (mb_iseof (mbc))
870 /* Yacc want this for end of file. */
871 return 0;
872
873 if (mb_len (mbc) == 1)
874 switch (mb_ptr (mbc) [0])
875 {
876 case '\n':
877 po_lex_obsolete = false;
878 po_lex_previous = false;
879 /* Ignore whitespace, not relevant for the grammar. */
880 break;
881
882 case ' ':
883 case '\t':
884 case '\r':
885 case '\f':
886 case '\v':
887 /* Ignore whitespace, not relevant for the grammar. */
888 break;
889
890 case '#':
891 lex_getc (mbc);
892 if (mb_iseq (mbc, '~'))
893 /* A pseudo-comment beginning with #~ is found. This is
894 not a comment. It is the format for obsolete entries.
895 We simply discard the "#~" prefix. The following
896 characters are expected to be well formed. */
897 {
898 po_lex_obsolete = true;
899 /* A pseudo-comment beginning with #~| denotes a previous
900 untranslated string in an obsolete entry. This does not
901 make much sense semantically, and is implemented here
902 for completeness only. */
903 lex_getc (mbc);
904 if (mb_iseq (mbc, '|'))
905 po_lex_previous = true;
906 else
907 lex_ungetc (mbc);
908 break;
909 }
910 if (mb_iseq (mbc, '|'))
911 /* A pseudo-comment beginning with #| is found. This is
912 the previous untranslated string. We discard the "#|"
913 prefix, but change the keywords and string returns
914 accordingly. */
915 {
916 po_lex_previous = true;
917 break;
918 }
919
920 /* Accumulate comments into a buffer. If we have been asked
921 to pass comments, generate a COMMENT token, otherwise
922 discard it. */
923 signal_eilseq = false;
924 if (pass_comments)
925 {
926 bufpos = 0;
927 for (;;)
928 {
929 while (bufpos + mb_len (mbc) >= bufmax)
930 {
931 bufmax += 100;
932 buf = xrealloc (buf, bufmax);
933 }
934 if (mb_iseof (mbc) || mb_iseq (mbc, '\n'))
935 break;
936
937 memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
938 bufpos += mb_len (mbc);
939
940 lex_getc (mbc);
941 }
942 buf[bufpos] = '\0';
943
944 po_gram_lval.string.string = buf;
945 po_gram_lval.string.pos = gram_pos;
946 po_gram_lval.string.obsolete = po_lex_obsolete;
947 po_lex_obsolete = false;
948 signal_eilseq = true;
949 return COMMENT;
950 }
951 else
952 {
953 /* We do this in separate loop because collecting large
954 comments while they get not passed to the upper layers
955 is not very efficient. */
956 while (!mb_iseof (mbc) && !mb_iseq (mbc, '\n'))
957 lex_getc (mbc);
958 po_lex_obsolete = false;
959 signal_eilseq = true;
960 }
961 break;
962
963 case '"':
964 /* Accumulate a string. */
965 bufpos = 0;
966 for (;;)
967 {
968 lex_getc (mbc);
969 while (bufpos + mb_len (mbc) >= bufmax)
970 {
971 bufmax += 100;
972 buf = xrealloc (buf, bufmax);
973 }
974 if (mb_iseof (mbc))
975 {
976 po_gram_error_at_line (&gram_pos,
977 _("end-of-file within string"));
978 break;
979 }
980 if (mb_iseq (mbc, '\n'))
981 {
982 po_gram_error_at_line (&gram_pos,
983 _("end-of-line within string"));
984 break;
985 }
986 if (mb_iseq (mbc, '"'))
987 break;
988 if (mb_iseq (mbc, '\\'))
989 {
990 buf[bufpos++] = control_sequence ();
991 continue;
992 }
993
994 /* Add mbc to the accumulator. */
995 memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
996 bufpos += mb_len (mbc);
997 }
998 buf[bufpos] = '\0';
999
1000 /* Strings cannot contain the msgctxt separator, because it cannot
1001 be faithfully represented in the msgid of a .mo file. */
1002 if (strchr (buf, MSGCTXT_SEPARATOR) != NULL)
1003 po_gram_error_at_line (&gram_pos,
1004 _("context separator <EOT> within string"));
1005
1006 /* FIXME: Treatment of embedded \000 chars is incorrect. */
1007 po_gram_lval.string.string = xstrdup (buf);
1008 po_gram_lval.string.pos = gram_pos;
1009 po_gram_lval.string.obsolete = po_lex_obsolete;
1010 return (po_lex_previous ? PREV_STRING : STRING);
1011
1012 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1013 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1014 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1015 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1016 case 'y': case 'z':
1017 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1018 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1019 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1020 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1021 case 'Y': case 'Z':
1022 case '_': case '$':
1023 bufpos = 0;
1024 for (;;)
1025 {
1026 char c = mb_ptr (mbc) [0];
1027 if (bufpos + 1 >= bufmax)
1028 {
1029 bufmax += 100;
1030 buf = xrealloc (buf, bufmax);
1031 }
1032 buf[bufpos++] = c;
1033 lex_getc (mbc);
1034 if (mb_len (mbc) == 1)
1035 switch (mb_ptr (mbc) [0])
1036 {
1037 default:
1038 break;
1039 case 'a': case 'b': case 'c': case 'd': case 'e':
1040 case 'f': case 'g': case 'h': case 'i': case 'j':
1041 case 'k': case 'l': case 'm': case 'n': case 'o':
1042 case 'p': case 'q': case 'r': case 's': case 't':
1043 case 'u': case 'v': case 'w': case 'x': case 'y':
1044 case 'z':
1045 case 'A': case 'B': case 'C': case 'D': case 'E':
1046 case 'F': case 'G': case 'H': case 'I': case 'J':
1047 case 'K': case 'L': case 'M': case 'N': case 'O':
1048 case 'P': case 'Q': case 'R': case 'S': case 'T':
1049 case 'U': case 'V': case 'W': case 'X': case 'Y':
1050 case 'Z':
1051 case '_': case '$':
1052 case '0': case '1': case '2': case '3': case '4':
1053 case '5': case '6': case '7': case '8': case '9':
1054 continue;
1055 }
1056 break;
1057 }
1058 lex_ungetc (mbc);
1059
1060 buf[bufpos] = '\0';
1061
1062 {
1063 int k = keyword_p (buf);
1064 if (k == NAME)
1065 {
1066 po_gram_lval.string.string = xstrdup (buf);
1067 po_gram_lval.string.pos = gram_pos;
1068 po_gram_lval.string.obsolete = po_lex_obsolete;
1069 }
1070 else
1071 {
1072 po_gram_lval.pos.pos = gram_pos;
1073 po_gram_lval.pos.obsolete = po_lex_obsolete;
1074 }
1075 return k;
1076 }
1077
1078 case '0': case '1': case '2': case '3': case '4':
1079 case '5': case '6': case '7': case '8': case '9':
1080 bufpos = 0;
1081 for (;;)
1082 {
1083 char c = mb_ptr (mbc) [0];
1084 if (bufpos + 1 >= bufmax)
1085 {
1086 bufmax += 100;
1087 buf = xrealloc (buf, bufmax + 1);
1088 }
1089 buf[bufpos++] = c;
1090 lex_getc (mbc);
1091 if (mb_len (mbc) == 1)
1092 switch (mb_ptr (mbc) [0])
1093 {
1094 default:
1095 break;
1096
1097 case '0': case '1': case '2': case '3': case '4':
1098 case '5': case '6': case '7': case '8': case '9':
1099 continue;
1100 }
1101 break;
1102 }
1103 lex_ungetc (mbc);
1104
1105 buf[bufpos] = '\0';
1106
1107 po_gram_lval.number.number = atol (buf);
1108 po_gram_lval.number.pos = gram_pos;
1109 po_gram_lval.number.obsolete = po_lex_obsolete;
1110 return NUMBER;
1111
1112 case '[':
1113 po_gram_lval.pos.pos = gram_pos;
1114 po_gram_lval.pos.obsolete = po_lex_obsolete;
1115 return '[';
1116
1117 case ']':
1118 po_gram_lval.pos.pos = gram_pos;
1119 po_gram_lval.pos.obsolete = po_lex_obsolete;
1120 return ']';
1121
1122 default:
1123 /* This will cause a syntax error. */
1124 return JUNK;
1125 }
1126 else
1127 /* This will cause a syntax error. */
1128 return JUNK;
1129 }
1130 }
1131
1132
1133 /* po_gram_lex() can return comments as COMMENT. Switch this on or off. */
1134 void
po_lex_pass_comments(bool flag)1135 po_lex_pass_comments (bool flag)
1136 {
1137 pass_comments = flag;
1138 }
1139
1140
1141 /* po_gram_lex() can return obsolete entries as if they were normal entries.
1142 Switch this on or off. */
1143 void
po_lex_pass_obsolete_entries(bool flag)1144 po_lex_pass_obsolete_entries (bool flag)
1145 {
1146 pass_obsolete_entries = flag;
1147 }
1148