1 /* GNU gettext - internationalization aids
2    Copyright (C) 1995-1999, 2000-2006 Free Software Foundation, Inc.
3 
4    This file was written by Peter Miller <millerp@canb.auug.org.au>.
5    Multibyte character handling by Bruno Haible <haible@clisp.cons.org>.
6 
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 2, or (at your option)
10    any later version.
11 
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, write to the Free Software Foundation,
19    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
20 
21 
22 #ifdef HAVE_CONFIG_H
23 # include "config.h"
24 #endif
25 
26 /* Specification.  */
27 #include "po-lex.h"
28 
29 #include <errno.h>
30 #include <limits.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <stdarg.h>
35 
36 #if HAVE_ICONV
37 # include <iconv.h>
38 #endif
39 
40 #include "c-ctype.h"
41 #include "linebreak.h"
42 #include "vasprintf.h"
43 #include "gettext.h"
44 #include "po-charset.h"
45 #include "xalloc.h"
46 #include "exit.h"
47 #include "error.h"
48 #include "error-progname.h"
49 #include "xvasprintf.h"
50 #include "po-error.h"
51 #include "po-xerror.h"
52 #include "pos.h"
53 #include "message.h"
54 #include "str-list.h"
55 #include "po-gram-gen2.h"
56 
57 #define _(str) gettext(str)
58 
59 #if HAVE_ICONV
60 # include "utf8-ucs4.h"
61 #endif
62 
63 #if HAVE_DECL_GETC_UNLOCKED
64 # undef getc
65 # define getc getc_unlocked
66 #endif
67 
68 
69 /* Current position within the PO file.  */
70 lex_pos_ty gram_pos;
71 int gram_pos_column;
72 
73 
74 /* Error handling during the parsing of a PO file.
75    These functions can access gram_pos and gram_pos_column.  */
76 
77 /* VARARGS1 */
78 void
po_gram_error(const char * fmt,...)79 po_gram_error (const char *fmt, ...)
80 {
81   va_list ap;
82   char *buffer;
83 
84   va_start (ap, fmt);
85   if (vasprintf (&buffer, fmt, ap) < 0)
86     error (EXIT_FAILURE, 0, _("memory exhausted"));
87   va_end (ap);
88   po_xerror (PO_SEVERITY_ERROR, NULL, gram_pos.file_name, gram_pos.line_number,
89 	     gram_pos_column + 1, false, buffer);
90   free (buffer);
91 
92   if (error_message_count >= gram_max_allowed_errors)
93     po_error (EXIT_FAILURE, 0, _("too many errors, aborting"));
94 }
95 
96 /* VARARGS2 */
97 void
po_gram_error_at_line(const lex_pos_ty * pp,const char * fmt,...)98 po_gram_error_at_line (const lex_pos_ty *pp, const char *fmt, ...)
99 {
100   va_list ap;
101   char *buffer;
102 
103   va_start (ap, fmt);
104   if (vasprintf (&buffer, fmt, ap) < 0)
105     error (EXIT_FAILURE, 0, _("memory exhausted"));
106   va_end (ap);
107   po_xerror (PO_SEVERITY_ERROR, NULL, pp->file_name, pp->line_number,
108 	     (size_t)(-1), false, buffer);
109   free (buffer);
110 
111   if (error_message_count >= gram_max_allowed_errors)
112     po_error (EXIT_FAILURE, 0, _("too many errors, aborting"));
113 }
114 
115 
116 /* The lowest level of PO file parsing converts bytes to multibyte characters.
117    This is needed
118    1. for C compatibility: ISO C 99 section 5.1.1.2 says that the first
119       translation phase maps bytes to characters.
120    2. to keep track of the current column, for the sake of precise error
121       location. Emacs compile.el interprets the column in error messages
122       by default as a screen column number, not as character number.
123    3. to avoid skipping backslash-newline in the midst of a multibyte
124       character. If XY is a multibyte character,  X \ newline Y  is invalid.
125  */
126 
127 /* Multibyte character data type.  */
128 /* Note this depends on po_lex_charset and po_lex_iconv, which get set
129    while the file is being parsed.  */
130 
/* Capacity, in bytes, of the storage reserved for one multibyte
   character.  */
#define MBCHAR_BUF_SIZE 24

/* One multibyte character as read from the input.  */
struct mbchar
{
  /* Number of bytes of the current character; > 0 for a real character.  */
  size_t bytes;
#if HAVE_ICONV
  /* True if uc holds a valid Unicode character.  */
  bool uc_valid;
  /* The current character; meaningful only if uc_valid.  */
  unsigned int uc;
#endif
  /* Room for the bytes of the character.  */
  char buf[MBCHAR_BUF_SIZE];
};

/* An array type of length one, so that multibyte characters are passed
   by reference automatically.  */
typedef struct mbchar mbchar_t[1];
146 
/* Copy N bytes from SRC to DST, one byte at a time in ascending order.
   Intended for very small N (typically 0 or 1), where a library memcpy
   call would be overkill.  Does nothing when N is 0.  */
static inline void
memcpy_small (void *dst, const void *src, size_t n)
{
  char *out = (char *) dst;
  const char *in = (const char *) src;
  size_t i;

  for (i = 0; i < n; i++)
    out[i] = in[i];
}
161 
162 /* EOF (not a real character) is represented with bytes = 0 and
163    uc_valid = false.  */
164 static inline bool
mb_iseof(const mbchar_t mbc)165 mb_iseof (const mbchar_t mbc)
166 {
167   return (mbc->bytes == 0);
168 }
169 
170 /* Access the current character.  */
171 static inline const char *
mb_ptr(const mbchar_t mbc)172 mb_ptr (const mbchar_t mbc)
173 {
174   return mbc->buf;
175 }
176 static inline size_t
mb_len(const mbchar_t mbc)177 mb_len (const mbchar_t mbc)
178 {
179   return mbc->bytes;
180 }
181 
182 /* Comparison of characters.  */
183 
184 static inline bool
mb_iseq(const mbchar_t mbc,char sc)185 mb_iseq (const mbchar_t mbc, char sc)
186 {
187   /* Note: It is wrong to compare only mbc->uc, because when the encoding is
188      SHIFT_JIS, mbc->buf[0] == '\\' corresponds to mbc->uc == 0x00A5, but we
189      want to treat it as an escape character, although it looks like a Yen
190      sign.  */
191 #if HAVE_ICONV && 0
192   if (mbc->uc_valid)
193     return (mbc->uc == sc); /* wrong! */
194   else
195 #endif
196     return (mbc->bytes == 1 && mbc->buf[0] == sc);
197 }
198 
199 static inline bool
mb_isnul(const mbchar_t mbc)200 mb_isnul (const mbchar_t mbc)
201 {
202 #if HAVE_ICONV
203   if (mbc->uc_valid)
204     return (mbc->uc == 0);
205   else
206 #endif
207     return (mbc->bytes == 1 && mbc->buf[0] == 0);
208 }
209 
210 static inline int
mb_cmp(const mbchar_t mbc1,const mbchar_t mbc2)211 mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2)
212 {
213 #if HAVE_ICONV
214   if (mbc1->uc_valid && mbc2->uc_valid)
215     return (int) mbc1->uc - (int) mbc2->uc;
216   else
217 #endif
218     return (mbc1->bytes == mbc2->bytes
219 	    ? memcmp (mbc1->buf, mbc2->buf, mbc1->bytes)
220 	    : mbc1->bytes < mbc2->bytes
221 	      ? (memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) > 0 ? 1 : -1)
222 	      : (memcmp (mbc1->buf, mbc2->buf, mbc2->bytes) >= 0 ? 1 : -1));
223 }
224 
225 static inline bool
mb_equal(const mbchar_t mbc1,const mbchar_t mbc2)226 mb_equal (const mbchar_t mbc1, const mbchar_t mbc2)
227 {
228 #if HAVE_ICONV
229   if (mbc1->uc_valid && mbc2->uc_valid)
230     return mbc1->uc == mbc2->uc;
231   else
232 #endif
233     return (mbc1->bytes == mbc2->bytes
234 	    && memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) == 0);
235 }
236 
237 /* <ctype.h>, <wctype.h> classification.  */
238 
239 static inline bool
mb_isascii(const mbchar_t mbc)240 mb_isascii (const mbchar_t mbc)
241 {
242 #if HAVE_ICONV
243   if (mbc->uc_valid)
244     return (mbc->uc >= 0x0000 && mbc->uc <= 0x007F);
245   else
246 #endif
247     return mbc->bytes == 1 && (mbc->buf[0] & 0x80) == 0;
248 }
249 
250 /* Extra <wchar.h> function.  */
251 
252 /* Unprintable characters appear as a small box of width 1.  */
253 #define MB_UNPRINTABLE_WIDTH 1
254 
255 static int
mb_width(const mbchar_t mbc)256 mb_width (const mbchar_t mbc)
257 {
258 #if HAVE_ICONV
259   if (mbc->uc_valid)
260     {
261       unsigned int uc = mbc->uc;
262       const char *encoding =
263 	(po_lex_iconv != (iconv_t)(-1) ? po_lex_charset : "");
264       int w = uc_width (uc, encoding);
265       /* For unprintable characters, arbitrarily return 0 for control
266 	 characters (except tab) and MB_UNPRINTABLE_WIDTH otherwise.  */
267       if (w >= 0)
268 	return w;
269       if (uc >= 0x0000 && uc <= 0x001F)
270 	{
271 	  if (uc == 0x0009)
272 	    return 8 - (gram_pos_column & 7);
273 	  return 0;
274 	}
275       if ((uc >= 0x007F && uc <= 0x009F) || (uc >= 0x2028 && uc <= 0x2029))
276 	return 0;
277       return MB_UNPRINTABLE_WIDTH;
278     }
279   else
280 #endif
281     {
282       if (mbc->bytes == 1)
283 	{
284 	  if (
285 #if CHAR_MIN < 0x00 /* to avoid gcc warning */
286 	      mbc->buf[0] >= 0x00 &&
287 #endif
288 	      mbc->buf[0] <= 0x1F)
289 	    {
290 	      if (mbc->buf[0] == 0x09)
291 		return 8 - (gram_pos_column & 7);
292 	      return 0;
293 	    }
294 	  if (mbc->buf[0] == 0x7F)
295 	    return 0;
296 	}
297       return MB_UNPRINTABLE_WIDTH;
298     }
299 }
300 
301 /* Output.  */
302 static inline void
mb_putc(const mbchar_t mbc,FILE * stream)303 mb_putc (const mbchar_t mbc, FILE *stream)
304 {
305   fwrite (mbc->buf, 1, mbc->bytes, stream);
306 }
307 
308 /* Assignment.  */
309 static inline void
mb_setascii(mbchar_t mbc,char sc)310 mb_setascii (mbchar_t mbc, char sc)
311 {
312   mbc->bytes = 1;
313 #if HAVE_ICONV
314   mbc->uc_valid = 1;
315   mbc->uc = sc;
316 #endif
317   mbc->buf[0] = sc;
318 }
319 
320 /* Copying a character.  */
321 static inline void
mb_copy(mbchar_t new,const mbchar_t old)322 mb_copy (mbchar_t new, const mbchar_t old)
323 {
324   memcpy_small (&new->buf[0], &old->buf[0], old->bytes);
325   new->bytes = old->bytes;
326 #if HAVE_ICONV
327   if ((new->uc_valid = old->uc_valid))
328     new->uc = old->uc;
329 #endif
330 }
331 
332 
333 /* Multibyte character input.  */
334 
335 /* Number of characters that can be pushed back.
336    We need 1 for lex_getc, plus 1 for lex_ungetc.  */
337 #define NPUSHBACK 2
338 
339 /* Data type of a multibyte character input stream.  */
340 struct mbfile
341 {
342   FILE *fp;
343   bool eof_seen;
344   int have_pushback;
345   unsigned int bufcount;
346   char buf[MBCHAR_BUF_SIZE];
347   struct mbchar pushback[NPUSHBACK];
348 };
349 
350 /* We want to pass multibyte streams by reference automatically,
351    therefore we use an array type.  */
352 typedef struct mbfile mbfile_t[1];
353 
/* Whether invalid multibyte sequences in the input shall be signalled
   (true) or silently tolerated (false).  */
static bool signal_eilseq = false;
357 
358 static inline void
mbfile_init(mbfile_t mbf,FILE * stream)359 mbfile_init (mbfile_t mbf, FILE *stream)
360 {
361   mbf->fp = stream;
362   mbf->eof_seen = false;
363   mbf->have_pushback = 0;
364   mbf->bufcount = 0;
365 }
366 
367 /* Read the next multibyte character from mbf and put it into mbc.
368    If a read error occurs, errno is set and ferror (mbf->fp) becomes true.  */
369 static void
mbfile_getc(mbchar_t mbc,mbfile_t mbf)370 mbfile_getc (mbchar_t mbc, mbfile_t mbf)
371 {
372   size_t bytes;
373 
374   /* If EOF has already been seen, don't use getc.  This matters if
375      mbf->fp is connected to an interactive tty.  */
376   if (mbf->eof_seen)
377     goto eof;
378 
379   /* Return character pushed back, if there is one.  */
380   if (mbf->have_pushback > 0)
381     {
382       mbf->have_pushback--;
383       mb_copy (mbc, &mbf->pushback[mbf->have_pushback]);
384       return;
385     }
386 
387   /* Before using iconv, we need at least one byte.  */
388   if (mbf->bufcount == 0)
389     {
390       int c = getc (mbf->fp);
391       if (c == EOF)
392 	{
393 	  mbf->eof_seen = true;
394 	  goto eof;
395 	}
396       mbf->buf[0] = (unsigned char) c;
397       mbf->bufcount++;
398     }
399 
400 #if HAVE_ICONV
401   if (po_lex_iconv != (iconv_t)(-1))
402     {
403       /* Use iconv on an increasing number of bytes.  Read only as many
404 	 bytes from mbf->fp as needed.  This is needed to give reasonable
405 	 interactive behaviour when mbf->fp is connected to an interactive
406 	 tty.  */
407       for (;;)
408 	{
409 	  unsigned char scratchbuf[64];
410 	  const char *inptr = &mbf->buf[0];
411 	  size_t insize = mbf->bufcount;
412 	  char *outptr = (char *) &scratchbuf[0];
413 	  size_t outsize = sizeof (scratchbuf);
414 
415 	  size_t res = iconv (po_lex_iconv,
416 			      (ICONV_CONST char **) &inptr, &insize,
417 			      &outptr, &outsize);
418 	  /* We expect that a character has been produced if and only if
419 	     some input bytes have been consumed.  */
420 	  if ((insize < mbf->bufcount) != (outsize < sizeof (scratchbuf)))
421 	    abort ();
422 	  if (outsize == sizeof (scratchbuf))
423 	    {
424 	      /* No character has been produced.  Must be an error.  */
425 	      if (res != (size_t)(-1))
426 		abort ();
427 
428 	      if (errno == EILSEQ)
429 		{
430 		  /* An invalid multibyte sequence was encountered.  */
431 		  /* Return a single byte.  */
432 		  if (signal_eilseq)
433 		    po_gram_error (_("invalid multibyte sequence"));
434 		  bytes = 1;
435 		  mbc->uc_valid = false;
436 		  break;
437 		}
438 	      else if (errno == EINVAL)
439 		{
440 		  /* An incomplete multibyte character.  */
441 		  int c;
442 
443 		  if (mbf->bufcount == MBCHAR_BUF_SIZE)
444 		    {
445 		      /* An overlong incomplete multibyte sequence was
446 			 encountered.  */
447 		      /* Return a single byte.  */
448 		      bytes = 1;
449 		      mbc->uc_valid = false;
450 		      break;
451 		    }
452 
453 		  /* Read one more byte and retry iconv.  */
454 		  c = getc (mbf->fp);
455 		  if (c == EOF)
456 		    {
457 		      mbf->eof_seen = true;
458 		      if (ferror (mbf->fp))
459 			goto eof;
460 		      if (signal_eilseq)
461 			po_gram_error (_("\
462 incomplete multibyte sequence at end of file"));
463 		      bytes = mbf->bufcount;
464 		      mbc->uc_valid = false;
465 		      break;
466 		    }
467 		  mbf->buf[mbf->bufcount++] = (unsigned char) c;
468 		  if (c == '\n')
469 		    {
470 		      if (signal_eilseq)
471 			po_gram_error (_("\
472 incomplete multibyte sequence at end of line"));
473 		      bytes = mbf->bufcount - 1;
474 		      mbc->uc_valid = false;
475 		      break;
476 		    }
477 		}
478 	      else
479 		{
480 		  const char *errno_description = strerror (errno);
481 		  po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
482 			     xasprintf ("%s: %s",
483 					_("iconv failure"),
484 					errno_description));
485 		}
486 	    }
487 	  else
488 	    {
489 	      size_t outbytes = sizeof (scratchbuf) - outsize;
490 	      bytes = mbf->bufcount - insize;
491 
492 	      /* We expect that one character has been produced.  */
493 	      if (bytes == 0)
494 		abort ();
495 	      if (outbytes == 0)
496 		abort ();
497 	      /* Convert it from UTF-8 to UCS-4.  */
498 	      if (u8_mbtouc (&mbc->uc, scratchbuf, outbytes) < outbytes)
499 		{
500 		  /* scratchbuf contains an out-of-range Unicode character
501 		     (> 0x10ffff).  */
502 		  if (signal_eilseq)
503 		    po_gram_error (_("invalid multibyte sequence"));
504 		  mbc->uc_valid = false;
505 		  break;
506 		}
507 	      mbc->uc_valid = true;
508 	      break;
509 	    }
510 	}
511     }
512   else
513 #endif
514     {
515       if (po_lex_weird_cjk
516 	  /* Special handling of encodings with CJK structure.  */
517 	  && (unsigned char) mbf->buf[0] >= 0x80)
518 	{
519 	  if (mbf->bufcount == 1)
520 	    {
521 	      /* Read one more byte.  */
522 	      int c = getc (mbf->fp);
523 	      if (c == EOF)
524 		{
525 		  if (ferror (mbf->fp))
526 		    {
527 		      mbf->eof_seen = true;
528 		      goto eof;
529 		    }
530 		}
531 	      else
532 		{
533 		  mbf->buf[1] = (unsigned char) c;
534 		  mbf->bufcount++;
535 		}
536 	    }
537 	  if (mbf->bufcount >= 2 && (unsigned char) mbf->buf[1] >= 0x30)
538 	    /* Return a double byte.  */
539 	    bytes = 2;
540 	  else
541 	    /* Return a single byte.  */
542 	    bytes = 1;
543 	}
544       else
545 	{
546 	  /* Return a single byte.  */
547 	  bytes = 1;
548 	}
549 #if HAVE_ICONV
550       mbc->uc_valid = false;
551 #endif
552     }
553 
554   /* Return the multibyte sequence mbf->buf[0..bytes-1].  */
555   memcpy_small (&mbc->buf[0], &mbf->buf[0], bytes);
556   mbc->bytes = bytes;
557 
558   mbf->bufcount -= bytes;
559   if (mbf->bufcount > 0)
560     {
561       /* It's not worth calling memmove() for so few bytes.  */
562       unsigned int count = mbf->bufcount;
563       char *p = &mbf->buf[0];
564 
565       do
566 	{
567 	  *p = *(p + bytes);
568 	  p++;
569 	}
570       while (--count > 0);
571     }
572   return;
573 
574 eof:
575   /* An mbchar_t with bytes == 0 is used to indicate EOF.  */
576   mbc->bytes = 0;
577 #if HAVE_ICONV
578   mbc->uc_valid = false;
579 #endif
580   return;
581 }
582 
583 static void
mbfile_ungetc(const mbchar_t mbc,mbfile_t mbf)584 mbfile_ungetc (const mbchar_t mbc, mbfile_t mbf)
585 {
586   if (mbf->have_pushback >= NPUSHBACK)
587     abort ();
588   mb_copy (&mbf->pushback[mbf->have_pushback], mbc);
589   mbf->have_pushback++;
590 }
591 
592 
593 /* Lexer variables.  */
594 
595 static mbfile_t mbf;
596 unsigned int gram_max_allowed_errors = 20;
597 static bool po_lex_obsolete;
598 static bool po_lex_previous;
599 static bool pass_comments = false;
600 bool pass_obsolete_entries = false;
601 
602 
603 /* Prepare lexical analysis.  */
604 void
lex_start(FILE * fp,const char * real_filename,const char * logical_filename)605 lex_start (FILE *fp, const char *real_filename, const char *logical_filename)
606 {
607   /* Ignore the logical_filename, because PO file entries already have
608      their file names attached.  But use real_filename for error messages.  */
609   gram_pos.file_name = xstrdup (real_filename);
610 
611   mbfile_init (mbf, fp);
612 
613   gram_pos.line_number = 1;
614   gram_pos_column = 0;
615   signal_eilseq = true;
616   po_lex_obsolete = false;
617   po_lex_previous = false;
618   po_lex_charset_init ();
619 }
620 
621 /* Terminate lexical analysis.  */
622 void
lex_end()623 lex_end ()
624 {
625   mbf->fp = NULL;
626   gram_pos.file_name = NULL;
627   gram_pos.line_number = 0;
628   gram_pos_column = 0;
629   signal_eilseq = false;
630   po_lex_obsolete = false;
631   po_lex_previous = false;
632   po_lex_charset_close ();
633 }
634 
635 
636 /* Read a single character, dealing with backslash-newline.
637    Also keep track of the current line number and column number.  */
638 static void
lex_getc(mbchar_t mbc)639 lex_getc (mbchar_t mbc)
640 {
641   for (;;)
642     {
643       mbfile_getc (mbc, mbf);
644 
645       if (mb_iseof (mbc))
646 	{
647 	  if (ferror (mbf->fp))
648 	   bomb:
649 	    {
650 	      const char *errno_description = strerror (errno);
651 	      po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
652 			 xasprintf ("%s: %s",
653 				    xasprintf (_("error while reading \"%s\""),
654 					       gram_pos.file_name),
655 				    errno_description));
656 	    }
657 	  break;
658 	}
659 
660       if (mb_iseq (mbc, '\n'))
661 	{
662 	  gram_pos.line_number++;
663 	  gram_pos_column = 0;
664 	  break;
665 	}
666 
667       gram_pos_column += mb_width (mbc);
668 
669       if (mb_iseq (mbc, '\\'))
670 	{
671 	  mbchar_t mbc2;
672 
673 	  mbfile_getc (mbc2, mbf);
674 
675 	  if (mb_iseof (mbc2))
676 	    {
677 	      if (ferror (mbf->fp))
678 		goto bomb;
679 	      break;
680 	    }
681 
682 	  if (!mb_iseq (mbc2, '\n'))
683 	    {
684 	      mbfile_ungetc (mbc2, mbf);
685 	      break;
686 	    }
687 
688 	  gram_pos.line_number++;
689 	  gram_pos_column = 0;
690 	}
691       else
692 	break;
693     }
694 }
695 
696 
697 static void
lex_ungetc(const mbchar_t mbc)698 lex_ungetc (const mbchar_t mbc)
699 {
700   if (!mb_iseof (mbc))
701     {
702       if (mb_iseq (mbc, '\n'))
703 	/* Decrement the line number, but don't care about the column.  */
704 	gram_pos.line_number--;
705       else
706 	/* Decrement the column number.  Also works well enough for tabs.  */
707 	gram_pos_column -= mb_width (mbc);
708 
709       mbfile_ungetc (mbc, mbf);
710     }
711 }
712 
713 
714 static int
keyword_p(const char * s)715 keyword_p (const char *s)
716 {
717   if (!po_lex_previous)
718     {
719       if (!strcmp (s, "domain"))
720 	return DOMAIN;
721       if (!strcmp (s, "msgid"))
722 	return MSGID;
723       if (!strcmp (s, "msgid_plural"))
724 	return MSGID_PLURAL;
725       if (!strcmp (s, "msgstr"))
726 	return MSGSTR;
727       if (!strcmp (s, "msgctxt"))
728 	return MSGCTXT;
729     }
730   else
731     {
732       /* Inside a "#|" context, the keywords have a different meaning.  */
733       if (!strcmp (s, "msgid"))
734 	return PREV_MSGID;
735       if (!strcmp (s, "msgid_plural"))
736 	return PREV_MSGID_PLURAL;
737       if (!strcmp (s, "msgctxt"))
738 	return PREV_MSGCTXT;
739     }
740   po_gram_error_at_line (&gram_pos, _("keyword \"%s\" unknown"), s);
741   return NAME;
742 }
743 
744 
745 static int
control_sequence()746 control_sequence ()
747 {
748   mbchar_t mbc;
749   int val;
750   int max;
751 
752   lex_getc (mbc);
753   if (mb_len (mbc) == 1)
754     switch (mb_ptr (mbc) [0])
755       {
756       case 'n':
757 	return '\n';
758 
759       case 't':
760 	return '\t';
761 
762       case 'b':
763 	return '\b';
764 
765       case 'r':
766 	return '\r';
767 
768       case 'f':
769 	return '\f';
770 
771       case 'v':
772 	return '\v';
773 
774       case 'a':
775 	return '\a';
776 
777       case '\\':
778       case '"':
779 	return mb_ptr (mbc) [0];
780 
781       case '0': case '1': case '2': case '3':
782       case '4': case '5': case '6': case '7':
783 	val = 0;
784 	max = 0;
785 	for (;;)
786 	  {
787 	    char c = mb_ptr (mbc) [0];
788 	    /* Warning: not portable, can't depend on '0'..'7' ordering.  */
789 	    val = val * 8 + (c - '0');
790 	    if (++max == 3)
791 	      break;
792 	    lex_getc (mbc);
793 	    if (mb_len (mbc) == 1)
794 	      switch (mb_ptr (mbc) [0])
795 		{
796 		case '0': case '1': case '2': case '3':
797 		case '4': case '5': case '6': case '7':
798 		  continue;
799 
800 		default:
801 		  break;
802 		}
803 	    lex_ungetc (mbc);
804 	    break;
805 	  }
806 	return val;
807 
808       case 'x':
809 	lex_getc (mbc);
810 	if (mb_iseof (mbc) || mb_len (mbc) != 1
811 	    || !c_isxdigit (mb_ptr (mbc) [0]))
812 	  break;
813 
814 	val = 0;
815 	for (;;)
816 	  {
817 	    char c = mb_ptr (mbc) [0];
818 	    val *= 16;
819 	    if (c_isdigit (c))
820 	      /* Warning: not portable, can't depend on '0'..'9' ordering */
821 	      val += c - '0';
822 	    else if (c_isupper (c))
823 	      /* Warning: not portable, can't depend on 'A'..'F' ordering */
824 	      val += c - 'A' + 10;
825 	    else
826 	      /* Warning: not portable, can't depend on 'a'..'f' ordering */
827 	      val += c - 'a' + 10;
828 
829 	    lex_getc (mbc);
830 	    if (mb_len (mbc) == 1)
831 	      switch (mb_ptr (mbc) [0])
832 		{
833 		case '0': case '1': case '2': case '3': case '4':
834 		case '5': case '6': case '7': case '8': case '9':
835 		case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
836 		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
837 		  continue;
838 
839 		default:
840 		  break;
841 		}
842 	    lex_ungetc (mbc);
843 	    break;
844 	  }
845 	return val;
846 
847       /* FIXME: \u and \U are not handled.  */
848       }
849   lex_ungetc (mbc);
850   po_gram_error (_("invalid control sequence"));
851   return ' ';
852 }
853 
854 
855 /* Return the next token in the PO file.  The return codes are defined
856    in "po-gram-gen2.h".  Associated data is put in 'po_gram_lval'.  */
857 int
po_gram_lex()858 po_gram_lex ()
859 {
860   static char *buf;
861   static size_t bufmax;
862   mbchar_t mbc;
863   size_t bufpos;
864 
865   for (;;)
866     {
867       lex_getc (mbc);
868 
869       if (mb_iseof (mbc))
870 	/* Yacc want this for end of file.  */
871 	return 0;
872 
873       if (mb_len (mbc) == 1)
874 	switch (mb_ptr (mbc) [0])
875 	  {
876 	  case '\n':
877 	    po_lex_obsolete = false;
878 	    po_lex_previous = false;
879 	    /* Ignore whitespace, not relevant for the grammar.  */
880 	    break;
881 
882 	  case ' ':
883 	  case '\t':
884 	  case '\r':
885 	  case '\f':
886 	  case '\v':
887 	    /* Ignore whitespace, not relevant for the grammar.  */
888 	    break;
889 
890 	  case '#':
891 	    lex_getc (mbc);
892 	    if (mb_iseq (mbc, '~'))
893 	      /* A pseudo-comment beginning with #~ is found.  This is
894 		 not a comment.  It is the format for obsolete entries.
895 		 We simply discard the "#~" prefix.  The following
896 		 characters are expected to be well formed.  */
897 	      {
898 		po_lex_obsolete = true;
899 		/* A pseudo-comment beginning with #~| denotes a previous
900 		   untranslated string in an obsolete entry.  This does not
901 		   make much sense semantically, and is implemented here
902 		   for completeness only.  */
903 		lex_getc (mbc);
904 		if (mb_iseq (mbc, '|'))
905 		  po_lex_previous = true;
906 		else
907 		  lex_ungetc (mbc);
908 		break;
909 	      }
910 	    if (mb_iseq (mbc, '|'))
911 	      /* A pseudo-comment beginning with #| is found.  This is
912 		 the previous untranslated string.  We discard the "#|"
913 		 prefix, but change the keywords and string returns
914 		 accordingly.  */
915 	      {
916 		po_lex_previous = true;
917 		break;
918 	      }
919 
920 	    /* Accumulate comments into a buffer.  If we have been asked
921 	       to pass comments, generate a COMMENT token, otherwise
922 	       discard it.  */
923 	    signal_eilseq = false;
924 	    if (pass_comments)
925 	      {
926 		bufpos = 0;
927 		for (;;)
928 		  {
929 		    while (bufpos + mb_len (mbc) >= bufmax)
930 		      {
931 			bufmax += 100;
932 			buf = xrealloc (buf, bufmax);
933 		      }
934 		    if (mb_iseof (mbc) || mb_iseq (mbc, '\n'))
935 		      break;
936 
937 		    memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
938 		    bufpos += mb_len (mbc);
939 
940 		    lex_getc (mbc);
941 		  }
942 		buf[bufpos] = '\0';
943 
944 		po_gram_lval.string.string = buf;
945 		po_gram_lval.string.pos = gram_pos;
946 		po_gram_lval.string.obsolete = po_lex_obsolete;
947 		po_lex_obsolete = false;
948 		signal_eilseq = true;
949 		return COMMENT;
950 	      }
951 	    else
952 	      {
953 		/* We do this in separate loop because collecting large
954 		   comments while they get not passed to the upper layers
955 		   is not very efficient.  */
956 		while (!mb_iseof (mbc) && !mb_iseq (mbc, '\n'))
957 		  lex_getc (mbc);
958 		po_lex_obsolete = false;
959 		signal_eilseq = true;
960 	      }
961 	    break;
962 
963 	  case '"':
964 	    /* Accumulate a string.  */
965 	    bufpos = 0;
966 	    for (;;)
967 	      {
968 		lex_getc (mbc);
969 		while (bufpos + mb_len (mbc) >= bufmax)
970 		  {
971 		    bufmax += 100;
972 		    buf = xrealloc (buf, bufmax);
973 		  }
974 		if (mb_iseof (mbc))
975 		  {
976 		    po_gram_error_at_line (&gram_pos,
977 					   _("end-of-file within string"));
978 		    break;
979 		  }
980 		if (mb_iseq (mbc, '\n'))
981 		  {
982 		    po_gram_error_at_line (&gram_pos,
983 					   _("end-of-line within string"));
984 		    break;
985 		  }
986 		if (mb_iseq (mbc, '"'))
987 		  break;
988 		if (mb_iseq (mbc, '\\'))
989 		  {
990 		    buf[bufpos++] = control_sequence ();
991 		    continue;
992 		  }
993 
994 		/* Add mbc to the accumulator.  */
995 		memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
996 		bufpos += mb_len (mbc);
997 	      }
998 	    buf[bufpos] = '\0';
999 
1000 	    /* Strings cannot contain the msgctxt separator, because it cannot
1001 	       be faithfully represented in the msgid of a .mo file.  */
1002 	    if (strchr (buf, MSGCTXT_SEPARATOR) != NULL)
1003 	      po_gram_error_at_line (&gram_pos,
1004 				     _("context separator <EOT> within string"));
1005 
1006 	    /* FIXME: Treatment of embedded \000 chars is incorrect.  */
1007 	    po_gram_lval.string.string = xstrdup (buf);
1008 	    po_gram_lval.string.pos = gram_pos;
1009 	    po_gram_lval.string.obsolete = po_lex_obsolete;
1010 	    return (po_lex_previous ? PREV_STRING : STRING);
1011 
1012 	  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1013 	  case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1014 	  case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1015 	  case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1016 	  case 'y': case 'z':
1017 	  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1018 	  case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1019 	  case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1020 	  case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1021 	  case 'Y': case 'Z':
1022 	  case '_': case '$':
1023 	    bufpos = 0;
1024 	    for (;;)
1025 	      {
1026 		char c = mb_ptr (mbc) [0];
1027 		if (bufpos + 1 >= bufmax)
1028 		  {
1029 		    bufmax += 100;
1030 		    buf = xrealloc (buf, bufmax);
1031 		  }
1032 		buf[bufpos++] = c;
1033 		lex_getc (mbc);
1034 		if (mb_len (mbc) == 1)
1035 		  switch (mb_ptr (mbc) [0])
1036 		    {
1037 		    default:
1038 		      break;
1039 		    case 'a': case 'b': case 'c': case 'd': case 'e':
1040 		    case 'f': case 'g': case 'h': case 'i': case 'j':
1041 		    case 'k': case 'l': case 'm': case 'n': case 'o':
1042 		    case 'p': case 'q': case 'r': case 's': case 't':
1043 		    case 'u': case 'v': case 'w': case 'x': case 'y':
1044 		    case 'z':
1045 		    case 'A': case 'B': case 'C': case 'D': case 'E':
1046 		    case 'F': case 'G': case 'H': case 'I': case 'J':
1047 		    case 'K': case 'L': case 'M': case 'N': case 'O':
1048 		    case 'P': case 'Q': case 'R': case 'S': case 'T':
1049 		    case 'U': case 'V': case 'W': case 'X': case 'Y':
1050 		    case 'Z':
1051 		    case '_': case '$':
1052 		    case '0': case '1': case '2': case '3': case '4':
1053 		    case '5': case '6': case '7': case '8': case '9':
1054 		      continue;
1055 		    }
1056 		break;
1057 	      }
1058 	    lex_ungetc (mbc);
1059 
1060 	    buf[bufpos] = '\0';
1061 
1062 	    {
1063 	      int k = keyword_p (buf);
1064 	      if (k == NAME)
1065 		{
1066 		  po_gram_lval.string.string = xstrdup (buf);
1067 		  po_gram_lval.string.pos = gram_pos;
1068 		  po_gram_lval.string.obsolete = po_lex_obsolete;
1069 		}
1070 	      else
1071 		{
1072 		  po_gram_lval.pos.pos = gram_pos;
1073 		  po_gram_lval.pos.obsolete = po_lex_obsolete;
1074 		}
1075 	      return k;
1076 	    }
1077 
1078 	  case '0': case '1': case '2': case '3': case '4':
1079 	  case '5': case '6': case '7': case '8': case '9':
1080 	    bufpos = 0;
1081 	    for (;;)
1082 	      {
1083 		char c = mb_ptr (mbc) [0];
1084 		if (bufpos + 1 >= bufmax)
1085 		  {
1086 		    bufmax += 100;
1087 		    buf = xrealloc (buf, bufmax + 1);
1088 		  }
1089 		buf[bufpos++] = c;
1090 		lex_getc (mbc);
1091 		if (mb_len (mbc) == 1)
1092 		  switch (mb_ptr (mbc) [0])
1093 		    {
1094 		    default:
1095 		      break;
1096 
1097 		    case '0': case '1': case '2': case '3': case '4':
1098 		    case '5': case '6': case '7': case '8': case '9':
1099 		      continue;
1100 		    }
1101 		break;
1102 	      }
1103 	    lex_ungetc (mbc);
1104 
1105 	    buf[bufpos] = '\0';
1106 
1107 	    po_gram_lval.number.number = atol (buf);
1108 	    po_gram_lval.number.pos = gram_pos;
1109 	    po_gram_lval.number.obsolete = po_lex_obsolete;
1110 	    return NUMBER;
1111 
1112 	  case '[':
1113 	    po_gram_lval.pos.pos = gram_pos;
1114 	    po_gram_lval.pos.obsolete = po_lex_obsolete;
1115 	    return '[';
1116 
1117 	  case ']':
1118 	    po_gram_lval.pos.pos = gram_pos;
1119 	    po_gram_lval.pos.obsolete = po_lex_obsolete;
1120 	    return ']';
1121 
1122 	  default:
1123 	    /* This will cause a syntax error.  */
1124 	    return JUNK;
1125 	  }
1126       else
1127 	/* This will cause a syntax error.  */
1128 	return JUNK;
1129     }
1130 }
1131 
1132 
1133 /* po_gram_lex() can return comments as COMMENT.  Switch this on or off.  */
1134 void
po_lex_pass_comments(bool flag)1135 po_lex_pass_comments (bool flag)
1136 {
1137   pass_comments = flag;
1138 }
1139 
1140 
1141 /* po_gram_lex() can return obsolete entries as if they were normal entries.
1142    Switch this on or off.  */
1143 void
po_lex_pass_obsolete_entries(bool flag)1144 po_lex_pass_obsolete_entries (bool flag)
1145 {
1146   pass_obsolete_entries = flag;
1147 }
1148