1 /* xgettext Scheme backend.
2    Copyright (C) 2004-2009, 2011, 2014, 2018-2020 Free Software Foundation, Inc.
3 
4    This file was written by Bruno Haible <bruno@clisp.org>, 2004-2005.
5 
6    This program is free software: you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
18 
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22 
23 /* Specification.  */
24 #include "x-scheme.h"
25 
26 #include <errno.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 
32 #include "message.h"
33 #include "xgettext.h"
34 #include "xg-pos.h"
35 #include "xg-mixed-string.h"
36 #include "xg-arglist-context.h"
37 #include "xg-arglist-callshape.h"
38 #include "xg-arglist-parser.h"
39 #include "xg-message.h"
40 #include "error.h"
41 #include "xalloc.h"
42 #include "mem-hash-map.h"
43 #include "gettext.h"
44 
45 #define _(s) gettext(s)
46 
47 
48 /* The Scheme syntax is described in R5RS.  It is implemented in
49    guile-2.0.0/libguile/read.c.
50    Since we are interested only in strings and in forms similar to
51         (gettext msgid ...)
52    or   (ngettext msgid msgid_plural ...)
53    we make the following simplifications:
54 
55    - Assume the keywords and strings are in an ASCII compatible encoding.
56      This means we can read the input file one byte at a time, instead of
57      one character at a time.  No need to worry about multibyte characters:
58      If they occur as part of identifiers, they most probably act as
59      constituent characters, and the byte based approach will do the same.
60 
61    - Assume the read-hash-procedures is in the default state.
62      Non-standard reader extensions are mostly used to read data, not programs.
63 
64    The remaining syntax rules are:
65 
66    - The syntax code assigned to each character, and how tokens are built
67      up from characters (single escape, multiple escape etc.).
68 
69    - Comment syntax: ';' and '#! ... !#' and '#| ... |#' (may be nested).
70 
71    - String syntax: "..." with single escapes.
72 
73    - Read macros and dispatch macro character '#'.  Needed to be able to
74      tell which is the n-th argument of a function call.
75 
76  */
77 
78 
79 /* ====================== Keyword set customization.  ====================== */
80 
81 /* If true extract all strings.  */
82 static bool extract_all = false;
83 
84 static hash_table keywords;
85 static bool default_keywords = true;
86 
87 
88 void
x_scheme_extract_all()89 x_scheme_extract_all ()
90 {
91   extract_all = true;
92 }
93 
94 
95 void
x_scheme_keyword(const char * name)96 x_scheme_keyword (const char *name)
97 {
98   if (name == NULL)
99     default_keywords = false;
100   else
101     {
102       const char *end;
103       struct callshape shape;
104       const char *colon;
105 
106       if (keywords.table == NULL)
107         hash_init (&keywords, 100);
108 
109       split_keywordspec (name, &end, &shape);
110 
111       /* The characters between name and end should form a valid Lisp symbol.
112          Extract the symbol name part.  */
113       colon = strchr (name, ':');
114       if (colon != NULL && colon < end)
115         {
116           name = colon + 1;
117           if (name < end && *name == ':')
118             name++;
119           colon = strchr (name, ':');
120           if (colon != NULL && colon < end)
121             return;
122         }
123 
124       insert_keyword_callshape (&keywords, name, end - name, &shape);
125     }
126 }
127 
128 /* Finish initializing the keywords hash table.
129    Called after argument processing, before each file is processed.  */
130 static void
init_keywords()131 init_keywords ()
132 {
133   if (default_keywords)
134     {
135       /* When adding new keywords here, also update the documentation in
136          xgettext.texi!  */
137       x_scheme_keyword ("gettext");             /* libguile/i18n.c */
138       x_scheme_keyword ("ngettext:1,2");        /* libguile/i18n.c */
139       x_scheme_keyword ("gettext-noop");
140       default_keywords = false;
141     }
142 }
143 
144 void
init_flag_table_scheme()145 init_flag_table_scheme ()
146 {
147   xgettext_record_flag ("gettext:1:pass-scheme-format");
148   xgettext_record_flag ("ngettext:1:pass-scheme-format");
149   xgettext_record_flag ("ngettext:2:pass-scheme-format");
150   xgettext_record_flag ("gettext-noop:1:pass-scheme-format");
151   xgettext_record_flag ("format:2:scheme-format");
152 }
153 
154 
155 /* ======================== Reading of characters.  ======================== */
156 
157 /* The input file stream.  */
158 static FILE *fp;
159 
160 
161 /* Fetch the next character from the input file.  */
162 static int
do_getc()163 do_getc ()
164 {
165   int c = getc (fp);
166 
167   if (c == EOF)
168     {
169       if (ferror (fp))
170         error (EXIT_FAILURE, errno,
171                _("error while reading \"%s\""), real_file_name);
172     }
173   else if (c == '\n')
174    line_number++;
175 
176   return c;
177 }
178 
179 /* Put back the last fetched character, not EOF.  */
180 static void
do_ungetc(int c)181 do_ungetc (int c)
182 {
183   if (c == '\n')
184     line_number--;
185   ungetc (c, fp);
186 }
187 
188 
189 /* ========================== Reading of tokens.  ========================== */
190 
191 
192 /* A token consists of a sequence of characters.  */
193 struct token
194 {
195   int allocated;                /* number of allocated 'token_char's */
196   int charcount;                /* number of used 'token_char's */
197   char *chars;                  /* the token's constituents */
198 };
199 
200 /* Initialize a 'struct token'.  */
201 static inline void
init_token(struct token * tp)202 init_token (struct token *tp)
203 {
204   tp->allocated = 10;
205   tp->chars = XNMALLOC (tp->allocated, char);
206   tp->charcount = 0;
207 }
208 
209 /* Free the memory pointed to by a 'struct token'.  */
210 static inline void
free_token(struct token * tp)211 free_token (struct token *tp)
212 {
213   free (tp->chars);
214 }
215 
216 /* Ensure there is enough room in the token for one more character.  */
217 static inline void
grow_token(struct token * tp)218 grow_token (struct token *tp)
219 {
220   if (tp->charcount == tp->allocated)
221     {
222       tp->allocated *= 2;
223       tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
224     }
225 }
226 
227 /* Read the next token.  'first' is the first character, which has already
228    been read.  */
229 static void
read_token(struct token * tp,int first)230 read_token (struct token *tp, int first)
231 {
232   init_token (tp);
233 
234   grow_token (tp);
235   tp->chars[tp->charcount++] = first;
236 
237   for (;;)
238     {
239       int c = do_getc ();
240 
241       if (c == EOF)
242         break;
243       if (c == ' ' || c == '\r' || c == '\f' || c == '\t' || c == '\n'
244           || c == '"' || c == '(' || c == ')' || c == ';')
245         {
246           do_ungetc (c);
247           break;
248         }
249       grow_token (tp);
250       tp->chars[tp->charcount++] = c;
251     }
252 }
253 
254 /* Tests if a token represents an integer.
255    Taken from guile-1.6.4/libguile/numbers.c:scm_istr2int().  */
256 static inline bool
is_integer_syntax(const char * str,int len,int radix)257 is_integer_syntax (const char *str, int len, int radix)
258 {
259   const char *p = str;
260   const char *p_end = str + len;
261 
262   /* The accepted syntax is
263        ['+'|'-'] DIGIT+
264      where DIGIT is a hexadecimal digit whose value is below radix.  */
265 
266   if (p == p_end)
267     return false;
268   if (*p == '+' || *p == '-')
269     {
270       p++;
271       if (p == p_end)
272         return false;
273     }
274   do
275     {
276       int c = *p++;
277 
278       if (c >= '0' && c <= '9')
279         c = c - '0';
280       else if (c >= 'A' && c <= 'F')
281         c = c - 'A' + 10;
282       else if (c >= 'a' && c <= 'f')
283         c = c - 'a' + 10;
284       else
285         return false;
286       if (c >= radix)
287         return false;
288     }
289   while (p < p_end);
290   return true;
291 }
292 
293 /* Tests if a token represents a rational, floating-point or complex number.
294    If unconstrained is false, only real numbers are accepted; otherwise,
295    complex numbers are accepted as well.
296    Taken from guile-1.6.4/libguile/numbers.c:scm_istr2flo().  */
297 static inline bool
is_other_number_syntax(const char * str,int len,int radix,bool unconstrained)298 is_other_number_syntax (const char *str, int len, int radix, bool unconstrained)
299 {
300   const char *p = str;
301   const char *p_end = str + len;
302   bool seen_sign;
303   bool seen_digits;
304 
305   /* The accepted syntaxes are:
306      for a floating-point number:
307        ['+'|'-'] DIGIT+ [EXPONENT]
308        ['+'|'-'] DIGIT* '.' DIGIT+ [EXPONENT]
309        where EXPONENT ::= ['d'|'e'|'f'|'l'|'s'] DIGIT+
310        (Dot and exponent are allowed only if radix is 10.)
311      for a rational number:
312        ['+'|'-'] DIGIT+ '/' DIGIT+
313      for a complex number:
314        REAL-NUMBER {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
315        REAL-NUMBER {'+'|'-'} 'i'
316        {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
317        {'+'|'-'} 'i'
318        REAL-NUMBER '@' REAL-NUMBER
319    */
320   if (p == p_end)
321     return false;
322   /* Parse leading sign.  */
323   seen_sign = false;
324   if (*p == '+' || *p == '-')
325     {
326       p++;
327       if (p == p_end)
328         return false;
329       seen_sign = true;
330       /* Recognize complex number syntax: {'+'|'-'} 'i'  */
331       if (unconstrained && (*p == 'I' || *p == 'i') && p + 1 == p_end)
332         return true;
333     }
334   /* Parse digits before dot or exponent or slash.  */
335   seen_digits = false;
336   do
337     {
338       int c = *p;
339 
340       if (c >= '0' && c <= '9')
341         c = c - '0';
342       else if (c >= 'A' && c <= 'F')
343         {
344           if (c >= 'D' && radix == 10) /* exponent? */
345             break;
346           c = c - 'A' + 10;
347         }
348       else if (c >= 'a' && c <= 'f')
349         {
350           if (c >= 'd' && radix == 10) /* exponent? */
351             break;
352           c = c - 'a' + 10;
353         }
354       else
355         break;
356       if (c >= radix)
357         return false;
358       seen_digits = true;
359       p++;
360     }
361   while (p < p_end);
362   /* If p == p_end, we know that seen_digits = true, and the number is an
363      integer without exponent.  */
364   if (p < p_end)
365     {
366       /* If we have no digits so far, we need a decimal point later.  */
367       if (!seen_digits && !(*p == '.' && radix == 10))
368         return false;
369       /* Trailing '#' signs are equivalent to zeroes.  */
370       while (p < p_end && *p == '#')
371         p++;
372       if (p < p_end)
373         {
374           if (*p == '/')
375             {
376               /* Parse digits after the slash.  */
377               bool all_zeroes = true;
378               p++;
379               for (; p < p_end; p++)
380                 {
381                   int c = *p;
382 
383                   if (c >= '0' && c <= '9')
384                     c = c - '0';
385                   else if (c >= 'A' && c <= 'F')
386                     c = c - 'A' + 10;
387                   else if (c >= 'a' && c <= 'f')
388                     c = c - 'a' + 10;
389                   else
390                     break;
391                   if (c >= radix)
392                     return false;
393                   if (c != 0)
394                     all_zeroes = false;
395                 }
396               /* A zero denominator is not allowed.  */
397               if (all_zeroes)
398                 return false;
399               /* Trailing '#' signs are equivalent to zeroes.  */
400               while (p < p_end && *p == '#')
401                 p++;
402             }
403           else
404             {
405               if (*p == '.')
406                 {
407                   /* Decimal point notation.  */
408                   if (radix != 10)
409                     return false;
410                   /* Parse digits after the decimal point.  */
411                   p++;
412                   for (; p < p_end; p++)
413                     {
414                       int c = *p;
415 
416                       if (c >= '0' && c <= '9')
417                         seen_digits = true;
418                       else
419                         break;
420                     }
421                   /* Digits are required before or after the decimal point.  */
422                   if (!seen_digits)
423                     return false;
424                   /* Trailing '#' signs are equivalent to zeroes.  */
425                   while (p < p_end && *p == '#')
426                     p++;
427                 }
428               if (p < p_end)
429                 {
430                   /* Parse exponent.  */
431                   switch (*p)
432                     {
433                     case 'D': case 'd':
434                     case 'E': case 'e':
435                     case 'F': case 'f':
436                     case 'L': case 'l':
437                     case 'S': case 's':
438                       if (radix != 10)
439                         return false;
440                       p++;
441                       if (p == p_end)
442                         return false;
443                       if (*p == '+' || *p == '-')
444                         {
445                           p++;
446                           if (p == p_end)
447                             return false;
448                         }
449                       if (!(*p >= '0' && *p <= '9'))
450                         return false;
451                       for (;;)
452                         {
453                           p++;
454                           if (p == p_end)
455                             break;
456                           if (!(*p >= '0' && *p <= '9'))
457                             break;
458                         }
459                       break;
460                     default:
461                       break;
462                     }
463                 }
464             }
465         }
466     }
467   if (p == p_end)
468     return true;
469   /* Recognize complex number syntax.  */
470   if (unconstrained)
471     {
472       /* Recognize the syntax  {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'  */
473       if (seen_sign && (*p == 'I' || *p == 'i') && p + 1 == p_end)
474         return true;
475       /* Recognize the syntaxes
476            REAL-NUMBER {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
477            REAL-NUMBER {'+'|'-'} 'i'
478        */
479       if (*p == '+' || *p == '-')
480         return (p_end[-1] == 'I' || p_end[-1] == 'i')
481                 && (p + 1 == p_end - 1
482                     || is_other_number_syntax (p, p_end - 1 - p, radix, false));
483       /* Recognize the syntax  REAL-NUMBER '@' REAL-NUMBER  */
484       if (*p == '@')
485         {
486           p++;
487           return is_other_number_syntax (p, p_end - p, radix, false);
488         }
489     }
490   return false;
491 }
492 
493 /* Tests if a token represents a number.
494    Taken from guile-1.6.4/libguile/numbers.c:scm_istring2number().  */
495 static bool
is_number(const struct token * tp)496 is_number (const struct token *tp)
497 {
498   const char *str = tp->chars;
499   int len = tp->charcount;
500   enum { unknown, exact, inexact } exactness = unknown;
501   bool seen_radix_prefix = false;
502   bool seen_exactness_prefix = false;
503 
504   if (len == 1)
505     if (*str == '+' || *str == '-')
506       return false;
507   while (len >= 2 && *str == '#')
508     {
509       switch (str[1])
510         {
511         case 'B': case 'b':
512           if (seen_radix_prefix)
513             return false;
514           seen_radix_prefix = true;
515           break;
516         case 'O': case 'o':
517           if (seen_radix_prefix)
518             return false;
519           seen_radix_prefix = true;
520           break;
521         case 'D': case 'd':
522           if (seen_radix_prefix)
523             return false;
524           seen_radix_prefix = true;
525           break;
526         case 'X': case 'x':
527           if (seen_radix_prefix)
528             return false;
529           seen_radix_prefix = true;
530           break;
531         case 'E': case 'e':
532           if (seen_exactness_prefix)
533             return false;
534           exactness = exact;
535           seen_exactness_prefix = true;
536           break;
537         case 'I': case 'i':
538           if (seen_exactness_prefix)
539             return false;
540           exactness = inexact;
541           seen_exactness_prefix = true;
542           break;
543         default:
544           return false;
545         }
546       str += 2;
547       len -= 2;
548     }
549   if (exactness != inexact)
550     {
551       /* Try to parse an integer.  */
552       if (is_integer_syntax (str, len, 10))
553         return true;
554       /* FIXME: Other Scheme implementations support exact rational numbers
555          or exact complex numbers.  */
556     }
557   if (exactness != exact)
558     {
559       /* Try to parse a rational, floating-point or complex number.  */
560       if (is_other_number_syntax (str, len, 10, true))
561         return true;
562     }
563   return false;
564 }
565 
566 
567 /* ========================= Accumulating comments ========================= */
568 
569 
570 static char *buffer;
571 static size_t bufmax;
572 static size_t buflen;
573 
574 static inline void
comment_start()575 comment_start ()
576 {
577   buflen = 0;
578 }
579 
580 static inline void
comment_add(int c)581 comment_add (int c)
582 {
583   if (buflen >= bufmax)
584     {
585       bufmax = 2 * bufmax + 10;
586       buffer = xrealloc (buffer, bufmax);
587     }
588   buffer[buflen++] = c;
589 }
590 
591 static inline void
comment_line_end(size_t chars_to_remove)592 comment_line_end (size_t chars_to_remove)
593 {
594   buflen -= chars_to_remove;
595   while (buflen >= 1
596          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
597     --buflen;
598   if (chars_to_remove == 0 && buflen >= bufmax)
599     {
600       bufmax = 2 * bufmax + 10;
601       buffer = xrealloc (buffer, bufmax);
602     }
603   buffer[buflen] = '\0';
604   savable_comment_add (buffer);
605 }
606 
607 
608 /* These are for tracking whether comments count as immediately before
609    keyword.  */
610 static int last_comment_line;
611 static int last_non_comment_line;
612 
613 
614 /* ========================= Accumulating messages ========================= */
615 
616 
617 static message_list_ty *mlp;
618 
619 
620 /* ========================== Reading of objects.  ========================= */
621 
622 
623 /* We are only interested in symbols (e.g. gettext or ngettext) and strings.
624    Other objects need not to be represented precisely.  */
625 enum object_type
626 {
627   t_symbol,     /* symbol */
628   t_string,     /* string */
629   t_other,      /* other kind of real object */
630   t_dot,        /* '.' pseudo object */
631   t_close,      /* ')' pseudo object */
632   t_eof         /* EOF marker */
633 };
634 
635 struct object
636 {
637   enum object_type type;
638   struct token *token;          /* for t_symbol and t_string */
639   int line_number_at_start;     /* for t_string */
640 };
641 
642 /* Free the memory pointed to by a 'struct object'.  */
643 static inline void
free_object(struct object * op)644 free_object (struct object *op)
645 {
646   if (op->type == t_symbol || op->type == t_string)
647     {
648       free_token (op->token);
649       free (op->token);
650     }
651 }
652 
653 /* Convert a t_symbol/t_string token to a char*.  */
654 static char *
string_of_object(const struct object * op)655 string_of_object (const struct object *op)
656 {
657   char *str;
658   int n;
659 
660   if (!(op->type == t_symbol || op->type == t_string))
661     abort ();
662   n = op->token->charcount;
663   str = XNMALLOC (n + 1, char);
664   memcpy (str, op->token->chars, n);
665   str[n] = '\0';
666   return str;
667 }
668 
669 /* Context lookup table.  */
670 static flag_context_list_table_ty *flag_context_list_table;
671 
672 /* Read the next object.  */
673 static void
read_object(struct object * op,flag_context_ty outer_context)674 read_object (struct object *op, flag_context_ty outer_context)
675 {
676   for (;;)
677     {
678       int c = do_getc ();
679       bool seen_underscore_prefix = false;
680 
681       switch (c)
682         {
683         case EOF:
684           op->type = t_eof;
685           return;
686 
687         case ' ': case '\r': case '\f': case '\t':
688           continue;
689 
690         case '\n':
691           /* Comments assumed to be grouped with a message must immediately
692              precede it, with no non-whitespace token on a line between
693              both.  */
694           if (last_non_comment_line > last_comment_line)
695             savable_comment_reset ();
696           continue;
697 
698         case ';':
699           {
700             bool all_semicolons = true;
701 
702             last_comment_line = line_number;
703             comment_start ();
704             for (;;)
705               {
706                 c = do_getc ();
707                 if (c == EOF || c == '\n')
708                   break;
709                 if (c != ';')
710                   all_semicolons = false;
711                 if (!all_semicolons)
712                   {
713                     /* We skip all leading white space, but not EOLs.  */
714                     if (!(buflen == 0 && (c == ' ' || c == '\t')))
715                       comment_add (c);
716                   }
717               }
718             comment_line_end (0);
719             continue;
720           }
721 
722         case '(':
723           {
724             int arg = 0;                /* Current argument number.  */
725             flag_context_list_iterator_ty context_iter;
726             const struct callshapes *shapes = NULL;
727             struct arglist_parser *argparser = NULL;
728 
729              for (;; arg++)
730                {
731                 struct object inner;
732                 flag_context_ty inner_context;
733 
734                 if (arg == 0)
735                   inner_context = null_context;
736                 else
737                   inner_context =
738                     inherited_context (outer_context,
739                                        flag_context_list_iterator_advance (
740                                          &context_iter));
741 
742                 read_object (&inner, inner_context);
743 
744                 /* Recognize end of list.  */
745                 if (inner.type == t_close)
746                   {
747                     op->type = t_other;
748                     last_non_comment_line = line_number;
749                     if (argparser != NULL)
750                       arglist_parser_done (argparser, arg);
751                     return;
752                   }
753 
754                 /* Dots are not allowed in every position.
755                    But be tolerant.  */
756 
757                 /* EOF inside list is illegal.
758                    But be tolerant.  */
759                 if (inner.type == t_eof)
760                   break;
761 
762                 if (arg == 0)
763                   {
764                     /* This is the function position.  */
765                     if (inner.type == t_symbol)
766                       {
767                         char *symbol_name = string_of_object (&inner);
768                         void *keyword_value;
769 
770                         if (hash_find_entry (&keywords,
771                                              symbol_name, strlen (symbol_name),
772                                              &keyword_value)
773                             == 0)
774                           shapes = (const struct callshapes *) keyword_value;
775 
776                         argparser = arglist_parser_alloc (mlp, shapes);
777 
778                         context_iter =
779                           flag_context_list_iterator (
780                             flag_context_list_table_lookup (
781                               flag_context_list_table,
782                               symbol_name, strlen (symbol_name)));
783 
784                         free (symbol_name);
785                       }
786                     else
787                       context_iter = null_context_list_iterator;
788                   }
789                 else
790                   {
791                     /* These are the argument positions.  */
792                     if (argparser != NULL && inner.type == t_string)
793                       {
794                         char *s = string_of_object (&inner);
795                         mixed_string_ty *ms =
796                           mixed_string_alloc_simple (s, lc_string,
797                                                      logical_file_name,
798                                                      inner.line_number_at_start);
799                         free (s);
800                         arglist_parser_remember (argparser, arg, ms,
801                                                  inner_context,
802                                                  logical_file_name,
803                                                  inner.line_number_at_start,
804                                                  savable_comment, false);
805                       }
806                   }
807 
808                 free_object (&inner);
809               }
810             if (argparser != NULL)
811               arglist_parser_done (argparser, arg);
812           }
813           op->type = t_other;
814           last_non_comment_line = line_number;
815           return;
816 
817         case ')':
818           /* Tell the caller about the end of list.
819              Unmatched closing parenthesis is illegal.
820              But be tolerant.  */
821           op->type = t_close;
822           last_non_comment_line = line_number;
823           return;
824 
825         case ',':
826           {
827             int c = do_getc ();
828             /* The ,@ handling inside lists is wrong anyway, because
829                ,@form expands to an unknown number of elements.  */
830             if (c != EOF && c != '@')
831               do_ungetc (c);
832           }
833           /*FALLTHROUGH*/
834         case '\'':
835         case '`':
836           {
837             struct object inner;
838 
839             read_object (&inner, null_context);
840 
841             /* Dots and EOF are not allowed here.  But be tolerant.  */
842 
843             free_object (&inner);
844 
845             op->type = t_other;
846             last_non_comment_line = line_number;
847             return;
848           }
849 
850         case '#':
851           /* Dispatch macro handling.  */
852           {
853             c = do_getc ();
854             if (c == EOF)
855               /* Invalid input.  Be tolerant, no error message.  */
856               {
857                 op->type = t_other;
858                 return;
859               }
860 
861             switch (c)
862               {
863               case '(': /* Vector */
864                 do_ungetc (c);
865                 {
866                   struct object inner;
867                   read_object (&inner, null_context);
868                   /* Dots and EOF are not allowed here.
869                      But be tolerant.  */
870                   free_object (&inner);
871                   op->type = t_other;
872                   last_non_comment_line = line_number;
873                   return;
874                 }
875 
876               case 'T': case 't': /* Boolean true */
877               case 'F': /* Boolean false */
878                 op->type = t_other;
879                 last_non_comment_line = line_number;
880                 return;
881 
882               case 'a':
883               case 'c':
884               case 'f':
885               case 'h':
886               case 'l':
887               case 's':
888               case 'u':
889               case 'v':
890               case 'y':
891                 {
892                   struct token token;
893                   do_ungetc (c);
894                   read_token (&token, '#');
895                   if ((token.charcount == 2
896                        && (token.chars[1] == 'a' || token.chars[1] == 'c'
897                            || token.chars[1] == 'h' || token.chars[1] == 'l'
898                            || token.chars[1] == 's' || token.chars[1] == 'u'
899                            || token.chars[1] == 'y'))
900                       || (token.charcount == 3
901                           && (token.chars[1] == 's' || token.chars[1] == 'u')
902                           && token.chars[2] == '8')
903                       || (token.charcount == 4
904                           && (((token.chars[1] == 's' || token.chars[1] == 'u')
905                                && token.chars[2] == '1'
906                                && token.chars[3] == '6')
907                               || ((token.chars[1] == 'c'
908                                    || token.chars[1] == 'f'
909                                    || token.chars[1] == 's'
910                                    || token.chars[1] == 'u')
911                                   && ((token.chars[2] == '3'
912                                        && token.chars[3] == '2')
913                                       || (token.chars[2] == '6'
914                                           && token.chars[3] == '4')))
915                               || (token.chars[1] == 'v'
916                                   && token.chars[2] == 'u'
917                                   && token.chars[3] == '8'))))
918                     {
919                       c = do_getc ();
920                       if (c != EOF)
921                         do_ungetc (c);
922                       if (c == '(')
923                         {
924                           /* Homogenous vector syntax:
925                                #a(...) - vector of char
926                                #c(...) - vector of complex (old)
927                                #c32(...) - vector of complex of single-float
928                                #c64(...) - vector of complex of double-float
929                                #f32(...) - vector of single-float
930                                #f64(...) - vector of double-float
931                                #h(...) - vector of short (old)
932                                #l(...) - vector of long long (old)
933                                #s(...) - vector of single-float (old)
934                                #s8(...) - vector of signed 8-bit integers
935                                #s16(...) - vector of signed 16-bit integers
936                                #s32(...) - vector of signed 32-bit integers
937                                #s64(...) - vector of signed 64-bit integers
938                                #u(...) - vector of unsigned long (old)
939                                #u8(...) - vector of unsigned 8-bit integers
940                                #u16(...) - vector of unsigned 16-bit integers
941                                #u32(...) - vector of unsigned 32-bit integers
942                                #u64(...) - vector of unsigned 64-bit integers
943                                #vu8(...) - vector of byte
944                                #y(...) - vector of byte (old)
945                            */
946                           struct object inner;
947                           read_object (&inner, null_context);
948                           /* Dots and EOF are not allowed here.
949                              But be tolerant.  */
950                           free_token (&token);
951                           free_object (&inner);
952                           op->type = t_other;
953                           last_non_comment_line = line_number;
954                           return;
955                         }
956                     }
957                   /* Boolean false, or unknown # object.  But be tolerant.  */
958                   free_token (&token);
959                   op->type = t_other;
960                   last_non_comment_line = line_number;
961                   return;
962                 }
963 
964               case 'B': case 'b':
965               case 'O': case 'o':
966               case 'D': case 'd':
967               case 'X': case 'x':
968               case 'E': case 'e':
969               case 'I': case 'i':
970                 {
971                   struct token token;
972                   do_ungetc (c);
973                   read_token (&token, '#');
974                   if (is_number (&token))
975                     {
976                       /* A number.  */
977                       free_token (&token);
978                       op->type = t_other;
979                       last_non_comment_line = line_number;
980                       return;
981                     }
982                   else
983                     {
984                       if (token.charcount == 2
985                           && (token.chars[1] == 'e' || token.chars[1] == 'i'))
986                         {
987                           c = do_getc ();
988                           if (c != EOF)
989                             do_ungetc (c);
990                           if (c == '(')
991                             {
992                               /* Homogenous vector syntax:
993                                    #e(...) - vector of long (old)
994                                    #i(...) - vector of double-float (old)
995                                */
996                               struct object inner;
997                               read_object (&inner, null_context);
998                               /* Dots and EOF are not allowed here.
999                                  But be tolerant.  */
1000                               free_token (&token);
1001                               free_object (&inner);
1002                               op->type = t_other;
1003                               last_non_comment_line = line_number;
1004                               return;
1005                             }
1006                         }
1007                       /* Unknown # object.  But be tolerant.  */
1008                       free_token (&token);
1009                       op->type = t_other;
1010                       last_non_comment_line = line_number;
1011                       return;
1012                     }
1013                 }
1014 
1015               case '!':
1016                 /* Block comment '#! ... !#'.  See
1017                    <https://www.gnu.org/software/guile/manual/html_node/Block-Comments.html>.  */
1018                 {
1019                   int c;
1020 
1021                   comment_start ();
1022                   c = do_getc ();
1023                   for (;;)
1024                     {
1025                       if (c == EOF)
1026                         break;
1027                       if (c == '!')
1028                         {
1029                           c = do_getc ();
1030                           if (c == EOF)
1031                             break;
1032                           if (c == '#')
1033                             {
1034                               comment_line_end (0);
1035                               break;
1036                             }
1037                           else
1038                             comment_add ('!');
1039                         }
1040                       else
1041                         {
1042                           /* We skip all leading white space.  */
1043                           if (!(buflen == 0 && (c == ' ' || c == '\t')))
1044                             comment_add (c);
1045                           if (c == '\n')
1046                             {
1047                               comment_line_end (1);
1048                               comment_start ();
1049                             }
1050                           c = do_getc ();
1051                         }
1052                     }
1053                   if (c == EOF)
1054                     {
1055                       /* EOF not allowed here.  But be tolerant.  */
1056                       op->type = t_eof;
1057                       return;
1058                     }
1059                   last_comment_line = line_number;
1060                   continue;
1061                 }
1062 
1063               case '|':
1064                 /* Block comment '#| ... |#'.  See
1065                    <https://www.gnu.org/software/guile/manual/html_node/Block-Comments.html>
1066                    and <https://srfi.schemers.org/srfi-30/srfi-30.html>.  */
1067                 {
1068                   int depth = 0;
1069                   int c;
1070 
1071                   comment_start ();
1072                   c = do_getc ();
1073                   for (;;)
1074                     {
1075                       if (c == EOF)
1076                         break;
1077                       if (c == '|')
1078                         {
1079                           c = do_getc ();
1080                           if (c == EOF)
1081                             break;
1082                           if (c == '#')
1083                             {
1084                               if (depth == 0)
1085                                 {
1086                                   comment_line_end (0);
1087                                   break;
1088                                 }
1089                               depth--;
1090                               comment_add ('|');
1091                               comment_add ('#');
1092                               c = do_getc ();
1093                             }
1094                           else
1095                             comment_add ('|');
1096                         }
1097                       else if (c == '#')
1098                         {
1099                           c = do_getc ();
1100                           if (c == EOF)
1101                             break;
1102                           comment_add ('#');
1103                           if (c == '|')
1104                             {
1105                               depth++;
1106                               comment_add ('|');
1107                               c = do_getc ();
1108                             }
1109                         }
1110                       else
1111                         {
1112                           /* We skip all leading white space.  */
1113                           if (!(buflen == 0 && (c == ' ' || c == '\t')))
1114                             comment_add (c);
1115                           if (c == '\n')
1116                             {
1117                               comment_line_end (1);
1118                               comment_start ();
1119                             }
1120                           c = do_getc ();
1121                         }
1122                     }
1123                   if (c == EOF)
1124                     {
1125                       /* EOF not allowed here.  But be tolerant.  */
1126                       op->type = t_eof;
1127                       return;
1128                     }
1129                   last_comment_line = line_number;
1130                   continue;
1131                 }
1132 
1133               case '*':
1134                 /* Bit vector.  */
1135                 {
1136                   struct token token;
1137                   read_token (&token, c);
1138                   /* The token should consists only of '0' and '1', except
1139                      for the initial '*'.  But be tolerant.  */
1140                   free_token (&token);
1141                   op->type = t_other;
1142                   last_non_comment_line = line_number;
1143                   return;
1144                 }
1145 
1146               case '{':
1147                 /* Symbol with multiple escapes: #{...}#  */
1148                 {
1149                   op->token = XMALLOC (struct token);
1150 
1151                   init_token (op->token);
1152 
1153                   for (;;)
1154                     {
1155                       c = do_getc ();
1156 
1157                       if (c == EOF)
1158                         break;
1159                       if (c == '\\')
1160                         {
1161                           c = do_getc ();
1162                           if (c == EOF)
1163                             break;
1164                         }
1165                       else if (c == '}')
1166                         {
1167                           c = do_getc ();
1168                           if (c == '#')
1169                             break;
1170                           if (c != EOF)
1171                             do_ungetc (c);
1172                           c = '}';
1173                         }
1174                       grow_token (op->token);
1175                       op->token->chars[op->token->charcount++] = c;
1176                     }
1177 
1178                   op->type = t_symbol;
1179                   last_non_comment_line = line_number;
1180                   return;
1181                 }
1182 
1183               case '\\':
1184                 /* Character.  */
1185                 {
1186                   struct token token;
1187                   c = do_getc ();
1188                   if (c != EOF)
1189                     {
1190                       read_token (&token, c);
1191                       free_token (&token);
1192                     }
1193                   op->type = t_other;
1194                   last_non_comment_line = line_number;
1195                   return;
1196                 }
1197 
1198               case ':': /* Keyword.  */
1199               case '&': /* Deprecated keyword, installed in optargs.scm.  */
1200                 {
1201                   struct token token;
1202                   read_token (&token, '-');
1203                   free_token (&token);
1204                   op->type = t_other;
1205                   last_non_comment_line = line_number;
1206                   return;
1207                 }
1208 
1209               /* The following are installed through read-hash-extend.  */
1210 
1211               /* arrays.scm */
1212               case '0': case '1': case '2': case '3': case '4':
1213               case '5': case '6': case '7': case '8': case '9':
1214                 /* Multidimensional array syntax: #nx(...) where
1215                      n ::= DIGIT+
1216                      x ::= {'a'|'b'|'c'|'e'|'i'|'s'|'u'}
1217                  */
1218                 do
1219                   c = do_getc ();
1220                 while (c >= '0' && c <= '9');
1221                 /* c should be one of {'a'|'b'|'c'|'e'|'i'|'s'|'u'}.
1222                    But be tolerant.  */
1223                 /*FALLTHROUGH*/
1224               case '\'': /* boot-9.scm */
1225               case '.': /* boot-9.scm */
1226               case ',': /* srfi-10.scm */
1227                 {
1228                   struct object inner;
1229                   read_object (&inner, null_context);
1230                   /* Dots and EOF are not allowed here.
1231                      But be tolerant.  */
1232                   free_object (&inner);
1233                   op->type = t_other;
1234                   last_non_comment_line = line_number;
1235                   return;
1236                 }
1237 
1238               default:
1239                 /* Unknown.  */
1240                 op->type = t_other;
1241                 last_non_comment_line = line_number;
1242                 return;
1243               }
1244             /*NOTREACHED*/
1245             abort ();
1246           }
1247 
1248         case '_':
1249           /* GIMP script-fu extension: '_' before a string literal is
1250              considered a gettext call on the string.  */
1251           {
1252             int c = do_getc ();
1253             if (c == EOF)
1254               /* Invalid input.  Be tolerant, no error message.  */
1255               {
1256                 op->type = t_other;
1257                 return;
1258               }
1259             if (c != '"')
1260               {
1261                 do_ungetc (c);
1262 
1263                 /* If '_' is not followed by a string literal,
1264                    consider it a part of symbol.  */
1265                 op->token = XMALLOC (struct token);
1266                 read_token (op->token, '_');
1267                 op->type = t_symbol;
1268                 last_non_comment_line = line_number;
1269                 return;
1270               }
1271             seen_underscore_prefix = true;
1272           }
1273           /*FALLTHROUGH*/
1274 
1275         case '"':
1276           {
1277             op->token = XMALLOC (struct token);
1278             init_token (op->token);
1279             op->line_number_at_start = line_number;
1280             for (;;)
1281               {
1282                 int c = do_getc ();
1283                 if (c == EOF)
1284                   /* Invalid input.  Be tolerant, no error message.  */
1285                   break;
1286                 if (c == '"')
1287                   break;
1288                 if (c == '\\')
1289                   {
1290                     c = do_getc ();
1291                     if (c == EOF)
1292                       /* Invalid input.  Be tolerant, no error message.  */
1293                       break;
1294                     switch (c)
1295                       {
1296                       case '\n':
1297                         continue;
1298                       case '0':
1299                         c = '\0';
1300                         break;
1301                       case 'a':
1302                         c = '\a';
1303                         break;
1304                       case 'f':
1305                         c = '\f';
1306                         break;
1307                       case 'n':
1308                         c = '\n';
1309                         break;
1310                       case 'r':
1311                         c = '\r';
1312                         break;
1313                       case 't':
1314                         c = '\t';
1315                         break;
1316                       case 'v':
1317                         c = '\v';
1318                         break;
1319                       default:
1320                         break;
1321                       }
1322                   }
1323                 grow_token (op->token);
1324                 op->token->chars[op->token->charcount++] = c;
1325               }
1326             op->type = t_string;
1327 
1328             if (seen_underscore_prefix || extract_all)
1329               {
1330                 lex_pos_ty pos;
1331 
1332                 pos.file_name = logical_file_name;
1333                 pos.line_number = op->line_number_at_start;
1334                 remember_a_message (mlp, NULL, string_of_object (op), false,
1335                                     false, null_context, &pos,
1336                                     NULL, savable_comment, false);
1337               }
1338             last_non_comment_line = line_number;
1339             return;
1340           }
1341 
1342         case '0': case '1': case '2': case '3': case '4':
1343         case '5': case '6': case '7': case '8': case '9':
1344         case '+': case '-': case '.':
1345           /* Read a number or symbol token.  */
1346           op->token = XMALLOC (struct token);
1347           read_token (op->token, c);
1348           if (op->token->charcount == 1 && op->token->chars[0] == '.')
1349             {
1350               free_token (op->token);
1351               free (op->token);
1352               op->type = t_dot;
1353             }
1354           else if (is_number (op->token))
1355             {
1356               /* A number.  */
1357               free_token (op->token);
1358               free (op->token);
1359               op->type = t_other;
1360             }
1361           else
1362             {
1363               /* A symbol.  */
1364               op->type = t_symbol;
1365             }
1366           last_non_comment_line = line_number;
1367           return;
1368 
1369         case ':':
1370         default:
1371           /* Read a symbol token.  */
1372           op->token = XMALLOC (struct token);
1373           read_token (op->token, c);
1374           op->type = t_symbol;
1375           last_non_comment_line = line_number;
1376           return;
1377         }
1378     }
1379 }
1380 
1381 
1382 void
extract_scheme(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1383 extract_scheme (FILE *f,
1384                 const char *real_filename, const char *logical_filename,
1385                 flag_context_list_table_ty *flag_table,
1386                 msgdomain_list_ty *mdlp)
1387 {
1388   mlp = mdlp->item[0]->messages;
1389 
1390   fp = f;
1391   real_file_name = real_filename;
1392   logical_file_name = xstrdup (logical_filename);
1393   line_number = 1;
1394 
1395   last_comment_line = -1;
1396   last_non_comment_line = -1;
1397 
1398   flag_context_list_table = flag_table;
1399 
1400   init_keywords ();
1401 
1402   /* Eat tokens until eof is seen.  When read_object returns
1403      due to an unbalanced closing parenthesis, just restart it.  */
1404   do
1405     {
1406       struct object toplevel_object;
1407 
1408       read_object (&toplevel_object, null_context);
1409 
1410       if (toplevel_object.type == t_eof)
1411         break;
1412 
1413       free_object (&toplevel_object);
1414     }
1415   while (!feof (fp));
1416 
1417   /* Close scanner.  */
1418   fp = NULL;
1419   real_file_name = NULL;
1420   logical_file_name = NULL;
1421   line_number = 0;
1422 }
1423