1 /* GNU m4 -- A simple macro processor
2 
3    Copyright (C) 1989-1994, 2004-2014, 2016-2017, 2020-2021 Free
4    Software Foundation, Inc.
5 
6    This file is part of GNU M4.
7 
8    GNU M4 is free software: you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation, either version 3 of the License, or
11    (at your option) any later version.
12 
13    GNU M4 is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17 
18    You should have received a copy of the GNU General Public License
19    along with this program.  If not, see <https://www.gnu.org/licenses/>.
20 */
21 
22 /* Handling of different input sources, and lexical analysis.  */
23 
24 #include "m4.h"
25 
26 #include "memchr2.h"
27 
28 /* Unread input can be either files, that should be read (eg. included
29    files), strings, which should be rescanned (eg. macro expansion text),
30    or quoted macro definitions (as returned by the builtin "defn").
   Unread input is organised as a stack, implemented with an obstack.
32    Each input source is described by a "struct input_block".  The obstack
33    is "current_input".  The top of the input stack is "isp".
34 
35    The macro "m4wrap" places the text to be saved on another input
36    stack, on the obstack "wrapup_stack", whose top is "wsp".  When EOF
37    is seen on normal input (eg, when "current_input" is empty), input is
38    switched over to "wrapup_stack", and the original "current_input" is
39    freed.  A new stack is allocated for "wrapup_stack", which will
40    accept any text produced by calls to "m4wrap" from within the
41    wrapped text.  This process of shuffling "wrapup_stack" to
42    "current_input" can continue indefinitely, even generating infinite
43    loops (e.g. "define(`f',`m4wrap(`f')')f"), without memory leaks.
44 
45    Pushing new input on the input stack is done by push_file (),
46    push_string (), push_wrapup () (for wrapup text), and push_macro ()
47    (for macro definitions).  Because macro expansion needs direct access
   to the current input obstack (for optimisation), push_string () is
   split into two functions, push_string_init (), which returns a pointer
   to the current input stack, and push_string_finish (), which returns a
   pointer to the final text.  The input_block *next is used to manage
52    the coordination between the different push routines.
53 
54    The current file and line number are stored in two global
55    variables, for use by the error handling functions in m4.c.  Macro
56    expansion wants to report the line where a macro name was detected,
57    rather than where it finished collecting arguments.  This also
58    applies to text resulting from macro expansions.  So each input
59    block maintains its own notion of the current file and line, and
60    swapping between input blocks updates the global variables
61    accordingly.  */
62 
63 #ifdef ENABLE_CHANGEWORD
64 #include "regex.h"
65 #endif
66 
/* The kinds of input source that can appear on the input stack.  */
typedef enum input_type
{
  INPUT_STRING,         /* Text to rescan, such as a macro expansion.  */
  INPUT_FILE,           /* Stream from the command line or an include.  */
  INPUT_MACRO           /* Builtin function obtained through defn.  */
}
input_type;
75 
76 struct input_block
77 {
78   struct input_block *prev;     /* previous input_block on the input stack */
79   input_type type;              /* see enum values */
80   const char *file;             /* file where this input is from */
81   int line;                     /* line where this input is from */
82   union
83     {
84       struct
85         {
86           char *string;         /* remaining string value */
87           char *end;            /* terminating NUL of string */
88         }
89         u_s;    /* INPUT_STRING */
90       struct
91         {
92           FILE *fp;                  /* input file handle */
93           bool_bitfield end : 1;     /* true if peek has seen EOF */
94           bool_bitfield close : 1;   /* true if we should close file on pop */
95           bool_bitfield advance : 1; /* track previous start_of_input_line */
96         }
97         u_f;    /* INPUT_FILE */
98       builtin_func *func;       /* pointer to macro's function */
99     }
100   u;
101 };
102 
103 typedef struct input_block input_block;
104 
105 
/* Current input file name.  Reloaded from the top input block by
   next_char_1 () when input_change is set, and set to "" once the
   input stack is empty.  */
const char *current_file;

/* Current input line number.  Maintained together with current_file;
   set to 0 once the input stack is empty.  */
int current_line;

/* Obstack for storing individual tokens.  */
static struct obstack token_stack;

/* Obstack for storing file names; push_file () copies each title
   here.  */
static struct obstack file_names;

/* Wrapup input stack, filled by push_wrapup ().  */
static struct obstack *wrapup_stack;

/* Current stack, from input or wrapup.  */
static struct obstack *current_input;

/* Bottom of token_stack, for obstack_free.  */
static void *token_bottom;

/* Pointer to top of current_input, or NULL when that stack is
   empty.  */
static input_block *isp;

/* Pointer to top of wrapup_stack, or NULL when no wrapup text has
   been saved.  */
static input_block *wsp;

/* Aux. for handling split push_string (): the block allocated by
   push_string_init () that push_string_finish () has not completed
   yet, or NULL.  */
static input_block *next;

/* Flag for next_char () to increment current_line.  It is set when a
   newline has just been read from a file.  */
static bool start_of_input_line;

/* Flag for next_char () to recognize change in input block.  While
   set, next_char_1 () reloads current_file and current_line from the
   top block.  */
static bool input_change;
141 
/* Out-of-band results of peek_input () and next_char ().  Real input
   characters are passed through to_uchar (), so these two values are
   chosen above 255 (assumes 8-bit characters).  */
#define CHAR_EOF        256     /* character return on EOF */
#define CHAR_MACRO      257     /* character return for MACRO token */

/* Quote chars.  Replaced as a pair by set_quotes ().  */
STRING rquote;
STRING lquote;

/* Comment chars.  Replaced as a pair by set_comment ().  */
STRING bcomm;
STRING ecomm;

#ifdef ENABLE_CHANGEWORD

# define DEFAULT_WORD_REGEXP "[_a-zA-Z][_a-zA-Z0-9]*"

/* Compiled form of the user's word regexp; only meaningful while
   default_word_regexp is false.  See set_word_regexp ().  */
static struct re_pattern_buffer word_regexp;
static int default_word_regexp;
static struct re_registers regs;

#else /* ! ENABLE_CHANGEWORD */
/* Without changeword support the default word syntax is always used.  */
# define default_word_regexp 1
#endif /* ! ENABLE_CHANGEWORD */

#ifdef DEBUG_INPUT
static const char *token_type_string (token_type);
#endif

/* Declared early because push_string_init () calls it before its
   definition.  */
static void pop_input (void);
170 
171 
172 
173 /*-------------------------------------------------------------------.
174 | push_file () pushes an input file on the input stack, saving the   |
175 | current file name and line number.  If next is non-NULL, this push |
176 | invalidates a call to push_string_init (), whose storage is        |
177 | consequently released.  If CLOSE_WHEN_DONE, then close FP after    |
178 | EOF is detected.                                                   |
179 `-------------------------------------------------------------------*/
180 
181 void
push_file(FILE * fp,const char * title,bool close_when_done)182 push_file (FILE *fp, const char *title, bool close_when_done)
183 {
184   input_block *i;
185 
186   if (next != NULL)
187     {
188       obstack_free (current_input, next);
189       next = NULL;
190     }
191 
192   if (debug_level & DEBUG_TRACE_INPUT)
193     DEBUG_MESSAGE1 ("input read from %s", title);
194 
195   i = (input_block *) obstack_alloc (current_input,
196                                      sizeof (struct input_block));
197   i->type = INPUT_FILE;
198   i->file = (char *) obstack_copy0 (&file_names, title, strlen (title));
199   i->line = 1;
200   input_change = true;
201 
202   i->u.u_f.fp = fp;
203   i->u.u_f.end = false;
204   i->u.u_f.close = close_when_done;
205   i->u.u_f.advance = start_of_input_line;
206   output_current_line = -1;
207 
208   i->prev = isp;
209   isp = i;
210 }
211 
212 /*---------------------------------------------------------------.
213 | push_macro () pushes a builtin macro's definition on the input |
214 | stack.  If next is non-NULL, this push invalidates a call to   |
215 | push_string_init (), whose storage is consequently released.   |
216 `---------------------------------------------------------------*/
217 
218 void
push_macro(builtin_func * func)219 push_macro (builtin_func *func)
220 {
221   input_block *i;
222 
223   if (next != NULL)
224     {
225       obstack_free (current_input, next);
226       next = NULL;
227     }
228 
229   i = (input_block *) obstack_alloc (current_input,
230                                      sizeof (struct input_block));
231   i->type = INPUT_MACRO;
232   i->file = current_file;
233   i->line = current_line;
234   input_change = true;
235 
236   i->u.func = func;
237   i->prev = isp;
238   isp = i;
239 }
240 
241 /*------------------------------------------------------------------.
242 | First half of push_string ().  The pointer next points to the new |
243 | input_block.                                                      |
244 `------------------------------------------------------------------*/
245 
246 struct obstack *
push_string_init(void)247 push_string_init (void)
248 {
249   if (next != NULL)
250     {
251       M4ERROR ((warning_status, 0,
252                 "INTERNAL ERROR: recursive push_string!"));
253       abort ();
254     }
255 
256   /* Prefer reusing an older block, for tail-call optimization.  */
257   while (isp && isp->type == INPUT_STRING && !isp->u.u_s.string[0])
258     pop_input ();
259   next = (input_block *) obstack_alloc (current_input,
260                                         sizeof (struct input_block));
261   next->type = INPUT_STRING;
262   next->file = current_file;
263   next->line = current_line;
264 
265   return current_input;
266 }
267 
268 /*-------------------------------------------------------------------.
269 | Last half of push_string ().  If next is now NULL, a call to       |
270 | push_file () has invalidated the previous call to push_string_init |
271 | (), so we just give up.  If the new object is void, we do not push |
272 | it.  The function push_string_finish () returns a pointer to the   |
273 | finished object.  This pointer is only for temporary use, since    |
274 | reading the next token might release the memory used for the       |
275 | object.                                                            |
276 `-------------------------------------------------------------------*/
277 
278 const char *
push_string_finish(void)279 push_string_finish (void)
280 {
281   const char *ret = NULL;
282 
283   if (next == NULL)
284     return NULL;
285 
286   if (obstack_object_size (current_input) > 0)
287     {
288       size_t len = obstack_object_size (current_input);
289       obstack_1grow (current_input, '\0');
290       next->u.u_s.string = (char *) obstack_finish (current_input);
291       next->u.u_s.end = next->u.u_s.string + len;
292       next->prev = isp;
293       isp = next;
294       ret = isp->u.u_s.string; /* for immediate use only */
295       input_change = true;
296     }
297   else
298     obstack_free (current_input, next); /* people might leave garbage on it. */
299   next = NULL;
300   return ret;
301 }
302 
303 /*------------------------------------------------------------------.
304 | The function push_wrapup () pushes a string on the wrapup stack.  |
305 | When the normal input stack gets empty, the wrapup stack will     |
306 | become the input stack, and push_string () and push_file () will  |
307 | operate on wrapup_stack.  Push_wrapup should be done as           |
308 | push_string (), but this will suffice, as long as arguments to    |
309 | m4_m4wrap () are moderate in size.                                |
310 `------------------------------------------------------------------*/
311 
312 void
push_wrapup(const char * s)313 push_wrapup (const char *s)
314 {
315   size_t len = strlen (s);
316   input_block *i;
317   i = (input_block *) obstack_alloc (wrapup_stack,
318                                      sizeof (struct input_block));
319   i->prev = wsp;
320   i->type = INPUT_STRING;
321   i->file = current_file;
322   i->line = current_line;
323   i->u.u_s.string = (char *) obstack_copy0 (wrapup_stack, s, len);
324   i->u.u_s.end = i->u.u_s.string + len;
325   wsp = i;
326 }
327 
328 
329 /*-------------------------------------------------------------------.
330 | The function pop_input () pops one level of input sources.  If the |
331 | popped input_block is a file, current_file and current_line are    |
332 | reset to the saved values before the memory for the input_block is |
333 | released.                                                          |
334 `-------------------------------------------------------------------*/
335 
336 static void
pop_input(void)337 pop_input (void)
338 {
339   input_block *tmp = isp->prev;
340 
341   switch (isp->type)
342     {
343     case INPUT_STRING:
344     case INPUT_MACRO:
345       break;
346 
347     case INPUT_FILE:
348       if (debug_level & DEBUG_TRACE_INPUT)
349         {
350           if (tmp)
351             DEBUG_MESSAGE2 ("input reverted to %s, line %d",
352                             tmp->file, tmp->line);
353           else
354             DEBUG_MESSAGE ("input exhausted");
355         }
356 
357       if (ferror (isp->u.u_f.fp))
358         {
359           M4ERROR ((warning_status, 0, _("read error")));
360           if (isp->u.u_f.close)
361             fclose (isp->u.u_f.fp);
362           retcode = EXIT_FAILURE;
363         }
364       else if (isp->u.u_f.close && fclose (isp->u.u_f.fp) == EOF)
365         {
366           M4ERROR ((warning_status, errno, _("error reading file")));
367           retcode = EXIT_FAILURE;
368         }
369       start_of_input_line = isp->u.u_f.advance;
370       output_current_line = -1;
371       break;
372 
373     default:
374       M4ERROR ((warning_status, 0,
375                 "INTERNAL ERROR: input stack botch in pop_input ()"));
376       abort ();
377     }
378   obstack_free (current_input, isp);
379   next = NULL; /* might be set in push_string_init () */
380 
381   isp = tmp;
382   input_change = true;
383 }
384 
385 /*-------------------------------------------------------------------.
386 | To switch input over to the wrapup stack, main calls pop_wrapup    |
387 | ().  Since wrapup text can install new wrapup text, pop_wrapup ()  |
388 | returns false when there is no wrapup text on the stack, and true  |
389 | otherwise.                                                         |
390 `-------------------------------------------------------------------*/
391 
392 bool
pop_wrapup(void)393 pop_wrapup (void)
394 {
395   next = NULL;
396   obstack_free (current_input, NULL);
397   free (current_input);
398 
399   if (wsp == NULL)
400     {
401       /* End of the program.  Free all memory even though we are about
402          to exit, since it makes leak detection easier.  */
403       obstack_free (&token_stack, NULL);
404       obstack_free (&file_names, NULL);
405       obstack_free (wrapup_stack, NULL);
406       free (wrapup_stack);
407 #ifdef ENABLE_CHANGEWORD
408       regfree (&word_regexp);
409 #endif /* ENABLE_CHANGEWORD */
410       return false;
411     }
412 
413   current_input = wrapup_stack;
414   wrapup_stack = (struct obstack *) xmalloc (sizeof (struct obstack));
415   obstack_init (wrapup_stack);
416 
417   isp = wsp;
418   wsp = NULL;
419   input_change = true;
420 
421   return true;
422 }
423 
424 /*-------------------------------------------------------------------.
425 | When a MACRO token is seen, next_token () uses init_macro_token () |
426 | to retrieve the value of the function pointer.                     |
427 `-------------------------------------------------------------------*/
428 
429 static void
init_macro_token(token_data * td)430 init_macro_token (token_data *td)
431 {
432   if (isp->type != INPUT_MACRO)
433     {
434       M4ERROR ((warning_status, 0,
435                 "INTERNAL ERROR: bad call to init_macro_token ()"));
436       abort ();
437     }
438 
439   TOKEN_DATA_TYPE (td) = TOKEN_FUNC;
440   TOKEN_DATA_FUNC (td) = isp->u.func;
441 }
442 
443 
444 /*-----------------------------------------------------------------.
445 | Low level input is done a character at a time.  The function     |
446 | peek_input () is used to look at the next character in the input |
447 | stream.  At any given time, it reads from the input_block on the |
448 | top of the current input stack.                                  |
449 `-----------------------------------------------------------------*/
450 
451 static int
peek_input(void)452 peek_input (void)
453 {
454   int ch;
455   input_block *block = isp;
456 
457   while (1)
458     {
459       if (block == NULL)
460         return CHAR_EOF;
461 
462       switch (block->type)
463         {
464         case INPUT_STRING:
465           ch = to_uchar (block->u.u_s.string[0]);
466           if (ch != '\0')
467             return ch;
468           break;
469 
470         case INPUT_FILE:
471           ch = getc (block->u.u_f.fp);
472           if (ch != EOF)
473             {
474               ungetc (ch, block->u.u_f.fp);
475               return ch;
476             }
477           block->u.u_f.end = true;
478           break;
479 
480         case INPUT_MACRO:
481           return CHAR_MACRO;
482 
483         default:
484           M4ERROR ((warning_status, 0,
485                     "INTERNAL ERROR: input stack botch in peek_input ()"));
486           abort ();
487         }
488       block = block->prev;
489     }
490 }
491 
492 /*-------------------------------------------------------------------.
493 | The function next_char () is used to read and advance the input to |
494 | the next character.  It also manages line numbers for error        |
495 | messages, so they do not get wrong, due to lookahead.  The token   |
496 | consisting of a newline alone is taken as belonging to the line it |
497 | ends, and the current line number is not incremented until the     |
498 | next character is read.  99.9% of all calls will read from a       |
499 | string, so factor that out into a macro for speed.                 |
500 `-------------------------------------------------------------------*/
501 
502 #define next_char() \
503   (isp && isp->type == INPUT_STRING && isp->u.u_s.string[0]     \
504    && !input_change                                             \
505    ? to_uchar (*isp->u.u_s.string++)                            \
506    : next_char_1 ())
507 
508 static int
next_char_1(void)509 next_char_1 (void)
510 {
511   int ch;
512 
513   while (1)
514     {
515       if (isp == NULL)
516         {
517           current_file = "";
518           current_line = 0;
519           return CHAR_EOF;
520         }
521 
522       if (input_change)
523         {
524           current_file = isp->file;
525           current_line = isp->line;
526           input_change = false;
527         }
528 
529       switch (isp->type)
530         {
531         case INPUT_STRING:
532           ch = to_uchar (*isp->u.u_s.string++);
533           if (ch != '\0')
534             return ch;
535           break;
536 
537         case INPUT_FILE:
538           if (start_of_input_line)
539             {
540               start_of_input_line = false;
541               current_line = ++isp->line;
542             }
543 
544           /* If stdin is a terminal, calling getc after peek_input
545              already called it would make the user have to hit ^D
546              twice to quit.  */
547           ch = isp->u.u_f.end ? EOF : getc (isp->u.u_f.fp);
548           if (ch != EOF)
549             {
550               if (ch == '\n')
551                 start_of_input_line = true;
552               return ch;
553             }
554           break;
555 
556         case INPUT_MACRO:
557           pop_input (); /* INPUT_MACRO input sources has only one token */
558           return CHAR_MACRO;
559 
560         default:
561           M4ERROR ((warning_status, 0,
562                     "INTERNAL ERROR: input stack botch in next_char ()"));
563           abort ();
564         }
565 
566       /* End of input source --- pop one level.  */
567       pop_input ();
568     }
569 }
570 
571 /*-------------------------------------------------------------------.
572 | skip_line () simply discards all immediately following characters, |
573 | upto the first newline.  It is only used from m4_dnl ().           |
574 `-------------------------------------------------------------------*/
575 
576 void
skip_line(void)577 skip_line (void)
578 {
579   int ch;
580   const char *file = current_file;
581   int line = current_line;
582 
583   while ((ch = next_char ()) != CHAR_EOF && ch != '\n')
584     ;
585   if (ch == CHAR_EOF)
586     /* current_file changed to "" if we see CHAR_EOF, use the
587        previous value we stored earlier.  */
588     M4ERROR_AT_LINE ((warning_status, 0, file, line,
589                       _("Warning: end of file treated as newline")));
590   /* On the rare occasion that dnl crosses include file boundaries
591      (either the input file did not end in a newline, or changeword
592      was used), calling next_char can update current_file and
593      current_line, and that update will be undone as we return to
594      expand_macro.  This informs next_char to fix things again.  */
595   if (file != current_file || line != current_line)
596     input_change = true;
597 }
598 
599 
/*-------------------------------------------------------------------.
| match_input () compares the string S with a prefix of the input    |
| stream and returns true on a match.  The matched text is           |
| discarded only if the match succeeds and CONSUME is true; in       |
| every other case the characters read are pushed back.  Used        |
| only for multicharacter quotes or comment delimiters.              |
`-------------------------------------------------------------------*/

static bool
match_input (const char *s, bool consume)
{
  int matched;                  /* length of the prefix of S read so far */
  bool complete = false;        /* true once all of S has been read */
  struct obstack *h;

  /* Nothing has been consumed yet, so a mismatch costs nothing.  */
  if (peek_input () != to_uchar (s[0]))
    return false;

  /* Short match: a one-character S needs no push-back.  */
  if (s[1] == '\0')
    {
      if (consume)
        next_char ();
      return true;
    }

  /* Consume input for as long as it keeps agreeing with S.  */
  next_char ();
  matched = 1;
  while (peek_input () == to_uchar (s[matched]))
    {
      next_char ();
      matched++;
      if (s[matched] == '\0')   /* long match */
        {
          if (consume)
            return true;
          complete = true;
          break;
        }
    }

  /* Failed or shouldn't consume, push back input.  The obstack pointer
     is kept in a variable because `obstack_grow' may be a macro that
     evaluates its first argument several times.  */
  h = push_string_init ();
  obstack_grow (h, s, matched);
  push_string_finish ();
  return complete;
}

/*-------------------------------------------------------------------.
| MATCH () tests whether the string S occurs at the current input    |
| position.  The first character is compared inline, for speed,      |
| and match_input () is called only for longer strings.  If          |
| CONSUME, CH must be the result of next_char () and a successful    |
| match discards the matched string.  Otherwise, CH must be the      |
| result of peek_input () and the input stream is effectively        |
| unchanged.  CH and S are evaluated more than once.                 |
`-------------------------------------------------------------------*/

#define MATCH(ch, s, consume)                                           \
  (to_uchar ((s)[0]) == (ch)                                            \
   && (ch) != '\0'                                                      \
   && ((s)[1] == '\0' || (match_input ((s) + (consume), consume))))
666 
667 
668 /*--------------------------------------------------------.
669 | Initialize input stacks, and quote/comment characters.  |
670 `--------------------------------------------------------*/
671 
672 void
input_init(void)673 input_init (void)
674 {
675   current_file = "";
676   current_line = 0;
677 
678   current_input = (struct obstack *) xmalloc (sizeof (struct obstack));
679   obstack_init (current_input);
680   wrapup_stack = (struct obstack *) xmalloc (sizeof (struct obstack));
681   obstack_init (wrapup_stack);
682 
683   obstack_init (&file_names);
684 
685   /* Allocate an object in the current chunk, so that obstack_free
686      will always work even if the first token parsed spills to a new
687      chunk.  */
688   obstack_init (&token_stack);
689   obstack_alloc (&token_stack, 1);
690   token_bottom = obstack_base (&token_stack);
691 
692   isp = NULL;
693   wsp = NULL;
694   next = NULL;
695 
696   start_of_input_line = false;
697 
698   lquote.string = xstrdup (DEF_LQUOTE);
699   lquote.length = strlen (lquote.string);
700   rquote.string = xstrdup (DEF_RQUOTE);
701   rquote.length = strlen (rquote.string);
702   bcomm.string = xstrdup (DEF_BCOMM);
703   bcomm.length = strlen (bcomm.string);
704   ecomm.string = xstrdup (DEF_ECOMM);
705   ecomm.length = strlen (ecomm.string);
706 
707 #ifdef ENABLE_CHANGEWORD
708   set_word_regexp (user_word_regexp);
709 #endif
710 }
711 
712 
713 /*------------------------------------------------------------------.
714 | Functions for setting quotes and comment delimiters.  Used by     |
715 | m4_changecom () and m4_changequote ().  Pass NULL if the argument |
716 | was not present, to distinguish from an explicit empty string.    |
717 `------------------------------------------------------------------*/
718 
719 void
set_quotes(const char * lq,const char * rq)720 set_quotes (const char *lq, const char *rq)
721 {
722   free (lquote.string);
723   free (rquote.string);
724 
725   /* POSIX states that with 0 arguments, the default quotes are used.
726      POSIX XCU ERN 112 states that behavior is implementation-defined
727      if there was only one argument, or if there is an empty string in
728      either position when there are two arguments.  We allow an empty
729      left quote to disable quoting, but a non-empty left quote will
730      always create a non-empty right quote.  See the texinfo for what
731      some other implementations do.  */
732   if (!lq)
733     {
734       lq = DEF_LQUOTE;
735       rq = DEF_RQUOTE;
736     }
737   else if (!rq || (*lq && !*rq))
738     rq = DEF_RQUOTE;
739 
740   lquote.string = xstrdup (lq);
741   lquote.length = strlen (lquote.string);
742   rquote.string = xstrdup (rq);
743   rquote.length = strlen (rquote.string);
744 }
745 
746 void
set_comment(const char * bc,const char * ec)747 set_comment (const char *bc, const char *ec)
748 {
749   free (bcomm.string);
750   free (ecomm.string);
751 
752   /* POSIX requires no arguments to disable comments.  It requires
753      empty arguments to be used as-is, but this is counter to
754      traditional behavior, because a non-null begin and null end makes
755      it impossible to end a comment.  An aardvark has been filed:
756      http://www.opengroup.org/austin/mailarchives/ag-review/msg02168.html
757      This implementation assumes the aardvark will be approved.  See
758      the texinfo for what some other implementations do.  */
759   if (!bc)
760     bc = ec = "";
761   else if (!ec || (*bc && !*ec))
762     ec = DEF_ECOMM;
763 
764   bcomm.string = xstrdup (bc);
765   bcomm.length = strlen (bcomm.string);
766   ecomm.string = xstrdup (ec);
767   ecomm.length = strlen (ecomm.string);
768 }
769 
#ifdef ENABLE_CHANGEWORD

/* Select the regular expression that recognizes words.  An empty
   REGEXP, or one equal to DEFAULT_WORD_REGEXP, selects the default
   word syntax.  A REGEXP that does not compile is reported and the
   previous setting is left in place.  */
void
set_word_regexp (const char *regexp)
{
  struct re_pattern_buffer trial;
  const char *err;
  size_t regexp_len;

  if (*regexp == '\0' || STREQ (regexp, DEFAULT_WORD_REGEXP))
    {
      default_word_regexp = true;
      return;
    }

  regexp_len = strlen (regexp);

  /* Dry run on a scratch buffer to see whether the new expression is
     compilable, so that a bad one leaves word_regexp untouched.  */
  init_pattern_buffer (&trial, NULL);
  err = re_compile_pattern (regexp, regexp_len, &trial);
  regfree (&trial);

  if (err != NULL)
    {
      M4ERROR ((warning_status, 0,
                _("bad regular expression `%s': %s"), regexp, err));
      return;
    }

  /* If compilation worked, retry using the word_regexp struct.  We
     can't rely on struct assigns working, so redo the compilation.
     The fastmap can be reused between compilations, and will be freed
     by the final regfree.  */
  if (word_regexp.fastmap == NULL)
    word_regexp.fastmap = xcharalloc (UCHAR_MAX + 1);
  err = re_compile_pattern (regexp, regexp_len, &word_regexp);
  assert (err == NULL);
  re_set_registers (&word_regexp, &regs, regs.num_regs, regs.start, regs.end);
  if (re_compile_fastmap (&word_regexp) != 0)
    assert (false);

  default_word_regexp = false;
}

#endif /* ENABLE_CHANGEWORD */
812 
813 
814 /*--------------------------------------------------------------------.
815 | Parse and return a single token from the input stream.  A token     |
816 | can either be TOKEN_EOF, if the input_stack is empty; it can be     |
817 | TOKEN_STRING for a quoted string; TOKEN_WORD for something that is  |
818 | a potential macro name; and TOKEN_SIMPLE for any single character   |
819 | that is not a part of any of the previous types.  If LINE is not    |
820 | NULL, set *LINE to the line where the token starts.                 |
821 |                                                                     |
822 | Next_token () return the token type, and passes back a pointer to   |
823 | the token data through TD.  The token text is collected on the      |
824 | obstack token_stack, which never contains more than one token text  |
825 | at a time.  The storage pointed to by the fields in TD is           |
826 | therefore subject to change the next time next_token () is called.  |
827 `--------------------------------------------------------------------*/
828 
829 token_type
next_token(token_data * td,int * line)830 next_token (token_data *td, int *line)
831 {
832   int ch;
833   int quote_level;
834   token_type type;
835 #ifdef ENABLE_CHANGEWORD
836   int startpos;
837   char *orig_text = NULL;
838 #endif
839   const char *file;
840   int dummy;
841 
842   obstack_free (&token_stack, token_bottom);
843   if (!line)
844     line = &dummy;
845 
846  /* Can't consume character until after CHAR_MACRO is handled.  */
847   ch = peek_input ();
848   if (ch == CHAR_EOF)
849     {
850 #ifdef DEBUG_INPUT
851       xfprintf (stderr, "next_token -> EOF\n");
852 #endif
853       next_char ();
854       return TOKEN_EOF;
855     }
856   if (ch == CHAR_MACRO)
857     {
858       init_macro_token (td);
859       next_char ();
860 #ifdef DEBUG_INPUT
861       xfprintf (stderr, "next_token -> MACDEF (%s)\n",
862                 find_builtin_by_addr (TOKEN_DATA_FUNC (td))->name);
863 #endif
864       return TOKEN_MACDEF;
865     }
866 
867   next_char (); /* Consume character we already peeked at.  */
868   file = current_file;
869   *line = current_line;
870   if (MATCH (ch, bcomm.string, true))
871     {
872       obstack_grow (&token_stack, bcomm.string, bcomm.length);
873       while ((ch = next_char ()) != CHAR_EOF
874              && !MATCH (ch, ecomm.string, true))
875         obstack_1grow (&token_stack, ch);
876       if (ch != CHAR_EOF)
877         obstack_grow (&token_stack, ecomm.string, ecomm.length);
878       else
879         /* current_file changed to "" if we see CHAR_EOF, use the
880            previous value we stored earlier.  */
881         m4_failure_at_line (0, file, *line, _("ERROR: end of file in comment"));
882 
883       type = TOKEN_STRING;
884     }
885   else if (default_word_regexp && (c_isalpha (ch) || ch == '_'))
886     {
887       obstack_1grow (&token_stack, ch);
888       while ((ch = peek_input ()) != CHAR_EOF && (c_isalnum (ch) || ch == '_'))
889         {
890           obstack_1grow (&token_stack, ch);
891           next_char ();
892         }
893       type = TOKEN_WORD;
894     }
895 
896 #ifdef ENABLE_CHANGEWORD
897 
898   else if (!default_word_regexp && word_regexp.fastmap[ch])
899     {
900       obstack_1grow (&token_stack, ch);
901       while (1)
902         {
903           ch = peek_input ();
904           if (ch == CHAR_EOF)
905             break;
906           obstack_1grow (&token_stack, ch);
907           startpos = re_search (&word_regexp,
908                                 (char *) obstack_base (&token_stack),
909                                 obstack_object_size (&token_stack), 0, 0,
910                                 &regs);
911           if (startpos ||
912               regs.end [0] != (regoff_t) obstack_object_size (&token_stack))
913             {
914               *(((char *) obstack_base (&token_stack)
915                  + obstack_object_size (&token_stack)) - 1) = '\0';
916               break;
917             }
918           next_char ();
919         }
920 
921       obstack_1grow (&token_stack, '\0');
922       orig_text = (char *) obstack_finish (&token_stack);
923 
924       if (regs.start[1] != -1)
925         obstack_grow (&token_stack,orig_text + regs.start[1],
926                       regs.end[1] - regs.start[1]);
927       else
928         obstack_grow (&token_stack, orig_text,regs.end[0]);
929 
930       type = TOKEN_WORD;
931     }
932 
933 #endif /* ENABLE_CHANGEWORD */
934 
935   else if (!MATCH (ch, lquote.string, true))
936     {
937       switch (ch)
938         {
939         case '(':
940           type = TOKEN_OPEN;
941           break;
942         case ',':
943           type = TOKEN_COMMA;
944           break;
945         case ')':
946           type = TOKEN_CLOSE;
947           break;
948         default:
949           type = TOKEN_SIMPLE;
950           break;
951         }
952       obstack_1grow (&token_stack, ch);
953     }
954   else
955     {
956       bool fast = lquote.length == 1 && rquote.length == 1;
957       quote_level = 1;
958       while (1)
959         {
960           /* Try scanning a buffer first.  */
961           const char *buffer = (isp && isp->type == INPUT_STRING
962                                 ? isp->u.u_s.string : NULL);
963           if (buffer && *buffer)
964             {
965               size_t len = isp->u.u_s.end - buffer;
966               const char *p = buffer;
967               do
968                 {
969                   p = (char *) memchr2 (p, *lquote.string, *rquote.string,
970                                         buffer + len - p);
971                 }
972               while (p && fast && (*p++ == *rquote.string
973                                    ? --quote_level : ++quote_level));
974               if (p)
975                 {
976                   if (fast)
977                     {
978                       assert (!quote_level);
979                       obstack_grow (&token_stack, buffer, p - buffer - 1);
980                       isp->u.u_s.string += p - buffer;
981                       break;
982                     }
983                   obstack_grow (&token_stack, buffer, p - buffer);
984                   ch = to_uchar (*p);
985                   isp->u.u_s.string += p - buffer + 1;
986                 }
987               else
988                 {
989                   obstack_grow (&token_stack, buffer, len);
990                   isp->u.u_s.string += len;
991                   continue;
992                 }
993             }
994           /* Fall back to a byte.  */
995           else
996             ch = next_char ();
997           if (ch == CHAR_EOF)
998             /* current_file changed to "" if we see CHAR_EOF, use
999                the previous value we stored earlier.  */
1000             m4_failure_at_line (0, file, *line,
1001                                 _("ERROR: end of file in string"));
1002 
1003           if (MATCH (ch, rquote.string, true))
1004             {
1005               if (--quote_level == 0)
1006                 break;
1007               obstack_grow (&token_stack, rquote.string, rquote.length);
1008             }
1009           else if (MATCH (ch, lquote.string, true))
1010             {
1011               quote_level++;
1012               obstack_grow (&token_stack, lquote.string, lquote.length);
1013             }
1014           else
1015             obstack_1grow (&token_stack, ch);
1016         }
1017       type = TOKEN_STRING;
1018     }
1019 
1020   obstack_1grow (&token_stack, '\0');
1021 
1022   TOKEN_DATA_TYPE (td) = TOKEN_TEXT;
1023   TOKEN_DATA_TEXT (td) = (char *) obstack_finish (&token_stack);
1024 #ifdef ENABLE_CHANGEWORD
1025   if (orig_text == NULL)
1026     orig_text = TOKEN_DATA_TEXT (td);
1027   TOKEN_DATA_ORIG_TEXT (td) = orig_text;
1028 #endif
1029 #ifdef DEBUG_INPUT
1030   xfprintf (stderr, "next_token -> %s (%s)\n",
1031             token_type_string (type), TOKEN_DATA_TEXT (td));
1032 #endif
1033   return type;
1034 }
1035 
1036 /*-----------------------------------------------.
1037 | Peek at the next token from the input stream.  |
1038 `-----------------------------------------------*/
1039 
1040 token_type
peek_token(void)1041 peek_token (void)
1042 {
1043   token_type result;
1044   int ch = peek_input ();
1045 
1046   if (ch == CHAR_EOF)
1047     {
1048       result = TOKEN_EOF;
1049     }
1050   else if (ch == CHAR_MACRO)
1051     {
1052       result = TOKEN_MACDEF;
1053     }
1054   else if (MATCH (ch, bcomm.string, false))
1055     {
1056       result = TOKEN_STRING;
1057     }
1058   else if ((default_word_regexp && (c_isalpha (ch) || ch == '_'))
1059 #ifdef ENABLE_CHANGEWORD
1060            || (! default_word_regexp && word_regexp.fastmap[ch])
1061 #endif /* ENABLE_CHANGEWORD */
1062            )
1063     {
1064       result = TOKEN_WORD;
1065     }
1066   else if (MATCH (ch, lquote.string, false))
1067     {
1068       result = TOKEN_STRING;
1069     }
1070   else
1071     switch (ch)
1072       {
1073       case '(':
1074         result = TOKEN_OPEN;
1075         break;
1076       case ',':
1077         result = TOKEN_COMMA;
1078         break;
1079       case ')':
1080         result = TOKEN_CLOSE;
1081         break;
1082       default:
1083         result = TOKEN_SIMPLE;
1084       }
1085 
1086 #ifdef DEBUG_INPUT
1087   xfprintf (stderr, "peek_token -> %s\n", token_type_string (result));
1088 #endif /* DEBUG_INPUT */
1089   return result;
1090 }
1091 
1092 
1093 #ifdef DEBUG_INPUT
1094 
1095 static const char *
token_type_string(token_type t)1096 token_type_string (token_type t)
1097 {
1098  switch (t)
1099     { /* TOKSW */
1100     case TOKEN_EOF:
1101       return "EOF";
1102     case TOKEN_STRING:
1103       return "STRING";
1104     case TOKEN_WORD:
1105       return "WORD";
1106     case TOKEN_OPEN:
1107       return "OPEN";
1108     case TOKEN_COMMA:
1109       return "COMMA";
1110     case TOKEN_CLOSE:
1111       return "CLOSE";
1112     case TOKEN_SIMPLE:
1113       return "SIMPLE";
1114     case TOKEN_MACDEF:
1115       return "MACDEF";
1116     default:
1117       abort ();
1118     }
1119  }
1120 
1121 static void
print_token(const char * s,token_type t,token_data * td)1122 print_token (const char *s, token_type t, token_data *td)
1123 {
1124   xfprintf (stderr, "%s: ", s);
1125   switch (t)
1126     { /* TOKSW */
1127     case TOKEN_OPEN:
1128     case TOKEN_COMMA:
1129     case TOKEN_CLOSE:
1130     case TOKEN_SIMPLE:
1131       xfprintf (stderr, "char:");
1132       break;
1133 
1134     case TOKEN_WORD:
1135       xfprintf (stderr, "word:");
1136       break;
1137 
1138     case TOKEN_STRING:
1139       xfprintf (stderr, "string:");
1140       break;
1141 
1142     case TOKEN_MACDEF:
1143       xfprintf (stderr, "macro: %p\n", TOKEN_DATA_FUNC (td));
1144       break;
1145 
1146     case TOKEN_EOF:
1147       xfprintf (stderr, "eof\n");
1148       break;
1149     }
1150   xfprintf (stderr, "\t\"%s\"\n", TOKEN_DATA_TEXT (td));
1151 }
1152 
1153 static void MAYBE_UNUSED
lex_debug(void)1154 lex_debug (void)
1155 {
1156   token_type t;
1157   token_data td;
1158 
1159   while ((t = next_token (&td, NULL)) != TOKEN_EOF)
1160     print_token ("lex", t, &td);
1161 }
1162 #endif /* DEBUG_INPUT */
1163