1 /* This is the Assembler Pre-Processor
2    Copyright (C) 1987-2021 Free Software Foundation, Inc.
3 
4    This file is part of GAS, the GNU Assembler.
5 
6    GAS is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3, or (at your option)
9    any later version.
10 
11    GAS is distributed in the hope that it will be useful, but WITHOUT
12    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
14    License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with GAS; see the file COPYING.  If not, write to the Free
18    Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
19    02110-1301, USA.  */
20 
21 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
22 /* App, the assembler pre-processor.  This pre-processor strips out
23    excess spaces, turns single-quoted characters into a decimal
24    constant, and turns the # in # <number> <filename> <garbage> into a
25    .linefile.  This needs better error-handling.  */
26 
27 #include "as.h"
28 
/* Compilers that do not claim ISO C conformance may not understand
   `const'; make the keyword vanish there unless something has already
   defined it.  */
#if (__STDC__ != 1)
#ifndef const
#define const  /* empty */
#endif
#endif

#ifdef H_TICK_HEX
/* When nonzero, do_scrub_begin classifies 'h' and 'H' as LEX_IS_H.
   Only present for targets that define H_TICK_HEX.  */
int enable_h_tick_hex = 0;
#endif

#ifdef TC_M68K
/* Whether we are scrubbing in m68k MRI mode.  This is different from
   flag_m68k_mri, because the two flags will be affected by the .mri
   pseudo-op at different times.  */
static int scrub_m68k_mri;

/* The pseudo-op which switches in and out of MRI mode.  See the
   comment in do_scrub_chars.  */
static const char mri_pseudo[] = ".mri 0";
#else
/* Other targets never scrub in MRI mode.  A constant zero lets the
   MRI-only branches be written without further #ifdefs.  */
#define scrub_m68k_mri 0
#endif

#if defined TC_ARM && defined OBJ_ELF
/* The pseudo-op for which we need to special-case `@' characters.
   See the comment in do_scrub_chars.  */
static const char   symver_pseudo[] = ".symver";
/* Position reached in symver_pseudo while matching the input against
   it; NULL when no match is under way.  */
static const char * symver_state;
#endif
58 
/* Part of the scrubber state that app_push saves and app_pop
   restores.  */
static char last_char;

/* Classification of each input byte, indexed by unsigned character
   value.  do_scrub_begin fills it in; entries it never assigns stay
   zero.  */
static char lex[256];

/* Characters that do_scrub_begin classifies as symbol components
   before applying the more specific classifications (comment
   characters, separators, ...) which may override them.  */
static const char symbol_chars[] =
"$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";

/* Values stored in lex[].  The value 7 is not assigned to any
   class.  */
#define LEX_IS_SYMBOL_COMPONENT		1
#define LEX_IS_WHITESPACE		2
#define LEX_IS_LINE_SEPARATOR		3
#define LEX_IS_COMMENT_START		4
#define LEX_IS_LINE_COMMENT_START	5
#define LEX_IS_TWOCHAR_COMMENT_1ST	6
#define LEX_IS_STRINGQUOTE		8
#define LEX_IS_COLON			9
#define LEX_IS_NEWLINE			10
#define LEX_IS_ONECHAR_QUOTE		11
#ifdef TC_V850
#define LEX_IS_DOUBLEDASH_1ST		12
#endif
#ifdef TC_M32R
#define DOUBLEBAR_PARALLEL
#endif
#ifdef DOUBLEBAR_PARALLEL
#define LEX_IS_DOUBLEBAR_1ST		13
#endif
#define LEX_IS_PARALLEL_SEPARATOR	14
#ifdef H_TICK_HEX
#define LEX_IS_H			15
#endif

/* Predicates on the class of character C.  C is used directly as an
   index into lex[], so it must lie in the range 0..255; in particular
   EOF has to be tested for before any of these is applied.  */
#define IS_SYMBOL_COMPONENT(c)		(lex[c] == LEX_IS_SYMBOL_COMPONENT)
#define IS_WHITESPACE(c)		(lex[c] == LEX_IS_WHITESPACE)
#define IS_LINE_SEPARATOR(c)		(lex[c] == LEX_IS_LINE_SEPARATOR)
#define IS_PARALLEL_SEPARATOR(c)	(lex[c] == LEX_IS_PARALLEL_SEPARATOR)
#define IS_COMMENT(c)			(lex[c] == LEX_IS_COMMENT_START)
#define IS_LINE_COMMENT(c)		(lex[c] == LEX_IS_LINE_COMMENT_START)
#define IS_NEWLINE(c)			(lex[c] == LEX_IS_NEWLINE)

static int process_escape (int);
97 
98 /* FIXME-soon: The entire lexer/parser thingy should be
99    built statically at compile time rather than dynamically
100    each and every time the assembler is run.  xoxorich.  */
101 
/* Record KIND in the lex table for every character of the
   NUL-terminated string CHARS.  */

static void
scrub_mark_chars (const char *chars, int kind)
{
  for (; *chars != '\0'; ++chars)
    lex[(unsigned char) *chars] = kind;
}

/* Fill in the lex character-classification table that do_scrub_chars
   consults.

   M68K_MRI is only examined when TC_M68K is defined; there a nonzero
   value selects the MRI flavour of the table and is remembered in
   scrub_m68k_mri.

   The table is not cleared first.  This function can run more than
   once (do_scrub_chars calls it again when it recognises a .mri
   pseudo-op), and any entry set by an earlier call which is not
   assigned again below keeps its old value.

   The order of the assignments matters: a later classification of a
   character replaces an earlier one.  */

void
do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
{
  int c;

  lex[' '] = LEX_IS_WHITESPACE;
  lex['\t'] = LEX_IS_WHITESPACE;
  lex['\r'] = LEX_IS_WHITESPACE;
  lex['\n'] = LEX_IS_NEWLINE;
  lex[':'] = LEX_IS_COLON;

#ifdef TC_M68K
  scrub_m68k_mri = m68k_mri;

  if (! m68k_mri)
#endif
    {
      lex['"'] = LEX_IS_STRINGQUOTE;

#if ! defined (TC_HPPA)
      lex['\''] = LEX_IS_ONECHAR_QUOTE;
#endif

#ifdef SINGLE_QUOTE_STRINGS
      lex['\''] = LEX_IS_STRINGQUOTE;
#endif
    }

  /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
     in state 5 of do_scrub_chars must be changed.  */

  /* Note that these override the previous defaults, e.g. if ';' is a
     comment char, then it isn't a line separator.  */
  scrub_mark_chars (symbol_chars, LEX_IS_SYMBOL_COMPONENT);

  /* Every character with the high bit set counts as a symbol
     component.  */
  for (c = 128; c < 256; ++c)
    lex[c] = LEX_IS_SYMBOL_COMPONENT;

#ifdef tc_symbol_chars
  /* This macro permits the processor to specify all characters which
     may appear in an operand.  This will prevent the scrubber from
     discarding meaningful whitespace in certain cases.  The i386
     backend uses this to support prefixes, which can confuse the
     scrubber as to whether it is parsing operands or opcodes.  */
  scrub_mark_chars (tc_symbol_chars, LEX_IS_SYMBOL_COMPONENT);
#endif

  /* The m68k backend wants to be able to change comment_chars.  */
#ifndef tc_comment_chars
#define tc_comment_chars comment_chars
#endif
  scrub_mark_chars (tc_comment_chars, LEX_IS_COMMENT_START);

  scrub_mark_chars (line_comment_chars, LEX_IS_LINE_COMMENT_START);

#ifndef tc_line_separator_chars
#define tc_line_separator_chars line_separator_chars
#endif
  scrub_mark_chars (tc_line_separator_chars, LEX_IS_LINE_SEPARATOR);

#ifdef tc_parallel_separator_chars
  /* This macro permits the processor to specify all characters which
     separate parallel insns on the same line.  */
  scrub_mark_chars (tc_parallel_separator_chars, LEX_IS_PARALLEL_SEPARATOR);
#endif

  /* Only allow slash-star comments if slash is not in use.
     FIXME: This isn't right.  We should always permit them.  */
  if (lex['/'] == 0)
    lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;

#ifdef TC_M68K
  if (m68k_mri)
    {
      lex['\''] = LEX_IS_STRINGQUOTE;
      lex[';'] = LEX_IS_COMMENT_START;
      lex['*'] = LEX_IS_LINE_COMMENT_START;
      /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
	 then it can't be used in an expression.  */
      lex['!'] = LEX_IS_LINE_COMMENT_START;
    }
#endif

#ifdef TC_V850
  lex['-'] = LEX_IS_DOUBLEDASH_1ST;
#endif
#ifdef DOUBLEBAR_PARALLEL
  lex['|'] = LEX_IS_DOUBLEBAR_1ST;
#endif
#ifdef TC_D30V
  /* Must do this if we want VLIW instructions with "->" or "<-".  */
  lex['-'] = LEX_IS_SYMBOL_COMPONENT;
#endif

#ifdef H_TICK_HEX
  if (enable_h_tick_hex)
    {
      lex['h'] = LEX_IS_H;
      lex['H'] = LEX_IS_H;
    }
#endif
}
211 
/* Saved state of the scrubber.  */

/* Current state of the do_scrub_chars state machine.  */
static int state;
/* State to go back to once the string, comment or pending output
   being handled is finished.  */
static int old_state;
/* Next character to emit while in state -1.  */
static const char *out_string;
/* Holds the decimal text generated for a one-character quote.  */
static char out_buf[20];
/* Number of newlines swallowed inside comments and continued strings
   that still have to be emitted.  */
static int add_newlines;
/* If non-NULL, input that do_scrub_chars resumes from instead of
   fetching more.  */
static char *saved_input;
/* Number of bytes available at saved_input.  */
static size_t saved_input_len;
/* Buffer that do_scrub_chars asks its GET callback to fill.  */
static char input_buffer[32 * 1024];
/* Position reached in the .mri pseudo-op text while matching the
   input against it; NULL when no match is under way.  */
static const char *mri_state;
/* Last character accepted by the .mri matcher.  */
static char mri_last_ch;
223 
224 /* Data structure for saving the state of app across #include's.  Note that
225    app is called asynchronously to the parsing of the .include's, so our
226    state at the time .include is interpreted is completely unrelated.
227    That's why we have to save it all.  */
228 
/* Snapshot of the scrubber's file-scope state, filled in by app_push
   and consumed by app_pop.  Each member mirrors the file-scope
   variable of the same name.  */
struct app_save
{
  int          state;
  int          old_state;
  const char * out_string;
  char         out_buf[sizeof (out_buf)];
  int          add_newlines;
  /* Separately allocated copy of the pending input, or NULL if there
     was none.  */
  char *       saved_input;
  /* Size of that copy; only meaningful when saved_input is not
     NULL.  */
  size_t       saved_input_len;
#ifdef TC_M68K
  int          scrub_m68k_mri;
#endif
  const char * mri_state;
  char         mri_last_ch;
#if defined TC_ARM && defined OBJ_ELF
  const char * symver_state;
#endif
  char         last_char;
};
248 
/* Save the scrubber state in a freshly allocated struct app_save and
   then reset state, saved_input and add_newlines so that scrubbing
   starts afresh.

   Pending input is copied into separately allocated memory rather
   than referenced in place (presumably because it lives in
   input_buffer, which will be reused -- confirm against
   do_scrub_chars).

   Returns the snapshot as an opaque pointer.  Pass it to app_pop,
   which restores the state and frees the snapshot.  */

char *
app_push (void)
{
  struct app_save *saved;

  saved = XNEW (struct app_save);
  saved->state = state;
  saved->old_state = old_state;
  saved->out_string = out_string;
  memcpy (saved->out_buf, out_buf, sizeof (out_buf));
  saved->add_newlines = add_newlines;
  if (saved_input == NULL)
    {
      saved->saved_input = NULL;
      /* app_pop does not read the length in this case, but do not
	 leave the member indeterminate.  */
      saved->saved_input_len = 0;
    }
  else
    {
      saved->saved_input = XNEWVEC (char, saved_input_len);
      memcpy (saved->saved_input, saved_input, saved_input_len);
      saved->saved_input_len = saved_input_len;
    }
#ifdef TC_M68K
  saved->scrub_m68k_mri = scrub_m68k_mri;
#endif
  saved->mri_state = mri_state;
  saved->mri_last_ch = mri_last_ch;
#if defined TC_ARM && defined OBJ_ELF
  saved->symver_state = symver_state;
#endif
  saved->last_char = last_char;

  /* do_scrub_begin() is not useful, just wastes time.  */

  state = 0;
  saved_input = NULL;
  add_newlines = 0;

  return (char *) saved;
}
286 
287 void
app_pop(char * arg)288 app_pop (char *arg)
289 {
290   struct app_save *saved = (struct app_save *) arg;
291 
292   /* There is no do_scrub_end ().  */
293   state = saved->state;
294   old_state = saved->old_state;
295   out_string = saved->out_string;
296   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
297   add_newlines = saved->add_newlines;
298   if (saved->saved_input == NULL)
299     saved_input = NULL;
300   else
301     {
302       gas_assert (saved->saved_input_len <= sizeof (input_buffer));
303       memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
304       saved_input = input_buffer;
305       saved_input_len = saved->saved_input_len;
306       free (saved->saved_input);
307     }
308 #ifdef TC_M68K
309   scrub_m68k_mri = saved->scrub_m68k_mri;
310 #endif
311   mri_state = saved->mri_state;
312   mri_last_ch = saved->mri_last_ch;
313 #if defined TC_ARM && defined OBJ_ELF
314   symver_state = saved->symver_state;
315 #endif
316   last_char = saved->last_char;
317 
318   free (arg);
319 }
320 
321 /* @@ This assumes that \n &c are the same on host and target.  This is not
322    necessarily true.  */
323 
/* Translate CH, the character that followed a backslash in a
   one-character quote, into the character the escape stands for.
   \b, \f, \n, \r and \t yield the corresponding control characters.
   Every other value, both quote characters included, is returned
   unchanged.  */

static int
process_escape (int ch)
{
  switch (ch)
    {
    case 'b':
      return '\b';
    case 'f':
      return '\f';
    case 'n':
      return '\n';
    case 'r':
      return '\r';
    case 't':
      return '\t';
    case '\'':
    case '"':
      /* The quote characters stand for themselves.  */
    default:
      return ch;
    }
}
347 
348 /* This function is called to process input characters.  The GET
349    parameter is used to retrieve more input characters.  GET should
350    set its parameter to point to a buffer, and return the length of
351    the buffer; it should return 0 at end of file.  The scrubbed output
352    characters are put into the buffer starting at TOSTART; the TOSTART
353    buffer is TOLEN bytes in length.  The function returns the number
354    of scrubbed characters put into TOSTART.  This will be TOLEN unless
355    end of file was seen.  This function is arranged as a state
356    machine, and saves its state so that it may return at any point.
357    This is the way the old code used to work.  */
358 
359 size_t
do_scrub_chars(size_t (* get)(char *,size_t),char * tostart,size_t tolen)360 do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen)
361 {
362   char *to = tostart;
363   char *toend = tostart + tolen;
364   char *from;
365   char *fromend;
366   size_t fromlen;
367   int ch, ch2 = 0;
368   /* Character that started the string we're working on.  */
369   static char quotechar;
370 
371   /*State 0: beginning of normal line
372 	  1: After first whitespace on line (flush more white)
373 	  2: After first non-white (opcode) on line (keep 1white)
374 	  3: after second white on line (into operands) (flush white)
375 	  4: after putting out a .linefile, put out digits
376 	  5: parsing a string, then go to old-state
377 	  6: putting out \ escape in a "d string.
378 	  7: no longer used
379 	  8: no longer used
380 	  9: After seeing symbol char in state 3 (keep 1white after symchar)
381 	 10: After seeing whitespace in state 9 (keep white before symchar)
382 	 11: After seeing a symbol character in state 0 (eg a label definition)
383 	 -1: output string in out_string and go to the state in old_state
384 	 -2: flush text until a '*' '/' is seen, then go to state old_state
385 #ifdef TC_V850
386 	 12: After seeing a dash, looking for a second dash as a start
387 	     of comment.
388 #endif
389 #ifdef DOUBLEBAR_PARALLEL
390 	 13: After seeing a vertical bar, looking for a second
391 	     vertical bar as a parallel expression separator.
392 #endif
393 #ifdef TC_PREDICATE_START_CHAR
394 	 14: After seeing a predicate start character at state 0, looking
395 	     for a predicate end character as predicate.
396 	 15: After seeing a predicate start character at state 1, looking
397 	     for a predicate end character as predicate.
398 #endif
399 #ifdef TC_Z80
400 	 16: After seeing an 'a' or an 'A' at the start of a symbol
401 	 17: After seeing an 'f' or an 'F' in state 16
402 #endif
403 	  */
404 
405   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
406      constructs like ``.loc 1 20''.  This was turning into ``.loc
407      120''.  States 9 and 10 ensure that a space is never dropped in
408      between characters which could appear in an identifier.  Ian
409      Taylor, ian@cygnus.com.
410 
411      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
412      correctly on the PA (and any other target where colons are optional).
413      Jeff Law, law@cs.utah.edu.
414 
415      I added state 13 so that something like "cmp r1, r2 || trap #1" does not
416      get squashed into "cmp r1,r2||trap#1", with the all important space
417      between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
418 
419   /* This macro gets the next input character.  */
420 
421 #define GET()							\
422   (from < fromend						\
423    ? * (unsigned char *) (from++)				\
424    : (saved_input = NULL,					\
425       fromlen = (*get) (input_buffer, sizeof input_buffer),	\
426       from = input_buffer,					\
427       fromend = from + fromlen,					\
428       (fromlen == 0						\
429        ? EOF							\
430        : * (unsigned char *) (from++))))
431 
432   /* This macro pushes a character back on the input stream.  */
433 
434 #define UNGET(uch) (*--from = (uch))
435 
436   /* This macro puts a character into the output buffer.  If this
437      character fills the output buffer, this macro jumps to the label
438      TOFULL.  We use this rather ugly approach because we need to
439      handle two different termination conditions: EOF on the input
440      stream, and a full output buffer.  It would be simpler if we
441      always read in the entire input stream before processing it, but
442      I don't want to make such a significant change to the assembler's
443      memory usage.  */
444 
445 #define PUT(pch)				\
446   do						\
447     {						\
448       *to++ = (pch);				\
449       if (to >= toend)				\
450 	goto tofull;				\
451     }						\
452   while (0)
453 
454   if (saved_input != NULL)
455     {
456       from = saved_input;
457       fromend = from + saved_input_len;
458     }
459   else
460     {
461       fromlen = (*get) (input_buffer, sizeof input_buffer);
462       if (fromlen == 0)
463 	return 0;
464       from = input_buffer;
465       fromend = from + fromlen;
466     }
467 
468   while (1)
469     {
470       /* The cases in this switch end with continue, in order to
471 	 branch back to the top of this while loop and generate the
472 	 next output character in the appropriate state.  */
473       switch (state)
474 	{
475 	case -1:
476 	  ch = *out_string++;
477 	  if (*out_string == '\0')
478 	    {
479 	      state = old_state;
480 	      old_state = 3;
481 	    }
482 	  PUT (ch);
483 	  continue;
484 
485 	case -2:
486 	  for (;;)
487 	    {
488 	      do
489 		{
490 		  ch = GET ();
491 
492 		  if (ch == EOF)
493 		    {
494 		      as_warn (_("end of file in comment"));
495 		      goto fromeof;
496 		    }
497 
498 		  if (ch == '\n')
499 		    PUT ('\n');
500 		}
501 	      while (ch != '*');
502 
503 	      while ((ch = GET ()) == '*')
504 		;
505 
506 	      if (ch == EOF)
507 		{
508 		  as_warn (_("end of file in comment"));
509 		  goto fromeof;
510 		}
511 
512 	      if (ch == '/')
513 		break;
514 
515 	      UNGET (ch);
516 	    }
517 
518 	  state = old_state;
519 	  UNGET (' ');
520 	  continue;
521 
522 	case 4:
523 	  ch = GET ();
524 	  if (ch == EOF)
525 	    goto fromeof;
526 	  else if (ch >= '0' && ch <= '9')
527 	    PUT (ch);
528 	  else
529 	    {
530 	      while (ch != EOF && IS_WHITESPACE (ch))
531 		ch = GET ();
532 	      if (ch == '"')
533 		{
534 		  quotechar = ch;
535 		  state = 5;
536 		  old_state = 3;
537 		  PUT (ch);
538 		}
539 	      else
540 		{
541 		  while (ch != EOF && ch != '\n')
542 		    ch = GET ();
543 		  state = 0;
544 		  PUT (ch);
545 		}
546 	    }
547 	  continue;
548 
549 	case 5:
550 	  /* We are going to copy everything up to a quote character,
551 	     with special handling for a backslash.  We try to
552 	     optimize the copying in the simple case without using the
553 	     GET and PUT macros.  */
554 	  {
555 	    char *s;
556 	    ptrdiff_t len;
557 
558 	    for (s = from; s < fromend; s++)
559 	      {
560 		ch = *s;
561 		if (ch == '\\'
562 		    || ch == quotechar
563 		    || ch == '\n')
564 		  break;
565 	      }
566 	    len = s - from;
567 	    if (len > toend - to)
568 	      len = toend - to;
569 	    if (len > 0)
570 	      {
571 		memcpy (to, from, len);
572 		to += len;
573 		from += len;
574 		if (to >= toend)
575 		  goto tofull;
576 	      }
577 	  }
578 
579 	  ch = GET ();
580 	  if (ch == EOF)
581 	    {
582 	      /* This buffer is here specifically so
583 		 that the UNGET below will work.  */
584 	      static char one_char_buf[1];
585 
586 	      as_warn (_("end of file in string; '%c' inserted"), quotechar);
587 	      state = old_state;
588 	      from = fromend = one_char_buf + 1;
589 	      fromlen = 1;
590 	      UNGET ('\n');
591 	      PUT (quotechar);
592 	    }
593 	  else if (ch == quotechar)
594 	    {
595 	      state = old_state;
596 	      PUT (ch);
597 	    }
598 	  else if (TC_STRING_ESCAPES && ch == '\\')
599 	    {
600 	      state = 6;
601 	      PUT (ch);
602 	    }
603 	  else if (scrub_m68k_mri && ch == '\n')
604 	    {
605 	      /* Just quietly terminate the string.  This permits lines like
606 		   bne	label	loop if we haven't reach end yet.  */
607 	      state = old_state;
608 	      UNGET (ch);
609 	      PUT ('\'');
610 	    }
611 	  else
612 	    {
613 	      PUT (ch);
614 	    }
615 	  continue;
616 
617 	case 6:
618 	  state = 5;
619 	  ch = GET ();
620 	  switch (ch)
621 	    {
622 	      /* Handle strings broken across lines, by turning '\n' into
623 		 '\\' and 'n'.  */
624 	    case '\n':
625 	      UNGET ('n');
626 	      add_newlines++;
627 	      PUT ('\\');
628 	      continue;
629 
630 	    case EOF:
631 	      as_warn (_("end of file in string; '%c' inserted"), quotechar);
632 	      PUT (quotechar);
633 	      continue;
634 
635 	    case '"':
636 	    case '\\':
637 	    case 'b':
638 	    case 'f':
639 	    case 'n':
640 	    case 'r':
641 	    case 't':
642 	    case 'v':
643 	    case 'x':
644 	    case 'X':
645 	    case '0':
646 	    case '1':
647 	    case '2':
648 	    case '3':
649 	    case '4':
650 	    case '5':
651 	    case '6':
652 	    case '7':
653 	      break;
654 
655 	    default:
656 #ifdef ONLY_STANDARD_ESCAPES
657 	      as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
658 #endif
659 	      break;
660 	    }
661 	  PUT (ch);
662 	  continue;
663 
664 #ifdef DOUBLEBAR_PARALLEL
665 	case 13:
666 	  ch = GET ();
667 	  if (ch != '|')
668 	    abort ();
669 
670 	  /* Reset back to state 1 and pretend that we are parsing a
671 	     line from just after the first white space.  */
672 	  state = 1;
673 	  PUT ('|');
674 #ifdef TC_TIC6X
675 	  /* "||^" is used for SPMASKed instructions.  */
676 	  ch = GET ();
677 	  if (ch == EOF)
678 	    goto fromeof;
679 	  else if (ch == '^')
680 	    PUT ('^');
681 	  else
682 	    UNGET (ch);
683 #endif
684 	  continue;
685 #endif
686 #ifdef TC_Z80
687 	case 16:
688 	  /* We have seen an 'a' at the start of a symbol, look for an 'f'.  */
689 	  ch = GET ();
690 	  if (ch == 'f' || ch == 'F')
691 	    {
692 	      state = 17;
693 	      PUT (ch);
694 	    }
695 	  else
696 	    {
697 	      state = 9;
698 	      break;
699 	    }
700 	  /* Fall through.  */
701 	case 17:
702 	  /* We have seen "af" at the start of a symbol,
703 	     a ' here is a part of that symbol.  */
704 	  ch = GET ();
705 	  state = 9;
706 	  if (ch == '\'')
707 	    /* Change to avoid warning about unclosed string.  */
708 	    PUT ('`');
709 	  else if (ch != EOF)
710 	    UNGET (ch);
711 	  break;
712 #endif
713 	}
714 
715       /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
716 
717       /* flushchar: */
718       ch = GET ();
719 
720 #ifdef TC_PREDICATE_START_CHAR
721       if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1))
722 	{
723 	  state += 14;
724 	  PUT (ch);
725 	  continue;
726 	}
727       else if (state == 14 || state == 15)
728 	{
729 	  if (ch == TC_PREDICATE_END_CHAR)
730 	    {
731 	      state -= 14;
732 	      PUT (ch);
733 	      ch = GET ();
734 	    }
735 	  else
736 	    {
737 	      PUT (ch);
738 	      continue;
739 	    }
740 	}
741 #endif
742 
743     recycle:
744 
745 #if defined TC_ARM && defined OBJ_ELF
746       /* We need to watch out for .symver directives.  See the comment later
747 	 in this function.  */
748       if (symver_state == NULL)
749 	{
750 	  if ((state == 0 || state == 1) && ch == symver_pseudo[0])
751 	    symver_state = symver_pseudo + 1;
752 	}
753       else
754 	{
755 	  /* We advance to the next state if we find the right
756 	     character.  */
757 	  if (ch != '\0' && (*symver_state == ch))
758 	    ++symver_state;
759 	  else if (*symver_state != '\0')
760 	    /* We did not get the expected character, or we didn't
761 	       get a valid terminating character after seeing the
762 	       entire pseudo-op, so we must go back to the beginning.  */
763 	    symver_state = NULL;
764 	  else
765 	    {
766 	      /* We've read the entire pseudo-op.  If this is the end
767 		 of the line, go back to the beginning.  */
768 	      if (IS_NEWLINE (ch))
769 		symver_state = NULL;
770 	    }
771 	}
772 #endif /* TC_ARM && OBJ_ELF */
773 
774 #ifdef TC_M68K
775       /* We want to have pseudo-ops which control whether we are in
776 	 MRI mode or not.  Unfortunately, since m68k MRI mode affects
777 	 the scrubber, that means that we need a special purpose
778 	 recognizer here.  */
779       if (mri_state == NULL)
780 	{
781 	  if ((state == 0 || state == 1)
782 	      && ch == mri_pseudo[0])
783 	    mri_state = mri_pseudo + 1;
784 	}
785       else
786 	{
787 	  /* We advance to the next state if we find the right
788 	     character, or if we need a space character and we get any
789 	     whitespace character, or if we need a '0' and we get a
790 	     '1' (this is so that we only need one state to handle
791 	     ``.mri 0'' and ``.mri 1'').  */
792 	  if (ch != '\0'
793 	      && (*mri_state == ch
794 		  || (*mri_state == ' '
795 		      && lex[ch] == LEX_IS_WHITESPACE)
796 		  || (*mri_state == '0'
797 		      && ch == '1')))
798 	    {
799 	      mri_last_ch = ch;
800 	      ++mri_state;
801 	    }
802 	  else if (*mri_state != '\0'
803 		   || (lex[ch] != LEX_IS_WHITESPACE
804 		       && lex[ch] != LEX_IS_NEWLINE))
805 	    {
806 	      /* We did not get the expected character, or we didn't
807 		 get a valid terminating character after seeing the
808 		 entire pseudo-op, so we must go back to the
809 		 beginning.  */
810 	      mri_state = NULL;
811 	    }
812 	  else
813 	    {
	      /* We've read the entire pseudo-op.  mri_last_ch is
		 either '0' or '1', indicating whether to leave or
		 enter MRI mode respectively.  */
817 	      do_scrub_begin (mri_last_ch == '1');
818 	      mri_state = NULL;
819 
820 	      /* We continue handling the character as usual.  The
821 		 main gas reader must also handle the .mri pseudo-op
822 		 to control expression parsing and the like.  */
823 	    }
824 	}
825 #endif
826 
827       if (ch == EOF)
828 	{
829 	  if (state != 0)
830 	    {
831 	      as_warn (_("end of file not at end of a line; newline inserted"));
832 	      state = 0;
833 	      PUT ('\n');
834 	    }
835 	  goto fromeof;
836 	}
837 
838       switch (lex[ch])
839 	{
840 	case LEX_IS_WHITESPACE:
841 	  do
842 	    {
843 	      ch = GET ();
844 	    }
845 	  while (ch != EOF && IS_WHITESPACE (ch));
846 	  if (ch == EOF)
847 	    goto fromeof;
848 
849 	  if (state == 0)
850 	    {
851 	      /* Preserve a single whitespace character at the
852 		 beginning of a line.  */
853 	      state = 1;
854 	      UNGET (ch);
855 	      PUT (' ');
856 	      break;
857 	    }
858 
859 #ifdef KEEP_WHITE_AROUND_COLON
860 	  if (lex[ch] == LEX_IS_COLON)
861 	    {
862 	      /* Only keep this white if there's no white *after* the
863 		 colon.  */
864 	      ch2 = GET ();
865 	      if (ch2 != EOF)
866 		UNGET (ch2);
867 	      if (!IS_WHITESPACE (ch2))
868 		{
869 		  state = 9;
870 		  UNGET (ch);
871 		  PUT (' ');
872 		  break;
873 		}
874 	    }
875 #endif
876 	  if (IS_COMMENT (ch)
877 	      || IS_LINE_SEPARATOR (ch)
878 	      || IS_PARALLEL_SEPARATOR (ch))
879 	    {
880 	      if (scrub_m68k_mri)
881 		{
882 		  /* In MRI mode, we keep these spaces.  */
883 		  UNGET (ch);
884 		  PUT (' ');
885 		  break;
886 		}
887 	      goto recycle;
888 	    }
889 
890 	  /* If we're in state 2 or 11, we've seen a non-white
891 	     character followed by whitespace.  If the next character
892 	     is ':', this is whitespace after a label name which we
893 	     normally must ignore.  In MRI mode, though, spaces are
894 	     not permitted between the label and the colon.  */
895 	  if ((state == 2 || state == 11)
896 	      && lex[ch] == LEX_IS_COLON
897 	      && ! scrub_m68k_mri)
898 	    {
899 	      state = 1;
900 	      PUT (ch);
901 	      break;
902 	    }
903 
904 	  switch (state)
905 	    {
906 	    case 1:
907 	      /* We can arrive here if we leave a leading whitespace
908 		 character at the beginning of a line.  */
909 	      goto recycle;
910 	    case 2:
911 	      state = 3;
912 	      if (to + 1 < toend)
913 		{
914 		  /* Optimize common case by skipping UNGET/GET.  */
915 		  PUT (' ');	/* Sp after opco */
916 		  goto recycle;
917 		}
918 	      UNGET (ch);
919 	      PUT (' ');
920 	      break;
921 	    case 3:
922 #ifndef TC_KEEP_OPERAND_SPACES
923 	      /* For TI C6X, we keep these spaces as they may separate
924 		 functional unit specifiers from operands.  */
925 	      if (scrub_m68k_mri)
926 #endif
927 		{
928 		  /* In MRI mode, we keep these spaces.  */
929 		  UNGET (ch);
930 		  PUT (' ');
931 		  break;
932 		}
933 	      goto recycle;	/* Sp in operands */
934 	    case 9:
935 	    case 10:
936 #ifndef TC_KEEP_OPERAND_SPACES
937 	      if (scrub_m68k_mri)
938 #endif
939 		{
940 		  /* In MRI mode, we keep these spaces.  */
941 		  state = 3;
942 		  UNGET (ch);
943 		  PUT (' ');
944 		  break;
945 		}
946 	      state = 10;	/* Sp after symbol char */
947 	      goto recycle;
948 	    case 11:
949 	      if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
950 		state = 1;
951 	      else
952 		{
953 		  /* We know that ch is not ':', since we tested that
954 		     case above.  Therefore this is not a label, so it
955 		     must be the opcode, and we've just seen the
956 		     whitespace after it.  */
957 		  state = 3;
958 		}
959 	      UNGET (ch);
960 	      PUT (' ');	/* Sp after label definition.  */
961 	      break;
962 	    default:
963 	      BAD_CASE (state);
964 	    }
965 	  break;
966 
967 	case LEX_IS_TWOCHAR_COMMENT_1ST:
968 	  ch2 = GET ();
969 	  if (ch2 == '*')
970 	    {
971 	      for (;;)
972 		{
973 		  do
974 		    {
975 		      ch2 = GET ();
976 		      if (ch2 != EOF && IS_NEWLINE (ch2))
977 			add_newlines++;
978 		    }
979 		  while (ch2 != EOF && ch2 != '*');
980 
981 		  while (ch2 == '*')
982 		    ch2 = GET ();
983 
984 		  if (ch2 == EOF || ch2 == '/')
985 		    break;
986 
987 		  /* This UNGET will ensure that we count newlines
988 		     correctly.  */
989 		  UNGET (ch2);
990 		}
991 
992 	      if (ch2 == EOF)
993 		as_warn (_("end of file in multiline comment"));
994 
995 	      ch = ' ';
996 	      goto recycle;
997 	    }
998 #ifdef DOUBLESLASH_LINE_COMMENTS
999 	  else if (ch2 == '/')
1000 	    {
1001 	      do
1002 		{
1003 		  ch = GET ();
1004 		}
1005 	      while (ch != EOF && !IS_NEWLINE (ch));
1006 	      if (ch == EOF)
1007 		as_warn ("end of file in comment; newline inserted");
1008 	      state = 0;
1009 	      PUT ('\n');
1010 	      break;
1011 	    }
1012 #endif
1013 	  else
1014 	    {
1015 	      if (ch2 != EOF)
1016 		UNGET (ch2);
1017 	      if (state == 9 || state == 10)
1018 		state = 3;
1019 	      PUT (ch);
1020 	    }
1021 	  break;
1022 
1023 	case LEX_IS_STRINGQUOTE:
1024 	  quotechar = ch;
1025 	  if (state == 10)
1026 	    {
1027 	      /* Preserve the whitespace in foo "bar".  */
1028 	      UNGET (ch);
1029 	      state = 3;
1030 	      PUT (' ');
1031 
1032 	      /* PUT didn't jump out.  We could just break, but we
1033 		 know what will happen, so optimize a bit.  */
1034 	      ch = GET ();
1035 	      old_state = 3;
1036 	    }
1037 	  else if (state == 9)
1038 	    old_state = 3;
1039 	  else
1040 	    old_state = state;
1041 	  state = 5;
1042 	  PUT (ch);
1043 	  break;
1044 
1045 	case LEX_IS_ONECHAR_QUOTE:
1046 #ifdef H_TICK_HEX
1047 	  if (state == 9 && enable_h_tick_hex)
1048 	    {
1049 	      char c;
1050 
1051 	      c = GET ();
1052 	      as_warn ("'%c found after symbol", c);
1053 	      UNGET (c);
1054 	    }
1055 #endif
1056 	  if (state == 10)
1057 	    {
1058 	      /* Preserve the whitespace in foo 'b'.  */
1059 	      UNGET (ch);
1060 	      state = 3;
1061 	      PUT (' ');
1062 	      break;
1063 	    }
1064 	  ch = GET ();
1065 	  if (ch == EOF)
1066 	    {
1067 	      as_warn (_("end of file after a one-character quote; \\0 inserted"));
1068 	      ch = 0;
1069 	    }
1070 	  if (ch == '\\')
1071 	    {
1072 	      ch = GET ();
1073 	      if (ch == EOF)
1074 		{
1075 		  as_warn (_("end of file in escape character"));
1076 		  ch = '\\';
1077 		}
1078 	      else
1079 		ch = process_escape (ch);
1080 	    }
1081 	  sprintf (out_buf, "%d", (int) (unsigned char) ch);
1082 
1083 	  /* None of these 'x constants for us.  We want 'x'.  */
1084 	  if ((ch = GET ()) != '\'')
1085 	    {
1086 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1087 	      as_warn (_("missing close quote; (assumed)"));
1088 #else
1089 	      if (ch != EOF)
1090 		UNGET (ch);
1091 #endif
1092 	    }
1093 	  if (strlen (out_buf) == 1)
1094 	    {
1095 	      PUT (out_buf[0]);
1096 	      break;
1097 	    }
1098 	  if (state == 9)
1099 	    old_state = 3;
1100 	  else
1101 	    old_state = state;
1102 	  state = -1;
1103 	  out_string = out_buf;
1104 	  PUT (*out_string++);
1105 	  break;
1106 
1107 	case LEX_IS_COLON:
1108 #ifdef KEEP_WHITE_AROUND_COLON
1109 	  state = 9;
1110 #else
1111 	  if (state == 9 || state == 10)
1112 	    state = 3;
1113 	  else if (state != 3)
1114 	    state = 1;
1115 #endif
1116 	  PUT (ch);
1117 	  break;
1118 
1119 	case LEX_IS_NEWLINE:
1120 	  /* Roll out a bunch of newlines from inside comments, etc.  */
1121 	  if (add_newlines)
1122 	    {
1123 	      --add_newlines;
1124 	      UNGET (ch);
1125 	    }
1126 	  /* Fall through.  */
1127 
1128 	case LEX_IS_LINE_SEPARATOR:
1129 	  state = 0;
1130 	  PUT (ch);
1131 	  break;
1132 
1133 	case LEX_IS_PARALLEL_SEPARATOR:
1134 	  state = 1;
1135 	  PUT (ch);
1136 	  break;
1137 
1138 #ifdef TC_V850
1139 	case LEX_IS_DOUBLEDASH_1ST:
1140 	  ch2 = GET ();
1141 	  if (ch2 != '-')
1142 	    {
1143 	      if (ch2 != EOF)
1144 		UNGET (ch2);
1145 	      goto de_fault;
1146 	    }
1147 	  /* Read and skip to end of line.  */
1148 	  do
1149 	    {
1150 	      ch = GET ();
1151 	    }
1152 	  while (ch != EOF && ch != '\n');
1153 
1154 	  if (ch == EOF)
1155 	    as_warn (_("end of file in comment; newline inserted"));
1156 
1157 	  state = 0;
1158 	  PUT ('\n');
1159 	  break;
1160 #endif
1161 #ifdef DOUBLEBAR_PARALLEL
1162 	case LEX_IS_DOUBLEBAR_1ST:
1163 	  ch2 = GET ();
1164 	  if (ch2 != EOF)
1165 	    UNGET (ch2);
1166 	  if (ch2 != '|')
1167 	    goto de_fault;
1168 
1169 	  /* Handle '||' in two states as invoking PUT twice might
1170 	     result in the first one jumping out of this loop.  We'd
1171 	     then lose track of the state and one '|' char.  */
1172 	  state = 13;
1173 	  PUT ('|');
1174 	  break;
1175 #endif
1176 	case LEX_IS_LINE_COMMENT_START:
1177 	  /* FIXME-someday: The two character comment stuff was badly
1178 	     thought out.  On i386, we want '/' as line comment start
1179 	     AND we want C style comments.  hence this hack.  The
1180 	     whole lexical process should be reworked.  xoxorich.  */
1181 	  if (ch == '/')
1182 	    {
1183 	      ch2 = GET ();
1184 	      if (ch2 == '*')
1185 		{
1186 		  old_state = 3;
1187 		  state = -2;
1188 		  break;
1189 		}
1190 	      else if (ch2 != EOF)
1191 		{
1192 		  UNGET (ch2);
1193 		}
1194 	    }
1195 
1196 	  if (state == 0 || state == 1)	/* Only comment at start of line.  */
1197 	    {
1198 	      int startch;
1199 
1200 	      startch = ch;
1201 
1202 	      do
1203 		{
1204 		  ch = GET ();
1205 		}
1206 	      while (ch != EOF && IS_WHITESPACE (ch));
1207 
1208 	      if (ch == EOF)
1209 		{
1210 		  as_warn (_("end of file in comment; newline inserted"));
1211 		  PUT ('\n');
1212 		  break;
1213 		}
1214 
1215 	      if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1216 		{
1217 		  /* Not a cpp line.  */
1218 		  while (ch != EOF && !IS_NEWLINE (ch))
1219 		    ch = GET ();
1220 		  if (ch == EOF)
1221 		    {
1222 		      as_warn (_("end of file in comment; newline inserted"));
1223 		      PUT ('\n');
1224 		    }
1225 		  else /* IS_NEWLINE (ch) */
1226 		    {
1227 		      /* To process non-zero add_newlines.  */
1228 		      UNGET (ch);
1229 		    }
1230 		  state = 0;
1231 		  break;
1232 		}
1233 	      /* Looks like `# 123 "filename"' from cpp.  */
1234 	      UNGET (ch);
1235 	      old_state = 4;
1236 	      state = -1;
1237 	      if (scrub_m68k_mri)
1238 		out_string = "\tlinefile ";
1239 	      else
1240 		out_string = "\t.linefile ";
1241 	      PUT (*out_string++);
1242 	      break;
1243 	    }
1244 
1245 #ifdef TC_D10V
1246 	  /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1247 	     Trap is the only short insn that has a first operand that is
1248 	     neither register nor label.
1249 	     We must prevent exef0f ||trap #1 to degenerate to exef0f ||trap#1 .
1250 	     We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1251 	     already LEX_IS_LINE_COMMENT_START.  However, it is the
1252 	     only character in line_comment_chars for d10v, hence we
1253 	     can recognize it as such.  */
1254 	  /* An alternative approach would be to reset the state to 1 when
1255 	     we see '||', '<'- or '->', but that seems to be overkill.  */
1256 	  if (state == 10)
1257 	    PUT (' ');
1258 #endif
1259 	  /* We have a line comment character which is not at the
1260 	     start of a line.  If this is also a normal comment
1261 	     character, fall through.  Otherwise treat it as a default
1262 	     character.  */
1263 	  if (strchr (tc_comment_chars, ch) == NULL
1264 	      && (! scrub_m68k_mri
1265 		  || (ch != '!' && ch != '*')))
1266 	    goto de_fault;
1267 	  if (scrub_m68k_mri
1268 	      && (ch == '!' || ch == '*' || ch == '#')
1269 	      && state != 1
1270 	      && state != 10)
1271 	    goto de_fault;
1272 	  /* Fall through.  */
1273 	case LEX_IS_COMMENT_START:
1274 #if defined TC_ARM && defined OBJ_ELF
1275 	  /* On the ARM, `@' is the comment character.
1276 	     Unfortunately this is also a special character in ELF .symver
1277 	     directives (and .type, though we deal with those another way).
1278 	     So we check if this line is such a directive, and treat
1279 	     the character as default if so.  This is a hack.  */
1280 	  if ((symver_state != NULL) && (*symver_state == 0))
1281 	    goto de_fault;
1282 #endif
1283 
1284 	  /* Care is needed not to damage occurrences of \<comment-char>
1285 	     by stripping the <comment-char> onwards.  Yuck.  */
1286 	  if ((to > tostart ? to[-1] : last_char) == '\\')
1287 	    /* Do not treat the <comment-char> as a start-of-comment.  */
1288 	    goto de_fault;
1289 
1290 #ifdef WARN_COMMENTS
1291 	  if (!found_comment)
1292 	    found_comment_file = as_where (&found_comment);
1293 #endif
1294 	  do
1295 	    {
1296 	      ch = GET ();
1297 	    }
1298 	  while (ch != EOF && !IS_NEWLINE (ch));
1299 	  if (ch == EOF)
1300 	    as_warn (_("end of file in comment; newline inserted"));
1301 	  state = 0;
1302 	  PUT ('\n');
1303 	  break;
1304 
1305 #ifdef H_TICK_HEX
1306 	case LEX_IS_H:
1307 	  /* Look for strings like H'[0-9A-Fa-f] and if found, replace
1308 	     the H' with 0x to make them gas-style hex characters.  */
1309 	  if (enable_h_tick_hex)
1310 	    {
1311 	      char quot;
1312 
1313 	      quot = GET ();
1314 	      if (quot == '\'')
1315 		{
1316 		  UNGET ('x');
1317 		  ch = '0';
1318 		}
1319 	      else
1320 		UNGET (quot);
1321 	    }
1322 #endif
1323 	  /* Fall through.  */
1324 
1325 	case LEX_IS_SYMBOL_COMPONENT:
1326 	  if (state == 10)
1327 	    {
1328 	      /* This is a symbol character following another symbol
1329 		 character, with whitespace in between.  We skipped
1330 		 the whitespace earlier, so output it now.  */
1331 	      UNGET (ch);
1332 	      state = 3;
1333 	      PUT (' ');
1334 	      break;
1335 	    }
1336 
1337 #ifdef TC_Z80
1338 	  /* "af'" is a symbol containing '\''.  */
1339 	  if (state == 3 && (ch == 'a' || ch == 'A'))
1340 	    {
1341 	      state = 16;
1342 	      PUT (ch);
1343 	      ch = GET ();
1344 	      if (ch == 'f' || ch == 'F')
1345 		{
1346 		  state = 17;
1347 		  PUT (ch);
1348 		  break;
1349 		}
1350 	      else
1351 		{
1352 		  state = 9;
1353 		  if (ch == EOF || !IS_SYMBOL_COMPONENT (ch))
1354 		    {
1355 		      if (ch != EOF)
1356 			UNGET (ch);
1357 		      break;
1358 		    }
1359 		}
1360 	    }
1361 #endif
1362 	  if (state == 3)
1363 	    state = 9;
1364 
1365 	  /* This is a common case.  Quickly copy CH and all the
1366 	     following symbol component or normal characters.  */
1367 	  if (to + 1 < toend
1368 	      && mri_state == NULL
1369 #if defined TC_ARM && defined OBJ_ELF
1370 	      && symver_state == NULL
1371 #endif
1372 	      )
1373 	    {
1374 	      char *s;
1375 	      ptrdiff_t len;
1376 
1377 	      for (s = from; s < fromend; s++)
1378 		{
1379 		  int type;
1380 
1381 		  ch2 = *(unsigned char *) s;
1382 		  type = lex[ch2];
1383 		  if (type != 0
1384 		      && type != LEX_IS_SYMBOL_COMPONENT)
1385 		    break;
1386 		}
1387 
1388 	      if (s > from)
1389 		/* Handle the last character normally, for
1390 		   simplicity.  */
1391 		--s;
1392 
1393 	      len = s - from;
1394 
1395 	      if (len > (toend - to) - 1)
1396 		len = (toend - to) - 1;
1397 
1398 	      if (len > 0)
1399 		{
1400 		  PUT (ch);
1401 		  memcpy (to, from, len);
1402 		  to += len;
1403 		  from += len;
1404 		  if (to >= toend)
1405 		    goto tofull;
1406 		  ch = GET ();
1407 		}
1408 	    }
1409 
1410 	  /* Fall through.  */
1411 	default:
1412 	de_fault:
1413 	  /* Some relatively `normal' character.  */
1414 	  if (state == 0)
1415 	    {
1416 	      state = 11;	/* Now seeing label definition.  */
1417 	    }
1418 	  else if (state == 1)
1419 	    {
1420 	      state = 2;	/* Ditto.  */
1421 	    }
1422 	  else if (state == 9)
1423 	    {
1424 	      if (!IS_SYMBOL_COMPONENT (ch))
1425 		state = 3;
1426 	    }
1427 	  else if (state == 10)
1428 	    {
1429 	      if (ch == '\\')
1430 		{
1431 		  /* Special handling for backslash: a backslash may
1432 		     be the beginning of a formal parameter (of a
1433 		     macro) following another symbol character, with
1434 		     whitespace in between.  If that is the case, we
1435 		     output a space before the parameter.  Strictly
1436 		     speaking, correct handling depends upon what the
1437 		     macro parameter expands into; if the parameter
1438 		     expands into something which does not start with
1439 		     an operand character, then we don't want to keep
1440 		     the space.  We don't have enough information to
1441 		     make the right choice, so here we are making the
1442 		     choice which is more likely to be correct.  */
1443 		  if (to + 1 >= toend)
1444 		    {
1445 		      /* If we're near the end of the buffer, save the
1446 		         character for the next time round.  Otherwise
1447 		         we'll lose our state.  */
1448 		      UNGET (ch);
1449 		      goto tofull;
1450 		    }
1451 		  *to++ = ' ';
1452 		}
1453 
1454 	      state = 3;
1455 	    }
1456 	  PUT (ch);
1457 	  break;
1458 	}
1459     }
1460 
1461   /*NOTREACHED*/
1462 
1463  fromeof:
1464   /* We have reached the end of the input.  */
1465   if (to > tostart)
1466     last_char = to[-1];
1467   return to - tostart;
1468 
1469  tofull:
1470   /* The output buffer is full.  Save any input we have not yet
1471      processed.  */
1472   if (fromend > from)
1473     {
1474       saved_input = from;
1475       saved_input_len = fromend - from;
1476     }
1477   else
1478     saved_input = NULL;
1479 
1480   if (to > tostart)
1481     last_char = to[-1];
1482   return to - tostart;
1483 }
1484