1 /* This is the Assembler Pre-Processor 2 Copyright (C) 1987-2020 Free Software Foundation, Inc. 3 4 This file is part of GAS, the GNU Assembler. 5 6 GAS is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 3, or (at your option) 9 any later version. 10 11 GAS is distributed in the hope that it will be useful, but WITHOUT 12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 13 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public 14 License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with GAS; see the file COPYING. If not, write to the Free 18 Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA 19 02110-1301, USA. */ 20 21 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90. */ 22 /* App, the assembler pre-processor. This pre-processor strips out 23 excess spaces, turns single-quoted characters into a decimal 24 constant, and turns the # in # <number> <filename> <garbage> into a 25 .linefile. This needs better error-handling. */ 26 27 #include "as.h" 28 29 #if (__STDC__ != 1) 30 #ifndef const 31 #define const /* empty */ 32 #endif 33 #endif 34 35 #ifdef H_TICK_HEX 36 int enable_h_tick_hex = 0; 37 #endif 38 39 #ifdef TC_M68K 40 /* Whether we are scrubbing in m68k MRI mode. This is different from 41 flag_m68k_mri, because the two flags will be affected by the .mri 42 pseudo-op at different times. */ 43 static int scrub_m68k_mri; 44 45 /* The pseudo-op which switches in and out of MRI mode. See the 46 comment in do_scrub_chars. */ 47 static const char mri_pseudo[] = ".mri 0"; 48 #else 49 #define scrub_m68k_mri 0 50 #endif 51 52 #if defined TC_ARM && defined OBJ_ELF 53 /* The pseudo-op for which we need to special-case `@' characters. 54 See the comment in do_scrub_chars. */ 55 static const char symver_pseudo[] = ".symver"; 56 static const char * symver_state; 57 #endif 58 #ifdef TC_ARM 59 static char last_char; 60 #endif 61 62 static char lex[256]; 63 static const char symbol_chars[] = 64 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; 65 66 #define LEX_IS_SYMBOL_COMPONENT 1 67 #define LEX_IS_WHITESPACE 2 68 #define LEX_IS_LINE_SEPARATOR 3 69 #define LEX_IS_COMMENT_START 4 70 #define LEX_IS_LINE_COMMENT_START 5 71 #define LEX_IS_TWOCHAR_COMMENT_1ST 6 72 #define LEX_IS_STRINGQUOTE 8 73 #define LEX_IS_COLON 9 74 #define LEX_IS_NEWLINE 10 75 #define LEX_IS_ONECHAR_QUOTE 11 76 #ifdef TC_V850 77 #define LEX_IS_DOUBLEDASH_1ST 12 78 #endif 79 #ifdef TC_M32R 80 #define DOUBLEBAR_PARALLEL 81 #endif 82 #ifdef DOUBLEBAR_PARALLEL 83 #define LEX_IS_DOUBLEBAR_1ST 13 84 #endif 85 #define LEX_IS_PARALLEL_SEPARATOR 14 86 #ifdef H_TICK_HEX 87 #define LEX_IS_H 15 88 #endif 89 #define IS_SYMBOL_COMPONENT(c) (lex[c] == LEX_IS_SYMBOL_COMPONENT) 90 #define IS_WHITESPACE(c) (lex[c] == LEX_IS_WHITESPACE) 91 #define IS_LINE_SEPARATOR(c) (lex[c] == LEX_IS_LINE_SEPARATOR) 92 #define IS_PARALLEL_SEPARATOR(c) (lex[c] == LEX_IS_PARALLEL_SEPARATOR) 93 #define IS_COMMENT(c) (lex[c] == LEX_IS_COMMENT_START) 94 #define IS_LINE_COMMENT(c) (lex[c] == LEX_IS_LINE_COMMENT_START) 95 #define IS_NEWLINE(c) (lex[c] == LEX_IS_NEWLINE) 96 97 static int process_escape (int); 98 99 /* FIXME-soon: The entire lexer/parser thingy should be 100 built statically at compile time rather than dynamically 101 each and every time the assembler is run. xoxorich. */ 102 103 void 104 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED) 105 { 106 const char *p; 107 int c; 108 109 lex[' '] = LEX_IS_WHITESPACE; 110 lex['\t'] = LEX_IS_WHITESPACE; 111 lex['\r'] = LEX_IS_WHITESPACE; 112 lex['\n'] = LEX_IS_NEWLINE; 113 lex[':'] = LEX_IS_COLON; 114 115 #ifdef TC_M68K 116 scrub_m68k_mri = m68k_mri; 117 118 if (! m68k_mri) 119 #endif 120 { 121 lex['"'] = LEX_IS_STRINGQUOTE; 122 123 #if ! defined (TC_HPPA) 124 lex['\''] = LEX_IS_ONECHAR_QUOTE; 125 #endif 126 127 #ifdef SINGLE_QUOTE_STRINGS 128 lex['\''] = LEX_IS_STRINGQUOTE; 129 #endif 130 } 131 132 /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop 133 in state 5 of do_scrub_chars must be changed. */ 134 135 /* Note that these override the previous defaults, e.g. if ';' is a 136 comment char, then it isn't a line separator. */ 137 for (p = symbol_chars; *p; ++p) 138 lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT; 139 140 for (c = 128; c < 256; ++c) 141 lex[c] = LEX_IS_SYMBOL_COMPONENT; 142 143 #ifdef tc_symbol_chars 144 /* This macro permits the processor to specify all characters which 145 may appears in an operand. This will prevent the scrubber from 146 discarding meaningful whitespace in certain cases. The i386 147 backend uses this to support prefixes, which can confuse the 148 scrubber as to whether it is parsing operands or opcodes. */ 149 for (p = tc_symbol_chars; *p; ++p) 150 lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT; 151 #endif 152 153 /* The m68k backend wants to be able to change comment_chars. */ 154 #ifndef tc_comment_chars 155 #define tc_comment_chars comment_chars 156 #endif 157 for (p = tc_comment_chars; *p; p++) 158 lex[(unsigned char) *p] = LEX_IS_COMMENT_START; 159 160 for (p = line_comment_chars; *p; p++) 161 lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START; 162 163 #ifndef tc_line_separator_chars 164 #define tc_line_separator_chars line_separator_chars 165 #endif 166 for (p = tc_line_separator_chars; *p; p++) 167 lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR; 168 169 #ifdef tc_parallel_separator_chars 170 /* This macro permits the processor to specify all characters which 171 separate parallel insns on the same line. */ 172 for (p = tc_parallel_separator_chars; *p; p++) 173 lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR; 174 #endif 175 176 /* Only allow slash-star comments if slash is not in use. 177 FIXME: This isn't right. We should always permit them. */ 178 if (lex['/'] == 0) 179 lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST; 180 181 #ifdef TC_M68K 182 if (m68k_mri) 183 { 184 lex['\''] = LEX_IS_STRINGQUOTE; 185 lex[';'] = LEX_IS_COMMENT_START; 186 lex['*'] = LEX_IS_LINE_COMMENT_START; 187 /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but 188 then it can't be used in an expression. */ 189 lex['!'] = LEX_IS_LINE_COMMENT_START; 190 } 191 #endif 192 193 #ifdef TC_V850 194 lex['-'] = LEX_IS_DOUBLEDASH_1ST; 195 #endif 196 #ifdef DOUBLEBAR_PARALLEL 197 lex['|'] = LEX_IS_DOUBLEBAR_1ST; 198 #endif 199 #ifdef TC_D30V 200 /* Must do this is we want VLIW instruction with "->" or "<-". */ 201 lex['-'] = LEX_IS_SYMBOL_COMPONENT; 202 #endif 203 204 #ifdef H_TICK_HEX 205 if (enable_h_tick_hex) 206 { 207 lex['h'] = LEX_IS_H; 208 lex['H'] = LEX_IS_H; 209 } 210 #endif 211 } 212 213 /* Saved state of the scrubber. */ 214 static int state; 215 static int old_state; 216 static const char *out_string; 217 static char out_buf[20]; 218 static int add_newlines; 219 static char *saved_input; 220 static size_t saved_input_len; 221 static char input_buffer[32 * 1024]; 222 static const char *mri_state; 223 static char mri_last_ch; 224 225 /* Data structure for saving the state of app across #include's. Note that 226 app is called asynchronously to the parsing of the .include's, so our 227 state at the time .include is interpreted is completely unrelated. 228 That's why we have to save it all. */ 229 230 struct app_save 231 { 232 int state; 233 int old_state; 234 const char * out_string; 235 char out_buf[sizeof (out_buf)]; 236 int add_newlines; 237 char * saved_input; 238 size_t saved_input_len; 239 #ifdef TC_M68K 240 int scrub_m68k_mri; 241 #endif 242 const char * mri_state; 243 char mri_last_ch; 244 #if defined TC_ARM && defined OBJ_ELF 245 const char * symver_state; 246 #endif 247 #ifdef TC_ARM 248 char last_char; 249 #endif 250 }; 251 252 char * 253 app_push (void) 254 { 255 struct app_save *saved; 256 257 saved = XNEW (struct app_save); 258 saved->state = state; 259 saved->old_state = old_state; 260 saved->out_string = out_string; 261 memcpy (saved->out_buf, out_buf, sizeof (out_buf)); 262 saved->add_newlines = add_newlines; 263 if (saved_input == NULL) 264 saved->saved_input = NULL; 265 else 266 { 267 saved->saved_input = XNEWVEC (char, saved_input_len); 268 memcpy (saved->saved_input, saved_input, saved_input_len); 269 saved->saved_input_len = saved_input_len; 270 } 271 #ifdef TC_M68K 272 saved->scrub_m68k_mri = scrub_m68k_mri; 273 #endif 274 saved->mri_state = mri_state; 275 saved->mri_last_ch = mri_last_ch; 276 #if defined TC_ARM && defined OBJ_ELF 277 saved->symver_state = symver_state; 278 #endif 279 #ifdef TC_ARM 280 saved->last_char = last_char; 281 #endif 282 283 /* do_scrub_begin() is not useful, just wastes time. */ 284 285 state = 0; 286 saved_input = NULL; 287 add_newlines = 0; 288 289 return (char *) saved; 290 } 291 292 void 293 app_pop (char *arg) 294 { 295 struct app_save *saved = (struct app_save *) arg; 296 297 /* There is no do_scrub_end (). */ 298 state = saved->state; 299 old_state = saved->old_state; 300 out_string = saved->out_string; 301 memcpy (out_buf, saved->out_buf, sizeof (out_buf)); 302 add_newlines = saved->add_newlines; 303 if (saved->saved_input == NULL) 304 saved_input = NULL; 305 else 306 { 307 gas_assert (saved->saved_input_len <= sizeof (input_buffer)); 308 memcpy (input_buffer, saved->saved_input, saved->saved_input_len); 309 saved_input = input_buffer; 310 saved_input_len = saved->saved_input_len; 311 free (saved->saved_input); 312 } 313 #ifdef TC_M68K 314 scrub_m68k_mri = saved->scrub_m68k_mri; 315 #endif 316 mri_state = saved->mri_state; 317 mri_last_ch = saved->mri_last_ch; 318 #if defined TC_ARM && defined OBJ_ELF 319 symver_state = saved->symver_state; 320 #endif 321 #ifdef TC_ARM 322 last_char = saved->last_char; 323 #endif 324 325 free (arg); 326 } 327 328 /* @@ This assumes that \n &c are the same on host and target. This is not 329 necessarily true. */ 330 331 static int 332 process_escape (int ch) 333 { 334 switch (ch) 335 { 336 case 'b': 337 return '\b'; 338 case 'f': 339 return '\f'; 340 case 'n': 341 return '\n'; 342 case 'r': 343 return '\r'; 344 case 't': 345 return '\t'; 346 case '\'': 347 return '\''; 348 case '"': 349 return '\"'; 350 default: 351 return ch; 352 } 353 } 354 355 /* This function is called to process input characters. The GET 356 parameter is used to retrieve more input characters. GET should 357 set its parameter to point to a buffer, and return the length of 358 the buffer; it should return 0 at end of file. The scrubbed output 359 characters are put into the buffer starting at TOSTART; the TOSTART 360 buffer is TOLEN bytes in length. The function returns the number 361 of scrubbed characters put into TOSTART. This will be TOLEN unless 362 end of file was seen. This function is arranged as a state 363 machine, and saves its state so that it may return at any point. 364 This is the way the old code used to work. */ 365 366 size_t 367 do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen) 368 { 369 char *to = tostart; 370 char *toend = tostart + tolen; 371 char *from; 372 char *fromend; 373 size_t fromlen; 374 int ch, ch2 = 0; 375 /* Character that started the string we're working on. */ 376 static char quotechar; 377 378 /*State 0: beginning of normal line 379 1: After first whitespace on line (flush more white) 380 2: After first non-white (opcode) on line (keep 1white) 381 3: after second white on line (into operands) (flush white) 382 4: after putting out a .linefile, put out digits 383 5: parsing a string, then go to old-state 384 6: putting out \ escape in a "d string. 385 7: no longer used 386 8: no longer used 387 9: After seeing symbol char in state 3 (keep 1white after symchar) 388 10: After seeing whitespace in state 9 (keep white before symchar) 389 11: After seeing a symbol character in state 0 (eg a label definition) 390 -1: output string in out_string and go to the state in old_state 391 -2: flush text until a '*' '/' is seen, then go to state old_state 392 #ifdef TC_V850 393 12: After seeing a dash, looking for a second dash as a start 394 of comment. 395 #endif 396 #ifdef DOUBLEBAR_PARALLEL 397 13: After seeing a vertical bar, looking for a second 398 vertical bar as a parallel expression separator. 399 #endif 400 #ifdef TC_PREDICATE_START_CHAR 401 14: After seeing a predicate start character at state 0, looking 402 for a predicate end character as predicate. 403 15: After seeing a predicate start character at state 1, looking 404 for a predicate end character as predicate. 405 #endif 406 #ifdef TC_Z80 407 16: After seeing an 'a' or an 'A' at the start of a symbol 408 17: After seeing an 'f' or an 'F' in state 16 409 #endif 410 */ 411 412 /* I added states 9 and 10 because the MIPS ECOFF assembler uses 413 constructs like ``.loc 1 20''. This was turning into ``.loc 414 120''. States 9 and 10 ensure that a space is never dropped in 415 between characters which could appear in an identifier. Ian 416 Taylor, ian@cygnus.com. 417 418 I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works 419 correctly on the PA (and any other target where colons are optional). 420 Jeff Law, law@cs.utah.edu. 421 422 I added state 13 so that something like "cmp r1, r2 || trap #1" does not 423 get squashed into "cmp r1,r2||trap#1", with the all important space 424 between the 'trap' and the '#1' being eliminated. nickc@cygnus.com */ 425 426 /* This macro gets the next input character. */ 427 428 #define GET() \ 429 (from < fromend \ 430 ? * (unsigned char *) (from++) \ 431 : (saved_input = NULL, \ 432 fromlen = (*get) (input_buffer, sizeof input_buffer), \ 433 from = input_buffer, \ 434 fromend = from + fromlen, \ 435 (fromlen == 0 \ 436 ? EOF \ 437 : * (unsigned char *) (from++)))) 438 439 /* This macro pushes a character back on the input stream. */ 440 441 #define UNGET(uch) (*--from = (uch)) 442 443 /* This macro puts a character into the output buffer. If this 444 character fills the output buffer, this macro jumps to the label 445 TOFULL. We use this rather ugly approach because we need to 446 handle two different termination conditions: EOF on the input 447 stream, and a full output buffer. It would be simpler if we 448 always read in the entire input stream before processing it, but 449 I don't want to make such a significant change to the assembler's 450 memory usage. */ 451 452 #define PUT(pch) \ 453 do \ 454 { \ 455 *to++ = (pch); \ 456 if (to >= toend) \ 457 goto tofull; \ 458 } \ 459 while (0) 460 461 if (saved_input != NULL) 462 { 463 from = saved_input; 464 fromend = from + saved_input_len; 465 } 466 else 467 { 468 fromlen = (*get) (input_buffer, sizeof input_buffer); 469 if (fromlen == 0) 470 return 0; 471 from = input_buffer; 472 fromend = from + fromlen; 473 } 474 475 while (1) 476 { 477 /* The cases in this switch end with continue, in order to 478 branch back to the top of this while loop and generate the 479 next output character in the appropriate state. */ 480 switch (state) 481 { 482 case -1: 483 ch = *out_string++; 484 if (*out_string == '\0') 485 { 486 state = old_state; 487 old_state = 3; 488 } 489 PUT (ch); 490 continue; 491 492 case -2: 493 for (;;) 494 { 495 do 496 { 497 ch = GET (); 498 499 if (ch == EOF) 500 { 501 as_warn (_("end of file in comment")); 502 goto fromeof; 503 } 504 505 if (ch == '\n') 506 PUT ('\n'); 507 } 508 while (ch != '*'); 509 510 while ((ch = GET ()) == '*') 511 ; 512 513 if (ch == EOF) 514 { 515 as_warn (_("end of file in comment")); 516 goto fromeof; 517 } 518 519 if (ch == '/') 520 break; 521 522 UNGET (ch); 523 } 524 525 state = old_state; 526 UNGET (' '); 527 continue; 528 529 case 4: 530 ch = GET (); 531 if (ch == EOF) 532 goto fromeof; 533 else if (ch >= '0' && ch <= '9') 534 PUT (ch); 535 else 536 { 537 while (ch != EOF && IS_WHITESPACE (ch)) 538 ch = GET (); 539 if (ch == '"') 540 { 541 quotechar = ch; 542 state = 5; 543 old_state = 3; 544 PUT (ch); 545 } 546 else 547 { 548 while (ch != EOF && ch != '\n') 549 ch = GET (); 550 state = 0; 551 PUT (ch); 552 } 553 } 554 continue; 555 556 case 5: 557 /* We are going to copy everything up to a quote character, 558 with special handling for a backslash. We try to 559 optimize the copying in the simple case without using the 560 GET and PUT macros. */ 561 { 562 char *s; 563 ptrdiff_t len; 564 565 for (s = from; s < fromend; s++) 566 { 567 ch = *s; 568 if (ch == '\\' 569 || ch == quotechar 570 || ch == '\n') 571 break; 572 } 573 len = s - from; 574 if (len > toend - to) 575 len = toend - to; 576 if (len > 0) 577 { 578 memcpy (to, from, len); 579 to += len; 580 from += len; 581 if (to >= toend) 582 goto tofull; 583 } 584 } 585 586 ch = GET (); 587 if (ch == EOF) 588 { 589 /* This buffer is here specifically so 590 that the UNGET below will work. */ 591 static char one_char_buf[1]; 592 593 as_warn (_("end of file in string; '%c' inserted"), quotechar); 594 state = old_state; 595 from = fromend = one_char_buf + 1; 596 fromlen = 1; 597 UNGET ('\n'); 598 PUT (quotechar); 599 } 600 else if (ch == quotechar) 601 { 602 state = old_state; 603 PUT (ch); 604 } 605 else if (TC_STRING_ESCAPES && ch == '\\') 606 { 607 state = 6; 608 PUT (ch); 609 } 610 else if (scrub_m68k_mri && ch == '\n') 611 { 612 /* Just quietly terminate the string. This permits lines like 613 bne label loop if we haven't reach end yet. */ 614 state = old_state; 615 UNGET (ch); 616 PUT ('\''); 617 } 618 else 619 { 620 PUT (ch); 621 } 622 continue; 623 624 case 6: 625 state = 5; 626 ch = GET (); 627 switch (ch) 628 { 629 /* Handle strings broken across lines, by turning '\n' into 630 '\\' and 'n'. */ 631 case '\n': 632 UNGET ('n'); 633 add_newlines++; 634 PUT ('\\'); 635 continue; 636 637 case EOF: 638 as_warn (_("end of file in string; '%c' inserted"), quotechar); 639 PUT (quotechar); 640 continue; 641 642 case '"': 643 case '\\': 644 case 'b': 645 case 'f': 646 case 'n': 647 case 'r': 648 case 't': 649 case 'v': 650 case 'x': 651 case 'X': 652 case '0': 653 case '1': 654 case '2': 655 case '3': 656 case '4': 657 case '5': 658 case '6': 659 case '7': 660 break; 661 662 default: 663 #ifdef ONLY_STANDARD_ESCAPES 664 as_warn (_("unknown escape '\\%c' in string; ignored"), ch); 665 #endif 666 break; 667 } 668 PUT (ch); 669 continue; 670 671 #ifdef DOUBLEBAR_PARALLEL 672 case 13: 673 ch = GET (); 674 if (ch != '|') 675 abort (); 676 677 /* Reset back to state 1 and pretend that we are parsing a 678 line from just after the first white space. */ 679 state = 1; 680 PUT ('|'); 681 #ifdef TC_TIC6X 682 /* "||^" is used for SPMASKed instructions. */ 683 ch = GET (); 684 if (ch == EOF) 685 goto fromeof; 686 else if (ch == '^') 687 PUT ('^'); 688 else 689 UNGET (ch); 690 #endif 691 continue; 692 #endif 693 #ifdef TC_Z80 694 case 16: 695 /* We have seen an 'a' at the start of a symbol, look for an 'f'. */ 696 ch = GET (); 697 if (ch == 'f' || ch == 'F') 698 { 699 state = 17; 700 PUT (ch); 701 } 702 else 703 { 704 state = 9; 705 break; 706 } 707 /* Fall through. */ 708 case 17: 709 /* We have seen "af" at the start of a symbol, 710 a ' here is a part of that symbol. */ 711 ch = GET (); 712 state = 9; 713 if (ch == '\'') 714 /* Change to avoid warning about unclosed string. */ 715 PUT ('`'); 716 else if (ch != EOF) 717 UNGET (ch); 718 break; 719 #endif 720 } 721 722 /* OK, we are somewhere in states 0 through 4 or 9 through 11. */ 723 724 /* flushchar: */ 725 ch = GET (); 726 727 #ifdef TC_PREDICATE_START_CHAR 728 if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1)) 729 { 730 state += 14; 731 PUT (ch); 732 continue; 733 } 734 else if (state == 14 || state == 15) 735 { 736 if (ch == TC_PREDICATE_END_CHAR) 737 { 738 state -= 14; 739 PUT (ch); 740 ch = GET (); 741 } 742 else 743 { 744 PUT (ch); 745 continue; 746 } 747 } 748 #endif 749 750 recycle: 751 752 #if defined TC_ARM && defined OBJ_ELF 753 /* We need to watch out for .symver directives. See the comment later 754 in this function. */ 755 if (symver_state == NULL) 756 { 757 if ((state == 0 || state == 1) && ch == symver_pseudo[0]) 758 symver_state = symver_pseudo + 1; 759 } 760 else 761 { 762 /* We advance to the next state if we find the right 763 character. */ 764 if (ch != '\0' && (*symver_state == ch)) 765 ++symver_state; 766 else if (*symver_state != '\0') 767 /* We did not get the expected character, or we didn't 768 get a valid terminating character after seeing the 769 entire pseudo-op, so we must go back to the beginning. */ 770 symver_state = NULL; 771 else 772 { 773 /* We've read the entire pseudo-op. If this is the end 774 of the line, go back to the beginning. */ 775 if (IS_NEWLINE (ch)) 776 symver_state = NULL; 777 } 778 } 779 #endif /* TC_ARM && OBJ_ELF */ 780 781 #ifdef TC_M68K 782 /* We want to have pseudo-ops which control whether we are in 783 MRI mode or not. Unfortunately, since m68k MRI mode affects 784 the scrubber, that means that we need a special purpose 785 recognizer here. */ 786 if (mri_state == NULL) 787 { 788 if ((state == 0 || state == 1) 789 && ch == mri_pseudo[0]) 790 mri_state = mri_pseudo + 1; 791 } 792 else 793 { 794 /* We advance to the next state if we find the right 795 character, or if we need a space character and we get any 796 whitespace character, or if we need a '0' and we get a 797 '1' (this is so that we only need one state to handle 798 ``.mri 0'' and ``.mri 1''). */ 799 if (ch != '\0' 800 && (*mri_state == ch 801 || (*mri_state == ' ' 802 && lex[ch] == LEX_IS_WHITESPACE) 803 || (*mri_state == '0' 804 && ch == '1'))) 805 { 806 mri_last_ch = ch; 807 ++mri_state; 808 } 809 else if (*mri_state != '\0' 810 || (lex[ch] != LEX_IS_WHITESPACE 811 && lex[ch] != LEX_IS_NEWLINE)) 812 { 813 /* We did not get the expected character, or we didn't 814 get a valid terminating character after seeing the 815 entire pseudo-op, so we must go back to the 816 beginning. */ 817 mri_state = NULL; 818 } 819 else 820 { 821 /* We've read the entire pseudo-op. mips_last_ch is 822 either '0' or '1' indicating whether to enter or 823 leave MRI mode. */ 824 do_scrub_begin (mri_last_ch == '1'); 825 mri_state = NULL; 826 827 /* We continue handling the character as usual. The 828 main gas reader must also handle the .mri pseudo-op 829 to control expression parsing and the like. */ 830 } 831 } 832 #endif 833 834 if (ch == EOF) 835 { 836 if (state != 0) 837 { 838 as_warn (_("end of file not at end of a line; newline inserted")); 839 state = 0; 840 PUT ('\n'); 841 } 842 goto fromeof; 843 } 844 845 switch (lex[ch]) 846 { 847 case LEX_IS_WHITESPACE: 848 do 849 { 850 ch = GET (); 851 } 852 while (ch != EOF && IS_WHITESPACE (ch)); 853 if (ch == EOF) 854 goto fromeof; 855 856 if (state == 0) 857 { 858 /* Preserve a single whitespace character at the 859 beginning of a line. */ 860 state = 1; 861 UNGET (ch); 862 PUT (' '); 863 break; 864 } 865 866 #ifdef KEEP_WHITE_AROUND_COLON 867 if (lex[ch] == LEX_IS_COLON) 868 { 869 /* Only keep this white if there's no white *after* the 870 colon. */ 871 ch2 = GET (); 872 if (ch2 != EOF) 873 UNGET (ch2); 874 if (!IS_WHITESPACE (ch2)) 875 { 876 state = 9; 877 UNGET (ch); 878 PUT (' '); 879 break; 880 } 881 } 882 #endif 883 if (IS_COMMENT (ch) 884 || ch == '/' 885 || IS_LINE_SEPARATOR (ch) 886 || IS_PARALLEL_SEPARATOR (ch)) 887 { 888 if (scrub_m68k_mri) 889 { 890 /* In MRI mode, we keep these spaces. */ 891 UNGET (ch); 892 PUT (' '); 893 break; 894 } 895 goto recycle; 896 } 897 898 /* If we're in state 2 or 11, we've seen a non-white 899 character followed by whitespace. If the next character 900 is ':', this is whitespace after a label name which we 901 normally must ignore. In MRI mode, though, spaces are 902 not permitted between the label and the colon. */ 903 if ((state == 2 || state == 11) 904 && lex[ch] == LEX_IS_COLON 905 && ! scrub_m68k_mri) 906 { 907 state = 1; 908 PUT (ch); 909 break; 910 } 911 912 switch (state) 913 { 914 case 1: 915 /* We can arrive here if we leave a leading whitespace 916 character at the beginning of a line. */ 917 goto recycle; 918 case 2: 919 state = 3; 920 if (to + 1 < toend) 921 { 922 /* Optimize common case by skipping UNGET/GET. */ 923 PUT (' '); /* Sp after opco */ 924 goto recycle; 925 } 926 UNGET (ch); 927 PUT (' '); 928 break; 929 case 3: 930 #ifndef TC_KEEP_OPERAND_SPACES 931 /* For TI C6X, we keep these spaces as they may separate 932 functional unit specifiers from operands. */ 933 if (scrub_m68k_mri) 934 #endif 935 { 936 /* In MRI mode, we keep these spaces. */ 937 UNGET (ch); 938 PUT (' '); 939 break; 940 } 941 goto recycle; /* Sp in operands */ 942 case 9: 943 case 10: 944 #ifndef TC_KEEP_OPERAND_SPACES 945 if (scrub_m68k_mri) 946 #endif 947 { 948 /* In MRI mode, we keep these spaces. */ 949 state = 3; 950 UNGET (ch); 951 PUT (' '); 952 break; 953 } 954 state = 10; /* Sp after symbol char */ 955 goto recycle; 956 case 11: 957 if (LABELS_WITHOUT_COLONS || flag_m68k_mri) 958 state = 1; 959 else 960 { 961 /* We know that ch is not ':', since we tested that 962 case above. Therefore this is not a label, so it 963 must be the opcode, and we've just seen the 964 whitespace after it. */ 965 state = 3; 966 } 967 UNGET (ch); 968 PUT (' '); /* Sp after label definition. */ 969 break; 970 default: 971 BAD_CASE (state); 972 } 973 break; 974 975 case LEX_IS_TWOCHAR_COMMENT_1ST: 976 ch2 = GET (); 977 if (ch2 == '*') 978 { 979 for (;;) 980 { 981 do 982 { 983 ch2 = GET (); 984 if (ch2 != EOF && IS_NEWLINE (ch2)) 985 add_newlines++; 986 } 987 while (ch2 != EOF && ch2 != '*'); 988 989 while (ch2 == '*') 990 ch2 = GET (); 991 992 if (ch2 == EOF || ch2 == '/') 993 break; 994 995 /* This UNGET will ensure that we count newlines 996 correctly. */ 997 UNGET (ch2); 998 } 999 1000 if (ch2 == EOF) 1001 as_warn (_("end of file in multiline comment")); 1002 1003 ch = ' '; 1004 goto recycle; 1005 } 1006 #ifdef DOUBLESLASH_LINE_COMMENTS 1007 else if (ch2 == '/') 1008 { 1009 do 1010 { 1011 ch = GET (); 1012 } 1013 while (ch != EOF && !IS_NEWLINE (ch)); 1014 if (ch == EOF) 1015 as_warn ("end of file in comment; newline inserted"); 1016 state = 0; 1017 PUT ('\n'); 1018 break; 1019 } 1020 #endif 1021 else 1022 { 1023 if (ch2 != EOF) 1024 UNGET (ch2); 1025 if (state == 9 || state == 10) 1026 state = 3; 1027 PUT (ch); 1028 } 1029 break; 1030 1031 case LEX_IS_STRINGQUOTE: 1032 quotechar = ch; 1033 if (state == 10) 1034 { 1035 /* Preserve the whitespace in foo "bar". */ 1036 UNGET (ch); 1037 state = 3; 1038 PUT (' '); 1039 1040 /* PUT didn't jump out. We could just break, but we 1041 know what will happen, so optimize a bit. */ 1042 ch = GET (); 1043 old_state = 3; 1044 } 1045 else if (state == 9) 1046 old_state = 3; 1047 else 1048 old_state = state; 1049 state = 5; 1050 PUT (ch); 1051 break; 1052 1053 case LEX_IS_ONECHAR_QUOTE: 1054 #ifdef H_TICK_HEX 1055 if (state == 9 && enable_h_tick_hex) 1056 { 1057 char c; 1058 1059 c = GET (); 1060 as_warn ("'%c found after symbol", c); 1061 UNGET (c); 1062 } 1063 #endif 1064 if (state == 10) 1065 { 1066 /* Preserve the whitespace in foo 'b'. */ 1067 UNGET (ch); 1068 state = 3; 1069 PUT (' '); 1070 break; 1071 } 1072 ch = GET (); 1073 if (ch == EOF) 1074 { 1075 as_warn (_("end of file after a one-character quote; \\0 inserted")); 1076 ch = 0; 1077 } 1078 if (ch == '\\') 1079 { 1080 ch = GET (); 1081 if (ch == EOF) 1082 { 1083 as_warn (_("end of file in escape character")); 1084 ch = '\\'; 1085 } 1086 else 1087 ch = process_escape (ch); 1088 } 1089 sprintf (out_buf, "%d", (int) (unsigned char) ch); 1090 1091 /* None of these 'x constants for us. We want 'x'. */ 1092 if ((ch = GET ()) != '\'') 1093 { 1094 #ifdef REQUIRE_CHAR_CLOSE_QUOTE 1095 as_warn (_("missing close quote; (assumed)")); 1096 #else 1097 if (ch != EOF) 1098 UNGET (ch); 1099 #endif 1100 } 1101 if (strlen (out_buf) == 1) 1102 { 1103 PUT (out_buf[0]); 1104 break; 1105 } 1106 if (state == 9) 1107 old_state = 3; 1108 else 1109 old_state = state; 1110 state = -1; 1111 out_string = out_buf; 1112 PUT (*out_string++); 1113 break; 1114 1115 case LEX_IS_COLON: 1116 #ifdef KEEP_WHITE_AROUND_COLON 1117 state = 9; 1118 #else 1119 if (state == 9 || state == 10) 1120 state = 3; 1121 else if (state != 3) 1122 state = 1; 1123 #endif 1124 PUT (ch); 1125 break; 1126 1127 case LEX_IS_NEWLINE: 1128 /* Roll out a bunch of newlines from inside comments, etc. */ 1129 if (add_newlines) 1130 { 1131 --add_newlines; 1132 UNGET (ch); 1133 } 1134 /* Fall through. */ 1135 1136 case LEX_IS_LINE_SEPARATOR: 1137 state = 0; 1138 PUT (ch); 1139 break; 1140 1141 case LEX_IS_PARALLEL_SEPARATOR: 1142 state = 1; 1143 PUT (ch); 1144 break; 1145 1146 #ifdef TC_V850 1147 case LEX_IS_DOUBLEDASH_1ST: 1148 ch2 = GET (); 1149 if (ch2 != '-') 1150 { 1151 if (ch2 != EOF) 1152 UNGET (ch2); 1153 goto de_fault; 1154 } 1155 /* Read and skip to end of line. */ 1156 do 1157 { 1158 ch = GET (); 1159 } 1160 while (ch != EOF && ch != '\n'); 1161 1162 if (ch == EOF) 1163 as_warn (_("end of file in comment; newline inserted")); 1164 1165 state = 0; 1166 PUT ('\n'); 1167 break; 1168 #endif 1169 #ifdef DOUBLEBAR_PARALLEL 1170 case LEX_IS_DOUBLEBAR_1ST: 1171 ch2 = GET (); 1172 if (ch2 != EOF) 1173 UNGET (ch2); 1174 if (ch2 != '|') 1175 goto de_fault; 1176 1177 /* Handle '||' in two states as invoking PUT twice might 1178 result in the first one jumping out of this loop. We'd 1179 then lose track of the state and one '|' char. */ 1180 state = 13; 1181 PUT ('|'); 1182 break; 1183 #endif 1184 case LEX_IS_LINE_COMMENT_START: 1185 /* FIXME-someday: The two character comment stuff was badly 1186 thought out. On i386, we want '/' as line comment start 1187 AND we want C style comments. hence this hack. The 1188 whole lexical process should be reworked. xoxorich. */ 1189 if (ch == '/') 1190 { 1191 ch2 = GET (); 1192 if (ch2 == '*') 1193 { 1194 old_state = 3; 1195 state = -2; 1196 break; 1197 } 1198 else if (ch2 != EOF) 1199 { 1200 UNGET (ch2); 1201 } 1202 } 1203 1204 if (state == 0 || state == 1) /* Only comment at start of line. */ 1205 { 1206 int startch; 1207 1208 startch = ch; 1209 1210 do 1211 { 1212 ch = GET (); 1213 } 1214 while (ch != EOF && IS_WHITESPACE (ch)); 1215 1216 if (ch == EOF) 1217 { 1218 as_warn (_("end of file in comment; newline inserted")); 1219 PUT ('\n'); 1220 break; 1221 } 1222 1223 if (ch < '0' || ch > '9' || state != 0 || startch != '#') 1224 { 1225 /* Not a cpp line. */ 1226 while (ch != EOF && !IS_NEWLINE (ch)) 1227 ch = GET (); 1228 if (ch == EOF) 1229 { 1230 as_warn (_("end of file in comment; newline inserted")); 1231 PUT ('\n'); 1232 } 1233 else /* IS_NEWLINE (ch) */ 1234 { 1235 /* To process non-zero add_newlines. */ 1236 UNGET (ch); 1237 } 1238 state = 0; 1239 break; 1240 } 1241 /* Looks like `# 123 "filename"' from cpp. */ 1242 UNGET (ch); 1243 old_state = 4; 1244 state = -1; 1245 if (scrub_m68k_mri) 1246 out_string = "\tlinefile "; 1247 else 1248 out_string = "\t.linefile "; 1249 PUT (*out_string++); 1250 break; 1251 } 1252 1253 #ifdef TC_D10V 1254 /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true. 1255 Trap is the only short insn that has a first operand that is 1256 neither register nor label. 1257 We must prevent exef0f ||trap #1 to degenerate to exef0f ||trap#1 . 1258 We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is 1259 already LEX_IS_LINE_COMMENT_START. However, it is the 1260 only character in line_comment_chars for d10v, hence we 1261 can recognize it as such. */ 1262 /* An alternative approach would be to reset the state to 1 when 1263 we see '||', '<'- or '->', but that seems to be overkill. */ 1264 if (state == 10) 1265 PUT (' '); 1266 #endif 1267 /* We have a line comment character which is not at the 1268 start of a line. If this is also a normal comment 1269 character, fall through. Otherwise treat it as a default 1270 character. */ 1271 if (strchr (tc_comment_chars, ch) == NULL 1272 && (! scrub_m68k_mri 1273 || (ch != '!' && ch != '*'))) 1274 goto de_fault; 1275 if (scrub_m68k_mri 1276 && (ch == '!' || ch == '*' || ch == '#') 1277 && state != 1 1278 && state != 10) 1279 goto de_fault; 1280 /* Fall through. */ 1281 case LEX_IS_COMMENT_START: 1282 #if defined TC_ARM && defined OBJ_ELF 1283 /* On the ARM, `@' is the comment character. 1284 Unfortunately this is also a special character in ELF .symver 1285 directives (and .type, though we deal with those another way). 1286 So we check if this line is such a directive, and treat 1287 the character as default if so. This is a hack. */ 1288 if ((symver_state != NULL) && (*symver_state == 0)) 1289 goto de_fault; 1290 #endif 1291 1292 #ifdef TC_ARM 1293 /* For the ARM, care is needed not to damage occurrences of \@ 1294 by stripping the @ onwards. Yuck. */ 1295 if ((to > tostart ? to[-1] : last_char) == '\\') 1296 /* Do not treat the @ as a start-of-comment. */ 1297 goto de_fault; 1298 #endif 1299 1300 #ifdef WARN_COMMENTS 1301 if (!found_comment) 1302 found_comment_file = as_where (&found_comment); 1303 #endif 1304 do 1305 { 1306 ch = GET (); 1307 } 1308 while (ch != EOF && !IS_NEWLINE (ch)); 1309 if (ch == EOF) 1310 as_warn (_("end of file in comment; newline inserted")); 1311 state = 0; 1312 PUT ('\n'); 1313 break; 1314 1315 #ifdef H_TICK_HEX 1316 case LEX_IS_H: 1317 /* Look for strings like H'[0-9A-Fa-f] and if found, replace 1318 the H' with 0x to make them gas-style hex characters. */ 1319 if (enable_h_tick_hex) 1320 { 1321 char quot; 1322 1323 quot = GET (); 1324 if (quot == '\'') 1325 { 1326 UNGET ('x'); 1327 ch = '0'; 1328 } 1329 else 1330 UNGET (quot); 1331 } 1332 #endif 1333 /* Fall through. */ 1334 1335 case LEX_IS_SYMBOL_COMPONENT: 1336 if (state == 10) 1337 { 1338 /* This is a symbol character following another symbol 1339 character, with whitespace in between. We skipped 1340 the whitespace earlier, so output it now. */ 1341 UNGET (ch); 1342 state = 3; 1343 PUT (' '); 1344 break; 1345 } 1346 1347 #ifdef TC_Z80 1348 /* "af'" is a symbol containing '\''. */ 1349 if (state == 3 && (ch == 'a' || ch == 'A')) 1350 { 1351 state = 16; 1352 PUT (ch); 1353 ch = GET (); 1354 if (ch == 'f' || ch == 'F') 1355 { 1356 state = 17; 1357 PUT (ch); 1358 break; 1359 } 1360 else 1361 { 1362 state = 9; 1363 if (ch == EOF || !IS_SYMBOL_COMPONENT (ch)) 1364 { 1365 if (ch != EOF) 1366 UNGET (ch); 1367 break; 1368 } 1369 } 1370 } 1371 #endif 1372 if (state == 3) 1373 state = 9; 1374 1375 /* This is a common case. Quickly copy CH and all the 1376 following symbol component or normal characters. */ 1377 if (to + 1 < toend 1378 && mri_state == NULL 1379 #if defined TC_ARM && defined OBJ_ELF 1380 && symver_state == NULL 1381 #endif 1382 ) 1383 { 1384 char *s; 1385 ptrdiff_t len; 1386 1387 for (s = from; s < fromend; s++) 1388 { 1389 int type; 1390 1391 ch2 = *(unsigned char *) s; 1392 type = lex[ch2]; 1393 if (type != 0 1394 && type != LEX_IS_SYMBOL_COMPONENT) 1395 break; 1396 } 1397 1398 if (s > from) 1399 /* Handle the last character normally, for 1400 simplicity. */ 1401 --s; 1402 1403 len = s - from; 1404 1405 if (len > (toend - to) - 1) 1406 len = (toend - to) - 1; 1407 1408 if (len > 0) 1409 { 1410 PUT (ch); 1411 memcpy (to, from, len); 1412 to += len; 1413 from += len; 1414 if (to >= toend) 1415 goto tofull; 1416 ch = GET (); 1417 } 1418 } 1419 1420 /* Fall through. */ 1421 default: 1422 de_fault: 1423 /* Some relatively `normal' character. */ 1424 if (state == 0) 1425 { 1426 state = 11; /* Now seeing label definition. */ 1427 } 1428 else if (state == 1) 1429 { 1430 state = 2; /* Ditto. */ 1431 } 1432 else if (state == 9) 1433 { 1434 if (!IS_SYMBOL_COMPONENT (ch)) 1435 state = 3; 1436 } 1437 else if (state == 10) 1438 { 1439 if (ch == '\\') 1440 { 1441 /* Special handling for backslash: a backslash may 1442 be the beginning of a formal parameter (of a 1443 macro) following another symbol character, with 1444 whitespace in between. If that is the case, we 1445 output a space before the parameter. Strictly 1446 speaking, correct handling depends upon what the 1447 macro parameter expands into; if the parameter 1448 expands into something which does not start with 1449 an operand character, then we don't want to keep 1450 the space. We don't have enough information to 1451 make the right choice, so here we are making the 1452 choice which is more likely to be correct. */ 1453 if (to + 1 >= toend) 1454 { 1455 /* If we're near the end of the buffer, save the 1456 character for the next time round. Otherwise 1457 we'll lose our state. */ 1458 UNGET (ch); 1459 goto tofull; 1460 } 1461 *to++ = ' '; 1462 } 1463 1464 state = 3; 1465 } 1466 PUT (ch); 1467 break; 1468 } 1469 } 1470 1471 /*NOTREACHED*/ 1472 1473 fromeof: 1474 /* We have reached the end of the input. */ 1475 #ifdef TC_ARM 1476 if (to > tostart) 1477 last_char = to[-1]; 1478 #endif 1479 return to - tostart; 1480 1481 tofull: 1482 /* The output buffer is full. Save any input we have not yet 1483 processed. */ 1484 if (fromend > from) 1485 { 1486 saved_input = from; 1487 saved_input_len = fromend - from; 1488 } 1489 else 1490 saved_input = NULL; 1491 1492 #ifdef TC_ARM 1493 if (to > tostart) 1494 last_char = to[-1]; 1495 #endif 1496 return to - tostart; 1497 } 1498