1 /* This is the Assembler Pre-Processor 2 Copyright 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 3 1999, 2000, 2001, 2002, 2003 4 Free Software Foundation, Inc. 5 6 This file is part of GAS, the GNU Assembler. 7 8 GAS is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 2, or (at your option) 11 any later version. 12 13 GAS is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with GAS; see the file COPYING. If not, write to the Free 20 Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA 21 02110-1301, USA. */ 22 23 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90. */ 24 /* App, the assembler pre-processor. This pre-processor strips out excess 25 spaces, turns single-quoted characters into a decimal constant, and turns 26 # <number> <filename> <garbage> into a .line <number>\n.file <filename> 27 pair. This needs better error-handling. */ 28 29 #include <stdio.h> 30 #include "as.h" /* For BAD_CASE() only. */ 31 32 #if (__STDC__ != 1) 33 #ifndef const 34 #define const /* empty */ 35 #endif 36 #endif 37 38 #ifdef TC_M68K 39 /* Whether we are scrubbing in m68k MRI mode. This is different from 40 flag_m68k_mri, because the two flags will be affected by the .mri 41 pseudo-op at different times. */ 42 static int scrub_m68k_mri; 43 44 /* The pseudo-op which switches in and out of MRI mode. See the 45 comment in do_scrub_chars. */ 46 static const char mri_pseudo[] = ".mri 0"; 47 #else 48 #define scrub_m68k_mri 0 49 #endif 50 51 #if defined TC_ARM && defined OBJ_ELF 52 /* The pseudo-op for which we need to special-case `@' characters. 
53 See the comment in do_scrub_chars. */ 54 static const char symver_pseudo[] = ".symver"; 55 static const char * symver_state; 56 #endif 57 58 static char lex[256]; 59 static const char symbol_chars[] = 60 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; 61 62 #define LEX_IS_SYMBOL_COMPONENT 1 63 #define LEX_IS_WHITESPACE 2 64 #define LEX_IS_LINE_SEPARATOR 3 65 #define LEX_IS_COMMENT_START 4 66 #define LEX_IS_LINE_COMMENT_START 5 67 #define LEX_IS_TWOCHAR_COMMENT_1ST 6 68 #define LEX_IS_STRINGQUOTE 8 69 #define LEX_IS_COLON 9 70 #define LEX_IS_NEWLINE 10 71 #define LEX_IS_ONECHAR_QUOTE 11 72 #ifdef TC_V850 73 #define LEX_IS_DOUBLEDASH_1ST 12 74 #endif 75 #ifdef TC_M32R 76 #define DOUBLEBAR_PARALLEL 77 #endif 78 #ifdef DOUBLEBAR_PARALLEL 79 #define LEX_IS_DOUBLEBAR_1ST 13 80 #endif 81 #define LEX_IS_PARALLEL_SEPARATOR 14 82 #define IS_SYMBOL_COMPONENT(c) (lex[c] == LEX_IS_SYMBOL_COMPONENT) 83 #define IS_WHITESPACE(c) (lex[c] == LEX_IS_WHITESPACE) 84 #define IS_LINE_SEPARATOR(c) (lex[c] == LEX_IS_LINE_SEPARATOR) 85 #define IS_PARALLEL_SEPARATOR(c) (lex[c] == LEX_IS_PARALLEL_SEPARATOR) 86 #define IS_COMMENT(c) (lex[c] == LEX_IS_COMMENT_START) 87 #define IS_LINE_COMMENT(c) (lex[c] == LEX_IS_LINE_COMMENT_START) 88 #define IS_NEWLINE(c) (lex[c] == LEX_IS_NEWLINE) 89 90 static int process_escape (int); 91 92 /* FIXME-soon: The entire lexer/parser thingy should be 93 built statically at compile time rather than dynamically 94 each and every time the assembler is run. xoxorich. */ 95 96 void 97 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED) 98 { 99 const char *p; 100 int c; 101 102 lex[' '] = LEX_IS_WHITESPACE; 103 lex['\t'] = LEX_IS_WHITESPACE; 104 lex['\r'] = LEX_IS_WHITESPACE; 105 lex['\n'] = LEX_IS_NEWLINE; 106 lex[':'] = LEX_IS_COLON; 107 108 #ifdef TC_M68K 109 scrub_m68k_mri = m68k_mri; 110 111 if (! m68k_mri) 112 #endif 113 { 114 lex['"'] = LEX_IS_STRINGQUOTE; 115 116 #if ! defined (TC_HPPA) && ! 
defined (TC_I370) 117 /* I370 uses single-quotes to delimit integer, float constants. */ 118 lex['\''] = LEX_IS_ONECHAR_QUOTE; 119 #endif 120 121 #ifdef SINGLE_QUOTE_STRINGS 122 lex['\''] = LEX_IS_STRINGQUOTE; 123 #endif 124 } 125 126 /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop 127 in state 5 of do_scrub_chars must be changed. */ 128 129 /* Note that these override the previous defaults, e.g. if ';' is a 130 comment char, then it isn't a line separator. */ 131 for (p = symbol_chars; *p; ++p) 132 lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT; 133 134 for (c = 128; c < 256; ++c) 135 lex[c] = LEX_IS_SYMBOL_COMPONENT; 136 137 #ifdef tc_symbol_chars 138 /* This macro permits the processor to specify all characters which 139 may appears in an operand. This will prevent the scrubber from 140 discarding meaningful whitespace in certain cases. The i386 141 backend uses this to support prefixes, which can confuse the 142 scrubber as to whether it is parsing operands or opcodes. */ 143 for (p = tc_symbol_chars; *p; ++p) 144 lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT; 145 #endif 146 147 /* The m68k backend wants to be able to change comment_chars. */ 148 #ifndef tc_comment_chars 149 #define tc_comment_chars comment_chars 150 #endif 151 for (p = tc_comment_chars; *p; p++) 152 lex[(unsigned char) *p] = LEX_IS_COMMENT_START; 153 154 for (p = line_comment_chars; *p; p++) 155 lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START; 156 157 for (p = line_separator_chars; *p; p++) 158 lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR; 159 160 #ifdef tc_parallel_separator_chars 161 /* This macro permits the processor to specify all characters which 162 separate parallel insns on the same line. */ 163 for (p = tc_parallel_separator_chars; *p; p++) 164 lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR; 165 #endif 166 167 /* Only allow slash-star comments if slash is not in use. 168 FIXME: This isn't right. We should always permit them. 
*/ 169 if (lex['/'] == 0) 170 lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST; 171 172 #ifdef TC_M68K 173 if (m68k_mri) 174 { 175 lex['\''] = LEX_IS_STRINGQUOTE; 176 lex[';'] = LEX_IS_COMMENT_START; 177 lex['*'] = LEX_IS_LINE_COMMENT_START; 178 /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but 179 then it can't be used in an expression. */ 180 lex['!'] = LEX_IS_LINE_COMMENT_START; 181 } 182 #endif 183 184 #ifdef TC_V850 185 lex['-'] = LEX_IS_DOUBLEDASH_1ST; 186 #endif 187 #ifdef DOUBLEBAR_PARALLEL 188 lex['|'] = LEX_IS_DOUBLEBAR_1ST; 189 #endif 190 #ifdef TC_D30V 191 /* Must do this is we want VLIW instruction with "->" or "<-". */ 192 lex['-'] = LEX_IS_SYMBOL_COMPONENT; 193 #endif 194 } 195 196 /* Saved state of the scrubber. */ 197 static int state; 198 static int old_state; 199 static char *out_string; 200 static char out_buf[20]; 201 static int add_newlines; 202 static char *saved_input; 203 static int saved_input_len; 204 static char input_buffer[32 * 1024]; 205 static const char *mri_state; 206 static char mri_last_ch; 207 208 /* Data structure for saving the state of app across #include's. Note that 209 app is called asynchronously to the parsing of the .include's, so our 210 state at the time .include is interpreted is completely unrelated. 211 That's why we have to save it all. 
*/ 212 213 struct app_save 214 { 215 int state; 216 int old_state; 217 char * out_string; 218 char out_buf[sizeof (out_buf)]; 219 int add_newlines; 220 char * saved_input; 221 int saved_input_len; 222 #ifdef TC_M68K 223 int scrub_m68k_mri; 224 #endif 225 const char * mri_state; 226 char mri_last_ch; 227 #if defined TC_ARM && defined OBJ_ELF 228 const char * symver_state; 229 #endif 230 }; 231 232 char * 233 app_push (void) 234 { 235 register struct app_save *saved; 236 237 saved = (struct app_save *) xmalloc (sizeof (*saved)); 238 saved->state = state; 239 saved->old_state = old_state; 240 saved->out_string = out_string; 241 memcpy (saved->out_buf, out_buf, sizeof (out_buf)); 242 saved->add_newlines = add_newlines; 243 if (saved_input == NULL) 244 saved->saved_input = NULL; 245 else 246 { 247 saved->saved_input = xmalloc (saved_input_len); 248 memcpy (saved->saved_input, saved_input, saved_input_len); 249 saved->saved_input_len = saved_input_len; 250 } 251 #ifdef TC_M68K 252 saved->scrub_m68k_mri = scrub_m68k_mri; 253 #endif 254 saved->mri_state = mri_state; 255 saved->mri_last_ch = mri_last_ch; 256 #if defined TC_ARM && defined OBJ_ELF 257 saved->symver_state = symver_state; 258 #endif 259 260 /* do_scrub_begin() is not useful, just wastes time. */ 261 262 state = 0; 263 saved_input = NULL; 264 265 return (char *) saved; 266 } 267 268 void 269 app_pop (char *arg) 270 { 271 register struct app_save *saved = (struct app_save *) arg; 272 273 /* There is no do_scrub_end (). 
*/ 274 state = saved->state; 275 old_state = saved->old_state; 276 out_string = saved->out_string; 277 memcpy (out_buf, saved->out_buf, sizeof (out_buf)); 278 add_newlines = saved->add_newlines; 279 if (saved->saved_input == NULL) 280 saved_input = NULL; 281 else 282 { 283 assert (saved->saved_input_len <= (int) (sizeof input_buffer)); 284 memcpy (input_buffer, saved->saved_input, saved->saved_input_len); 285 saved_input = input_buffer; 286 saved_input_len = saved->saved_input_len; 287 free (saved->saved_input); 288 } 289 #ifdef TC_M68K 290 scrub_m68k_mri = saved->scrub_m68k_mri; 291 #endif 292 mri_state = saved->mri_state; 293 mri_last_ch = saved->mri_last_ch; 294 #if defined TC_ARM && defined OBJ_ELF 295 symver_state = saved->symver_state; 296 #endif 297 298 free (arg); 299 } 300 301 /* @@ This assumes that \n &c are the same on host and target. This is not 302 necessarily true. */ 303 304 static int 305 process_escape (int ch) 306 { 307 switch (ch) 308 { 309 case 'b': 310 return '\b'; 311 case 'f': 312 return '\f'; 313 case 'n': 314 return '\n'; 315 case 'r': 316 return '\r'; 317 case 't': 318 return '\t'; 319 case '\'': 320 return '\''; 321 case '"': 322 return '\"'; 323 default: 324 return ch; 325 } 326 } 327 328 /* This function is called to process input characters. The GET 329 parameter is used to retrieve more input characters. GET should 330 set its parameter to point to a buffer, and return the length of 331 the buffer; it should return 0 at end of file. The scrubbed output 332 characters are put into the buffer starting at TOSTART; the TOSTART 333 buffer is TOLEN bytes in length. The function returns the number 334 of scrubbed characters put into TOSTART. This will be TOLEN unless 335 end of file was seen. This function is arranged as a state 336 machine, and saves its state so that it may return at any point. 337 This is the way the old code used to work. 
*/

/* Implementation notes for do_scrub_chars:

   GET returns an int: a byte value in 0..255, or EOF.  EOF must never
   be stored with PUT, pushed back with UNGET, or used as an index into
   lex[].

   UNGET steps FROM backwards, so it is only valid while FROM is past
   the start of the buffer it points into.  When GET has just returned
   EOF, FROM is at the very start of input_buffer and there is no room
   to push anything back.

   NOTE(review): the .symver and .mri recognizers after the `recycle'
   label can still evaluate lex[ch] with CH == EOF; that is left as it
   was and should be looked at separately.  */

int
do_scrub_chars (int (*get) (char *, int), char *tostart, int tolen)
{
  char *to = tostart;
  char *toend = tostart + tolen;
  char *from;
  char *fromend;
  int fromlen;
  register int ch, ch2 = 0;
  /* Character that started the string we're working on.  */
  static char quotechar;

  /*State 0: beginning of normal line
          1: After first whitespace on line (flush more white)
          2: After first non-white (opcode) on line (keep 1white)
          3: after second white on line (into operands) (flush white)
          4: after putting out a .line, put out digits
          5: parsing a string, then go to old-state
          6: putting out \ escape in a "d string.
          7: After putting out a .appfile, put out string.
          8: After putting out a .appfile string, flush until newline.
          9: After seeing symbol char in state 3 (keep 1white after symchar)
         10: After seeing whitespace in state 9 (keep white before symchar)
         11: After seeing a symbol character in state 0 (eg a label definition)
         -1: output string in out_string and go to the state in old_state
         -2: flush text until a '*' '/' is seen, then go to state old_state
#ifdef TC_V850
         12: After seeing a dash, looking for a second dash as a start
             of comment.
#endif
#ifdef DOUBLEBAR_PARALLEL
         13: After seeing a vertical bar, looking for a second
             vertical bar as a parallel expression separator.
#endif
#ifdef TC_IA64
         14: After seeing a `(' at state 0, looking for a `)' as
             predicate.
         15: After seeing a `(' at state 1, looking for a `)' as
             predicate.
#endif
#ifdef TC_Z80
         16: After seeing an 'a' or an 'A' at the start of a symbol
         17: After seeing an 'f' or an 'F' in state 16
#endif
          */

  /* I added states 9 and 10 because the MIPS ECOFF assembler uses
     constructs like ``.loc 1 20''.  This was turning into ``.loc
     120''.  States 9 and 10 ensure that a space is never dropped in
     between characters which could appear in an identifier.  Ian
     Taylor, ian@cygnus.com.

     I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
     correctly on the PA (and any other target where colons are optional).
     Jeff Law, law@cs.utah.edu.

     I added state 13 so that something like "cmp r1, r2 || trap #1" does not
     get squashed into "cmp r1,r2||trap#1", with the all important space
     between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */

  /* This macro gets the next input character.  */

#define GET()                                                   \
  (from < fromend                                               \
   ? * (unsigned char *) (from++)                               \
   : (saved_input = NULL,                                       \
      fromlen = (*get) (input_buffer, sizeof input_buffer),     \
      from = input_buffer,                                      \
      fromend = from + fromlen,                                 \
      (fromlen == 0                                             \
       ? EOF                                                    \
       : * (unsigned char *) (from++))))

  /* This macro pushes a character back on the input stream.  */

#define UNGET(uch) (*--from = (uch))

  /* This macro puts a character into the output buffer.  If this
     character fills the output buffer, this macro jumps to the label
     TOFULL.  We use this rather ugly approach because we need to
     handle two different termination conditions: EOF on the input
     stream, and a full output buffer.  It would be simpler if we
     always read in the entire input stream before processing it, but
     I don't want to make such a significant change to the assembler's
     memory usage.  */

#define PUT(pch)                                \
  do                                            \
    {                                           \
      *to++ = (pch);                            \
      if (to >= toend)                          \
        goto tofull;                            \
    }                                           \
  while (0)

  if (saved_input != NULL)
    {
      from = saved_input;
      fromend = from + saved_input_len;
    }
  else
    {
      fromlen = (*get) (input_buffer, sizeof input_buffer);
      if (fromlen == 0)
        return 0;
      from = input_buffer;
      fromend = from + fromlen;
    }

  while (1)
    {
      /* The cases in this switch end with continue, in order to
         branch back to the top of this while loop and generate the
         next output character in the appropriate state.  */
      switch (state)
        {
        case -1:
          ch = *out_string++;
          if (*out_string == '\0')
            {
              state = old_state;
              old_state = 3;
            }
          PUT (ch);
          continue;

        case -2:
          for (;;)
            {
              do
                {
                  ch = GET ();

                  if (ch == EOF)
                    {
                      as_warn (_("end of file in comment"));
                      goto fromeof;
                    }

                  if (ch == '\n')
                    PUT ('\n');
                }
              while (ch != '*');

              while ((ch = GET ()) == '*')
                ;

              if (ch == EOF)
                {
                  as_warn (_("end of file in comment"));
                  goto fromeof;
                }

              if (ch == '/')
                break;

              UNGET (ch);
            }

          state = old_state;
          UNGET (' ');
          continue;

        case 4:
          ch = GET ();
          if (ch == EOF)
            goto fromeof;
          else if (ch >= '0' && ch <= '9')
            PUT (ch);
          else
            {
              while (ch != EOF && IS_WHITESPACE (ch))
                ch = GET ();
              if (ch == '"')
                {
                  UNGET (ch);
                  if (scrub_m68k_mri)
                    out_string = "\n\tappfile ";
                  else
                    out_string = "\n\t.appfile ";
                  old_state = 7;
                  state = -1;
                  PUT (*out_string++);
                }
              else
                {
                  while (ch != EOF && ch != '\n')
                    ch = GET ();
                  state = 0;
                  /* CH is now either a newline or EOF.  Terminate the
                     line in both cases; EOF itself must not be stored
                     in the output.  */
                  PUT ('\n');
                }
            }
          continue;

        case 5:
          /* We are going to copy everything up to a quote character,
             with special handling for a backslash.  We try to
             optimize the copying in the simple case without using the
             GET and PUT macros.  */
          {
            char *s;
            int len;

            for (s = from; s < fromend; s++)
              {
                ch = *s;
                if (ch == '\\'
                    || ch == quotechar
                    || ch == '\n')
                  break;
              }
            len = s - from;
            if (len > toend - to)
              len = toend - to;
            if (len > 0)
              {
                memcpy (to, from, len);
                to += len;
                from += len;
                /* The copy may have used up the output buffer exactly;
                   a PUT now would store past TOEND.  */
                if (to >= toend)
                  goto tofull;
              }
          }

          ch = GET ();
          if (ch == EOF)
            {
              /* GET has left FROM at the very start of input_buffer, so
                 there is no room in it for the UNGET below.  Switch the
                 input to this one-byte buffer instead.  */
              static char one_char_buf[1];

              as_warn (_("end of file in string; '%c' inserted"), quotechar);
              state = old_state;
              from = fromend = one_char_buf + 1;
              UNGET ('\n');
              PUT (quotechar);
            }
          else if (ch == quotechar)
            {
              state = old_state;
              PUT (ch);
            }
#ifndef NO_STRING_ESCAPES
          else if (ch == '\\')
            {
              state = 6;
              PUT (ch);
            }
#endif
          else if (scrub_m68k_mri && ch == '\n')
            {
              /* Just quietly terminate the string.  This permits lines like
                   bne  label   loop if we haven't reach end yet.  */
              state = old_state;
              UNGET (ch);
              PUT ('\'');
            }
          else
            {
              PUT (ch);
            }
          continue;

        case 6:
          state = 5;
          ch = GET ();
          switch (ch)
            {
              /* Handle strings broken across lines, by turning '\n' into
                 '\\' and 'n'.  */
            case '\n':
              UNGET ('n');
              add_newlines++;
              PUT ('\\');
              continue;

            case EOF:
              as_warn (_("end of file in string; '%c' inserted"), quotechar);
              PUT (quotechar);
              continue;

            case '"':
            case '\\':
            case 'b':
            case 'f':
            case 'n':
            case 'r':
            case 't':
            case 'v':
            case 'x':
            case 'X':
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
              break;

            default:
#ifdef ONLY_STANDARD_ESCAPES
              as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
#endif
              break;
            }
          PUT (ch);
          continue;

        case 7:
          ch = GET ();
          quotechar = ch;
          state = 5;
          old_state = 8;
          PUT (ch);
          continue;

        case 8:
          do
            ch = GET ();
          while (ch != '\n' && ch != EOF);
          if (ch == EOF)
            goto fromeof;
          state = 0;
          PUT (ch);
          continue;

#ifdef DOUBLEBAR_PARALLEL
        case 13:
          ch = GET ();
          if (ch != '|')
            abort ();

          /* Reset back to state 1 and pretend that we are parsing a
             line from just after the first white space.  */
          state = 1;
          PUT ('|');
          continue;
#endif
#ifdef TC_Z80
        case 16:
          /* We have seen an 'a' at the start of a symbol, look for an 'f'.  */
          ch = GET ();
          if (ch == 'f' || ch == 'F')
            {
              state = 17;
              PUT (ch);
            }
          else
            {
              /* Not "af": hand the character back so that the normal
                 processing below sees it instead of losing it.  */
              if (ch != EOF)
                UNGET (ch);
              state = 9;
              break;
            }
          /* Fall through.  */
        case 17:
          /* We have seen "af" at the start of a symbol,
             a ' here is a part of that symbol.  */
          ch = GET ();
          state = 9;
          if (ch == '\'')
            /* Change to avoid warning about unclosed string.  */
            PUT ('`');
          else if (ch != EOF)
            UNGET (ch);
          break;
#endif
        }

      /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */

      /* flushchar: */
      ch = GET ();

#ifdef TC_IA64
      if (ch == '(' && (state == 0 || state == 1))
        {
          state += 14;
          PUT (ch);
          continue;
        }
      else if (state == 14 || state == 15)
        {
          if (ch == ')')
            {
              state -= 14;
              PUT (ch);
              ch = GET ();
            }
          else
            {
              PUT (ch);
              continue;
            }
        }
#endif

    recycle:

#if defined TC_ARM && defined OBJ_ELF
      /* We need to watch out for .symver directives.  See the comment later
         in this function.  */
      if (symver_state == NULL)
        {
          if ((state == 0 || state == 1) && ch == symver_pseudo[0])
            symver_state = symver_pseudo + 1;
        }
      else
        {
          /* We advance to the next state if we find the right
             character.  */
          if (ch != '\0' && (*symver_state == ch))
            ++symver_state;
          else if (*symver_state != '\0')
            /* We did not get the expected character, or we didn't
               get a valid terminating character after seeing the
               entire pseudo-op, so we must go back to the beginning.  */
            symver_state = NULL;
          else
            {
              /* We've read the entire pseudo-op.  If this is the end
                 of the line, go back to the beginning.  */
              if (IS_NEWLINE (ch))
                symver_state = NULL;
            }
        }
#endif /* TC_ARM && OBJ_ELF */

#ifdef TC_M68K
      /* We want to have pseudo-ops which control whether we are in
         MRI mode or not.  Unfortunately, since m68k MRI mode affects
         the scrubber, that means that we need a special purpose
         recognizer here.  */
      if (mri_state == NULL)
        {
          if ((state == 0 || state == 1)
              && ch == mri_pseudo[0])
            mri_state = mri_pseudo + 1;
        }
      else
        {
          /* We advance to the next state if we find the right
             character, or if we need a space character and we get any
             whitespace character, or if we need a '0' and we get a
             '1' (this is so that we only need one state to handle
             ``.mri 0'' and ``.mri 1'').  */
          if (ch != '\0'
              && (*mri_state == ch
                  || (*mri_state == ' '
                      && lex[ch] == LEX_IS_WHITESPACE)
                  || (*mri_state == '0'
                      && ch == '1')))
            {
              mri_last_ch = ch;
              ++mri_state;
            }
          else if (*mri_state != '\0'
                   || (lex[ch] != LEX_IS_WHITESPACE
                       && lex[ch] != LEX_IS_NEWLINE))
            {
              /* We did not get the expected character, or we didn't
                 get a valid terminating character after seeing the
                 entire pseudo-op, so we must go back to the
                 beginning.  */
              mri_state = NULL;
            }
          else
            {
              /* We've read the entire pseudo-op.  mri_last_ch is
                 either '0' or '1' indicating whether to enter or
                 leave MRI mode.  */
              do_scrub_begin (mri_last_ch == '1');
              mri_state = NULL;

              /* We continue handling the character as usual.  The
                 main gas reader must also handle the .mri pseudo-op
                 to control expression parsing and the like.  */
            }
        }
#endif

      if (ch == EOF)
        {
          if (state != 0)
            {
              as_warn (_("end of file not at end of a line; newline inserted"));
              state = 0;
              PUT ('\n');
            }
          goto fromeof;
        }

      switch (lex[ch])
        {
        case LEX_IS_WHITESPACE:
          do
            {
              ch = GET ();
            }
          while (ch != EOF && IS_WHITESPACE (ch));
          if (ch == EOF)
            goto fromeof;

          if (state == 0)
            {
              /* Preserve a single whitespace character at the
                 beginning of a line.  */
              state = 1;
              UNGET (ch);
              PUT (' ');
              break;
            }

#ifdef KEEP_WHITE_AROUND_COLON
          if (lex[ch] == LEX_IS_COLON)
            {
              /* Only keep this white if there's no white *after* the
                 colon.  */
              ch2 = GET ();
              /* An EOF lookahead can be neither pushed back nor
                 classified through lex[]; in that case the colon is
                 simply handled by the code below.  */
              if (ch2 != EOF)
                UNGET (ch2);
              if (ch2 != EOF && !IS_WHITESPACE (ch2))
                {
                  state = 9;
                  UNGET (ch);
                  PUT (' ');
                  break;
                }
            }
#endif
          if (IS_COMMENT (ch)
              || ch == '/'
              || IS_LINE_SEPARATOR (ch)
              || IS_PARALLEL_SEPARATOR (ch))
            {
              if (scrub_m68k_mri)
                {
                  /* In MRI mode, we keep these spaces.  */
                  UNGET (ch);
                  PUT (' ');
                  break;
                }
              goto recycle;
            }

          /* If we're in state 2 or 11, we've seen a non-white
             character followed by whitespace.  If the next character
             is ':', this is whitespace after a label name which we
             normally must ignore.  In MRI mode, though, spaces are
             not permitted between the label and the colon.  */
          if ((state == 2 || state == 11)
              && lex[ch] == LEX_IS_COLON
              && ! scrub_m68k_mri)
            {
              state = 1;
              PUT (ch);
              break;
            }

          switch (state)
            {
            case 0:
              state++;
              goto recycle;     /* Punted leading sp */
            case 1:
              /* We can arrive here if we leave a leading whitespace
                 character at the beginning of a line.  */
              goto recycle;
            case 2:
              state = 3;
              if (to + 1 < toend)
                {
                  /* Optimize common case by skipping UNGET/GET.  */
                  PUT (' ');    /* Sp after opco */
                  goto recycle;
                }
              UNGET (ch);
              PUT (' ');
              break;
            case 3:
              if (scrub_m68k_mri)
                {
                  /* In MRI mode, we keep these spaces.  */
                  UNGET (ch);
                  PUT (' ');
                  break;
                }
              goto recycle;     /* Sp in operands */
            case 9:
            case 10:
              if (scrub_m68k_mri)
                {
                  /* In MRI mode, we keep these spaces.  */
                  state = 3;
                  UNGET (ch);
                  PUT (' ');
                  break;
                }
              state = 10;       /* Sp after symbol char */
              goto recycle;
            case 11:
              if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
                state = 1;
              else
                {
                  /* We know that ch is not ':', since we tested that
                     case above.  Therefore this is not a label, so it
                     must be the opcode, and we've just seen the
                     whitespace after it.  */
                  state = 3;
                }
              UNGET (ch);
              PUT (' ');        /* Sp after label definition.  */
              break;
            default:
              BAD_CASE (state);
            }
          break;

        case LEX_IS_TWOCHAR_COMMENT_1ST:
          ch2 = GET ();
          if (ch2 == '*')
            {
              for (;;)
                {
                  do
                    {
                      ch2 = GET ();
                      if (ch2 != EOF && IS_NEWLINE (ch2))
                        add_newlines++;
                    }
                  while (ch2 != EOF && ch2 != '*');

                  while (ch2 == '*')
                    ch2 = GET ();

                  if (ch2 == EOF || ch2 == '/')
                    break;

                  /* This UNGET will ensure that we count newlines
                     correctly.  */
                  UNGET (ch2);
                }

              if (ch2 == EOF)
                as_warn (_("end of file in multiline comment"));

              ch = ' ';
              goto recycle;
            }
#ifdef DOUBLESLASH_LINE_COMMENTS
          else if (ch2 == '/')
            {
              do
                {
                  ch = GET ();
                }
              while (ch != EOF && !IS_NEWLINE (ch));
              if (ch == EOF)
                as_warn (_("end of file in comment; newline inserted"));
              state = 0;
              PUT ('\n');
              break;
            }
#endif
          else
            {
              if (ch2 != EOF)
                UNGET (ch2);
              if (state == 9 || state == 10)
                state = 3;
              PUT (ch);
            }
          break;

        case LEX_IS_STRINGQUOTE:
          quotechar = ch;
          if (state == 10)
            {
              /* Preserve the whitespace in foo "bar".  */
              UNGET (ch);
              state = 3;
              PUT (' ');

              /* PUT didn't jump out.  We could just break, but we
                 know what will happen, so optimize a bit.  */
              ch = GET ();
              old_state = 3;
            }
          else if (state == 9)
            old_state = 3;
          else
            old_state = state;
          state = 5;
          PUT (ch);
          break;

#ifndef IEEE_STYLE
        case LEX_IS_ONECHAR_QUOTE:
          if (state == 10)
            {
              /* Preserve the whitespace in foo 'b'.  */
              UNGET (ch);
              state = 3;
              PUT (' ');
              break;
            }
          ch = GET ();
          if (ch == EOF)
            {
              as_warn (_("end of file after a one-character quote; \\0 inserted"));
              ch = 0;
            }
          if (ch == '\\')
            {
              ch = GET ();
              if (ch == EOF)
                {
                  as_warn (_("end of file in escape character"));
                  ch = '\\';
                }
              else
                ch = process_escape (ch);
            }
          sprintf (out_buf, "%d", (int) (unsigned char) ch);

          /* None of these 'x constants for us.  We want 'x'.  */
          if ((ch = GET ()) != '\'')
            {
#ifdef REQUIRE_CHAR_CLOSE_QUOTE
              as_warn (_("missing close quote; (assumed)"));
#else
              if (ch != EOF)
                UNGET (ch);
#endif
            }
          if (strlen (out_buf) == 1)
            {
              PUT (out_buf[0]);
              break;
            }
          if (state == 9)
            old_state = 3;
          else
            old_state = state;
          state = -1;
          out_string = out_buf;
          PUT (*out_string++);
          break;
#endif

        case LEX_IS_COLON:
#ifdef KEEP_WHITE_AROUND_COLON
          state = 9;
#else
          if (state == 9 || state == 10)
            state = 3;
          else if (state != 3)
            state = 1;
#endif
          PUT (ch);
          break;

        case LEX_IS_NEWLINE:
          /* Roll out a bunch of newlines from inside comments, etc.  */
          if (add_newlines)
            {
              --add_newlines;
              UNGET (ch);
            }
          /* Fall through.  */

        case LEX_IS_LINE_SEPARATOR:
          state = 0;
          PUT (ch);
          break;

        case LEX_IS_PARALLEL_SEPARATOR:
          state = 1;
          PUT (ch);
          break;

#ifdef TC_V850
        case LEX_IS_DOUBLEDASH_1ST:
          ch2 = GET ();
          if (ch2 != '-')
            {
              /* An EOF lookahead cannot be pushed back.  */
              if (ch2 != EOF)
                UNGET (ch2);
              goto de_fault;
            }
          /* Read and skip to end of line.  */
          do
            {
              ch = GET ();
            }
          while (ch != EOF && ch != '\n');

          if (ch == EOF)
            as_warn (_("end of file in comment; newline inserted"));

          state = 0;
          PUT ('\n');
          break;
#endif
#ifdef DOUBLEBAR_PARALLEL
        case LEX_IS_DOUBLEBAR_1ST:
          ch2 = GET ();
          /* An EOF lookahead cannot be pushed back.  */
          if (ch2 != EOF)
            UNGET (ch2);
          if (ch2 != '|')
            goto de_fault;

          /* Handle '||' in two states as invoking PUT twice might
             result in the first one jumping out of this loop.  We'd
             then lose track of the state and one '|' char.  */
          state = 13;
          PUT ('|');
          break;
#endif
        case LEX_IS_LINE_COMMENT_START:
          /* FIXME-someday: The two character comment stuff was badly
             thought out.  On i386, we want '/' as line comment start
             AND we want C style comments.  hence this hack.  The
             whole lexical process should be reworked.  xoxorich.  */
          if (ch == '/')
            {
              ch2 = GET ();
              if (ch2 == '*')
                {
                  old_state = 3;
                  state = -2;
                  break;
                }
              else if (ch2 != EOF)
                {
                  UNGET (ch2);
                }
            }

          if (state == 0 || state == 1) /* Only comment at start of line.  */
            {
              int startch;

              startch = ch;

              do
                {
                  ch = GET ();
                }
              while (ch != EOF && IS_WHITESPACE (ch));

              if (ch == EOF)
                {
                  as_warn (_("end of file in comment; newline inserted"));
                  PUT ('\n');
                  break;
                }

              if (ch < '0' || ch > '9' || state != 0 || startch != '#')
                {
                  /* Not a cpp line.  */
                  while (ch != EOF && !IS_NEWLINE (ch))
                    ch = GET ();
                  if (ch == EOF)
                    as_warn (_("end of file in comment; newline inserted"));
                  state = 0;
                  PUT ('\n');
                  break;
                }
              /* Looks like `# 123 "filename"' from cpp.  */
              UNGET (ch);
              old_state = 4;
              state = -1;
              if (scrub_m68k_mri)
                out_string = "\tappline ";
              else
                out_string = "\t.appline ";
              PUT (*out_string++);
              break;
            }

#ifdef TC_D10V
          /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
             Trap is the only short insn that has a first operand that is
             neither register nor label.
             We must prevent exef0f ||trap #1 to degenerate to exef0f ||trap#1 .
             We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
             already LEX_IS_LINE_COMMENT_START.  However, it is the
             only character in line_comment_chars for d10v, hence we
             can recognize it as such.  */
          /* An alternative approach would be to reset the state to 1 when
             we see '||', '<'- or '->', but that seems to be overkill.  */
          if (state == 10)
            PUT (' ');
#endif
          /* We have a line comment character which is not at the
             start of a line.  If this is also a normal comment
             character, fall through.  Otherwise treat it as a default
             character.  */
          if (strchr (tc_comment_chars, ch) == NULL
              && (! scrub_m68k_mri
                  || (ch != '!' && ch != '*')))
            goto de_fault;
          if (scrub_m68k_mri
              && (ch == '!' || ch == '*' || ch == '#')
              && state != 1
              && state != 10)
            goto de_fault;
          /* Fall through.  */
        case LEX_IS_COMMENT_START:
#if defined TC_ARM && defined OBJ_ELF
          /* On the ARM, `@' is the comment character.
             Unfortunately this is also a special character in ELF .symver
             directives (and .type, though we deal with those another way).
             So we check if this line is such a directive, and treat
             the character as default if so.  This is a hack.  */
          if ((symver_state != NULL) && (*symver_state == 0))
            goto de_fault;
#endif
#ifdef WARN_COMMENTS
          if (!found_comment)
            as_where (&found_comment_file, &found_comment);
#endif
          do
            {
              ch = GET ();
            }
          while (ch != EOF && !IS_NEWLINE (ch));
          if (ch == EOF)
            as_warn (_("end of file in comment; newline inserted"));
          state = 0;
          PUT ('\n');
          break;

        case LEX_IS_SYMBOL_COMPONENT:
          if (state == 10)
            {
              /* This is a symbol character following another symbol
                 character, with whitespace in between.  We skipped
                 the whitespace earlier, so output it now.  */
              UNGET (ch);
              state = 3;
              PUT (' ');
              break;
            }

#ifdef TC_Z80
          /* "af'" is a symbol containing '\''.  */
          if (state == 3 && (ch == 'a' || ch == 'A'))
            {
              state = 16;
              PUT (ch);
              ch = GET ();
              if (ch == 'f' || ch == 'F')
                {
                  state = 17;
                  PUT (ch);
                  break;
                }
              else
                {
                  state = 9;
                  /* EOF can be neither classified through lex[] nor
                     pushed back.  */
                  if (ch == EOF || !IS_SYMBOL_COMPONENT (ch))
                    {
                      if (ch != EOF)
                        UNGET (ch);
                      break;
                    }
                }
            }
#endif
          if (state == 3)
            state = 9;

          /* This is a common case.  Quickly copy CH and all the
             following symbol component or normal characters.  */
          if (to + 1 < toend
              && mri_state == NULL
#if defined TC_ARM && defined OBJ_ELF
              && symver_state == NULL
#endif
              )
            {
              char *s;
              int len;

              for (s = from; s < fromend; s++)
                {
                  int type;

                  ch2 = *(unsigned char *) s;
                  type = lex[ch2];
                  if (type != 0
                      && type != LEX_IS_SYMBOL_COMPONENT)
                    break;
                }

              if (s > from)
                /* Handle the last character normally, for
                   simplicity.  */
                --s;

              len = s - from;

              if (len > (toend - to) - 1)
                len = (toend - to) - 1;

              if (len > 0)
                {
                  PUT (ch);
                  memcpy (to, from, len);
                  to += len;
                  from += len;
                  if (to >= toend)
                    goto tofull;
                  ch = GET ();
                }
            }

          /* Fall through.  */
        default:
        de_fault:
          /* Some relatively `normal' character.  */
          if (state == 0)
            {
              state = 11;       /* Now seeing label definition.  */
            }
          else if (state == 1)
            {
              state = 2;        /* Ditto.  */
            }
          else if (state == 9)
            {
              if (!IS_SYMBOL_COMPONENT (ch))
                state = 3;
            }
          else if (state == 10)
            {
              if (ch == '\\')
                {
                  /* Special handling for backslash: a backslash may
                     be the beginning of a formal parameter (of a
                     macro) following another symbol character, with
                     whitespace in between.  If that is the case, we
                     output a space before the parameter.  Strictly
                     speaking, correct handling depends upon what the
                     macro parameter expands into; if the parameter
                     expands into something which does not start with
                     an operand character, then we don't want to keep
                     the space.  We don't have enough information to
                     make the right choice, so here we are making the
                     choice which is more likely to be correct.  */
                  PUT (' ');
                }

              state = 3;
            }
          PUT (ch);
          break;
        }
    }

  /*NOTREACHED*/

 fromeof:
  /* We have reached the end of the input.  */
  return to - tostart;

 tofull:
  /* The output buffer is full.  Save any input we have not yet
     processed.  */
  if (fromend > from)
    {
      saved_input = from;
      saved_input_len = fromend - from;
    }
  else
    saved_input = NULL;

  return to - tostart;
}
