1 /* 2 * This is a really stupid C tokenizer. It doesn't do any include 3 * files or anything complex at all. That's the preprocessor. 4 * 5 * Copyright (C) 2003 Transmeta Corp. 6 * 2003 Linus Torvalds 7 * 8 * Permission is hereby granted, free of charge, to any person obtaining a copy 9 * of this software and associated documentation files (the "Software"), to deal 10 * in the Software without restriction, including without limitation the rights 11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 * copies of the Software, and to permit persons to whom the Software is 13 * furnished to do so, subject to the following conditions: 14 * 15 * The above copyright notice and this permission notice shall be included in 16 * all copies or substantial portions of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 24 * THE SOFTWARE. 25 */ 26 #include <stdio.h> 27 #include <stdlib.h> 28 #include <stdarg.h> 29 #include <stddef.h> 30 #include <string.h> 31 #include <ctype.h> 32 #include <unistd.h> 33 #include <stdint.h> 34 35 #include "lib.h" 36 #include "allocate.h" 37 #include "token.h" 38 #include "symbol.h" 39 40 #define EOF (-1) 41 42 int input_stream_nr = 0; 43 struct stream *input_streams; 44 static int input_streams_allocated; 45 unsigned int tabstop = 8; 46 int no_lineno = 0; 47 48 #define BUFSIZE (8192) 49 50 typedef struct { 51 int fd, offset, size; 52 int pos, line, nr; 53 int newline, whitespace; 54 struct token **tokenlist; 55 struct token *token; 56 unsigned char *buffer; 57 } stream_t; 58 59 const char *stream_name(int stream) 60 { 61 if (stream < 0 || stream > input_stream_nr) 62 return "<bad stream>"; 63 return input_streams[stream].name; 64 } 65 66 static struct position stream_pos(stream_t *stream) 67 { 68 struct position pos; 69 pos.type = 0; 70 pos.stream = stream->nr; 71 pos.newline = stream->newline; 72 pos.whitespace = stream->whitespace; 73 pos.pos = stream->pos; 74 75 pos.line = stream->line; 76 if (no_lineno) 77 pos.line = 123456; 78 79 pos.noexpand = 0; 80 return pos; 81 } 82 83 const char *show_special(int val) 84 { 85 static char buffer[4]; 86 87 buffer[0] = val; 88 buffer[1] = 0; 89 if (val >= SPECIAL_BASE) 90 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]); 91 return buffer; 92 } 93 94 const char *show_ident(const struct ident *ident) 95 { 96 static char buff[4][256]; 97 static int n; 98 char *buffer; 99 100 if (!ident) 101 return "<noident>"; 102 buffer = buff[3 & ++n]; 103 sprintf(buffer, "%.*s", ident->len, ident->name); 104 return buffer; 105 } 106 107 static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next) 108 { 109 if (isprint(c)) { 110 if (c == escape || c == '\\') 111 *ptr++ = '\\'; 112 *ptr++ = c; 113 return ptr; 114 } 115 *ptr++ = '\\'; 116 switch (c) { 117 case '\n': 118 *ptr++ = 'n'; 119 return ptr; 120 case '\t': 121 *ptr++ = 't'; 122 return ptr; 123 } 124 if (!isdigit(next)) 125 return ptr + sprintf(ptr, "%o", c); 126 127 return ptr + sprintf(ptr, "%03o", c); 128 } 129 130 const char *show_string(const struct string *string) 131 { 132 static char buffer[4 * MAX_STRING + 3]; 133 char *ptr; 134 int i; 135 136 if (!string || !string->length) 137 return "<bad_string>"; 138 ptr = buffer; 139 *ptr++ = '"'; 140 for (i = 0; i < string->length-1; i++) { 141 const char *p = string->data + i; 142 ptr = charstr(ptr, p[0], '"', p[1]); 143 } 144 *ptr++ = '"'; 145 *ptr = '\0'; 146 return buffer; 147 } 148 149 static const char *show_char(const char *s, size_t len, char prefix, char delim) 150 { 151 static char buffer[MAX_STRING + 4]; 152 char *p = buffer; 153 if (prefix) 154 *p++ = prefix; 155 *p++ = delim; 156 memcpy(p, s, len); 157 p += len; 158 *p++ = delim; 159 *p++ = '\0'; 160 return buffer; 161 } 162 163 static const char *quote_char(const char *s, size_t len, char prefix, char delim) 164 { 165 static char buffer[2*MAX_STRING + 6]; 166 size_t i; 167 char *p = buffer; 168 if (prefix) 169 *p++ = prefix; 170 if (delim == '"') 171 *p++ = '\\'; 172 *p++ = delim; 173 for (i = 0; i < len; i++) { 174 if (s[i] == '"' || s[i] == '\\') 175 *p++ = '\\'; 176 *p++ = s[i]; 177 } 178 if (delim == '"') 179 *p++ = '\\'; 180 *p++ = delim; 181 *p++ = '\0'; 182 return buffer; 183 } 184 185 const char *show_token(const struct token *token) 186 { 187 static char buffer[256]; 188 189 if (!token) 190 return "<no token>"; 191 switch (token_type(token)) { 192 case TOKEN_ERROR: 193 return "syntax error"; 194 195 case TOKEN_EOF: 196 return "end-of-input"; 197 198 case TOKEN_IDENT: 199 return show_ident(token->ident); 200 201 case TOKEN_NUMBER: 202 return token->number; 203 204 case TOKEN_SPECIAL: 205 return show_special(token->special); 206 207 case TOKEN_CHAR: 208 return show_char(token->string->data, 209 token->string->length - 1, 0, '\''); 210 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3: 211 return show_char(token->embedded, 212 token_type(token) - TOKEN_CHAR, 0, '\''); 213 case TOKEN_WIDE_CHAR: 214 return show_char(token->string->data, 215 token->string->length - 1, 'L', '\''); 216 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3: 217 return show_char(token->embedded, 218 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\''); 219 case TOKEN_STRING: 220 return show_char(token->string->data, 221 token->string->length - 1, 0, '"'); 222 case TOKEN_WIDE_STRING: 223 return show_char(token->string->data, 224 token->string->length - 1, 'L', '"'); 225 226 case TOKEN_STREAMBEGIN: 227 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream)); 228 return buffer; 229 230 case TOKEN_STREAMEND: 231 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream)); 232 return buffer; 233 234 case TOKEN_UNTAINT: 235 sprintf(buffer, "<untaint>"); 236 return buffer; 237 238 case TOKEN_ARG_COUNT: 239 sprintf(buffer, "<argcnt>"); 240 return buffer; 241 242 default: 243 sprintf(buffer, "unhandled token type '%d' ", token_type(token)); 244 return buffer; 245 } 246 } 247 248 const char *quote_token(const struct token *token) 249 { 250 static char buffer[256]; 251 252 switch (token_type(token)) { 253 case TOKEN_ERROR: 254 return "syntax error"; 255 256 case TOKEN_IDENT: 257 return show_ident(token->ident); 258 259 case TOKEN_NUMBER: 260 return token->number; 261 262 case TOKEN_SPECIAL: 263 return show_special(token->special); 264 265 case TOKEN_CHAR: 266 return quote_char(token->string->data, 267 token->string->length - 1, 0, '\''); 268 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3: 269 return quote_char(token->embedded, 270 token_type(token) - TOKEN_CHAR, 0, '\''); 271 case TOKEN_WIDE_CHAR: 272 return quote_char(token->string->data, 273 token->string->length - 1, 'L', '\''); 274 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3: 275 return quote_char(token->embedded, 276 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\''); 277 case TOKEN_STRING: 278 return quote_char(token->string->data, 279 token->string->length - 1, 0, '"'); 280 case TOKEN_WIDE_STRING: 281 return quote_char(token->string->data, 282 token->string->length - 1, 'L', '"'); 283 default: 284 sprintf(buffer, "unhandled token type '%d' ", token_type(token)); 285 return buffer; 286 } 287 } 288 289 #define HASHED_INPUT_BITS (6) 290 #define HASHED_INPUT (1 << HASHED_INPUT_BITS) 291 #define HASH_PRIME 0x9e370001UL 292 293 static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 }; 294 295 int *hash_stream(const char *name) 296 { 297 uint32_t hash = 0; 298 unsigned char c; 299 300 while ((c = *name++) != 0) 301 hash = (hash + (c << 4) + (c >> 4)) * 11; 302 303 hash *= HASH_PRIME; 304 hash >>= 32 - HASHED_INPUT_BITS; 305 return input_stream_hashes + hash; 306 } 307 308 int init_stream(const char *name, int fd, const char **next_path) 309 { 310 int stream = input_stream_nr, *hash; 311 struct stream *current; 312 313 if (stream >= input_streams_allocated) { 314 int newalloc = stream * 4 / 3 + 10; 315 input_streams = realloc(input_streams, newalloc * sizeof(struct stream)); 316 if (!input_streams) 317 die("Unable to allocate more streams space"); 318 input_streams_allocated = newalloc; 319 } 320 current = input_streams + stream; 321 memset(current, 0, sizeof(*current)); 322 current->name = name; 323 current->fd = fd; 324 current->next_path = next_path; 325 current->path = NULL; 326 current->constant = CONSTANT_FILE_MAYBE; 327 input_stream_nr = stream+1; 328 hash = hash_stream(name); 329 current->next_stream = *hash; 330 *hash = stream; 331 return stream; 332 } 333 334 static struct token * alloc_token(stream_t *stream) 335 { 336 struct token *token = __alloc_token(0); 337 token->pos = stream_pos(stream); 338 return token; 339 } 340 341 /* 342 * Argh... That was surprisingly messy - handling '\r' complicates the 343 * things a _lot_. 344 */ 345 static int nextchar_slow(stream_t *stream) 346 { 347 int offset = stream->offset; 348 int size = stream->size; 349 int c; 350 int spliced = 0, had_cr, had_backslash; 351 352 restart: 353 had_cr = had_backslash = 0; 354 355 repeat: 356 if (offset >= size) { 357 if (stream->fd < 0) 358 goto got_eof; 359 size = read(stream->fd, stream->buffer, BUFSIZE); 360 if (size <= 0) 361 goto got_eof; 362 stream->size = size; 363 stream->offset = offset = 0; 364 } 365 366 c = stream->buffer[offset++]; 367 if (had_cr) 368 goto check_lf; 369 370 if (c == '\r') { 371 had_cr = 1; 372 goto repeat; 373 } 374 375 norm: 376 if (!had_backslash) { 377 switch (c) { 378 case '\t': 379 stream->pos += tabstop - stream->pos % tabstop; 380 break; 381 case '\n': 382 stream->line++; 383 stream->pos = 0; 384 stream->newline = 1; 385 break; 386 case '\\': 387 had_backslash = 1; 388 stream->pos++; 389 goto repeat; 390 default: 391 stream->pos++; 392 } 393 } else { 394 if (c == '\n') { 395 stream->line++; 396 stream->pos = 0; 397 spliced = 1; 398 goto restart; 399 } 400 offset--; 401 c = '\\'; 402 } 403 out: 404 stream->offset = offset; 405 406 return c; 407 408 check_lf: 409 if (c != '\n') 410 offset--; 411 c = '\n'; 412 goto norm; 413 414 got_eof: 415 if (had_backslash) { 416 c = '\\'; 417 goto out; 418 } 419 if (stream->pos) 420 warning(stream_pos(stream), "no newline at end of file"); 421 else if (spliced) 422 warning(stream_pos(stream), "backslash-newline at end of file"); 423 return EOF; 424 } 425 426 /* 427 * We want that as light as possible while covering all normal cases. 428 * Slow path (including the logics with line-splicing and EOF sanity 429 * checks) is in nextchar_slow(). 430 */ 431 static inline int nextchar(stream_t *stream) 432 { 433 int offset = stream->offset; 434 435 if (offset < stream->size) { 436 int c = stream->buffer[offset++]; 437 static const char special[256] = { 438 ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1 439 }; 440 if (!special[c]) { 441 stream->offset = offset; 442 stream->pos++; 443 return c; 444 } 445 } 446 return nextchar_slow(stream); 447 } 448 449 struct token eof_token_entry; 450 451 static struct token *mark_eof(stream_t *stream) 452 { 453 struct token *end; 454 455 end = alloc_token(stream); 456 eof_token_entry.pos = end->pos; 457 token_type(end) = TOKEN_STREAMEND; 458 end->pos.newline = 1; 459 460 eof_token_entry.next = &eof_token_entry; 461 eof_token_entry.pos.newline = 1; 462 463 end->next = &eof_token_entry; 464 *stream->tokenlist = end; 465 stream->tokenlist = NULL; 466 return end; 467 } 468 469 static void add_token(stream_t *stream) 470 { 471 struct token *token = stream->token; 472 473 stream->token = NULL; 474 token->next = NULL; 475 *stream->tokenlist = token; 476 stream->tokenlist = &token->next; 477 } 478 479 static void drop_token(stream_t *stream) 480 { 481 stream->newline |= stream->token->pos.newline; 482 stream->whitespace |= stream->token->pos.whitespace; 483 stream->token = NULL; 484 } 485 486 enum { 487 Letter = 1, 488 Digit = 2, 489 Hex = 4, 490 Exp = 8, 491 Dot = 16, 492 ValidSecond = 32, 493 Quote = 64, 494 }; 495 496 static const char cclass[257] = { 497 ['0' + 1 ... '9' + 1] = Digit | Hex, 498 ['A' + 1 ... 'D' + 1] = Letter | Hex, 499 ['E' + 1] = Letter | Hex | Exp, /* E<exp> */ 500 ['F' + 1] = Letter | Hex, 501 ['G' + 1 ... 'O' + 1] = Letter, 502 ['P' + 1] = Letter | Exp, /* P<exp> */ 503 ['Q' + 1 ... 'Z' + 1] = Letter, 504 ['a' + 1 ... 'd' + 1] = Letter | Hex, 505 ['e' + 1] = Letter | Hex | Exp, /* e<exp> */ 506 ['f' + 1] = Letter | Hex, 507 ['g' + 1 ... 'o' + 1] = Letter, 508 ['p' + 1] = Letter | Exp, /* p<exp> */ 509 ['q' + 1 ... 'z' + 1] = Letter, 510 ['_' + 1] = Letter, 511 ['.' + 1] = Dot | ValidSecond, 512 ['=' + 1] = ValidSecond, 513 ['+' + 1] = ValidSecond, 514 ['-' + 1] = ValidSecond, 515 ['>' + 1] = ValidSecond, 516 ['<' + 1] = ValidSecond, 517 ['&' + 1] = ValidSecond, 518 ['|' + 1] = ValidSecond, 519 ['#' + 1] = ValidSecond, 520 ['\'' + 1] = Quote, 521 ['"' + 1] = Quote, 522 }; 523 524 /* 525 * pp-number: 526 * digit 527 * . digit 528 * pp-number digit 529 * pp-number identifier-nodigit 530 * pp-number e sign 531 * pp-number E sign 532 * pp-number p sign 533 * pp-number P sign 534 * pp-number . 535 */ 536 static int get_one_number(int c, int next, stream_t *stream) 537 { 538 struct token *token; 539 static char buffer[4095]; 540 char *p = buffer, *buffer_end = buffer + sizeof (buffer); 541 542 *p++ = c; 543 for (;;) { 544 long class = cclass[next + 1]; 545 if (!(class & (Dot | Digit | Letter))) 546 break; 547 if (p != buffer_end) 548 *p++ = next; 549 next = nextchar(stream); 550 if (class & Exp) { 551 if (next == '-' || next == '+') { 552 if (p != buffer_end) 553 *p++ = next; 554 next = nextchar(stream); 555 } 556 } 557 } 558 559 if (p == buffer_end) { 560 sparse_error(stream_pos(stream), "number token exceeds %td characters", 561 buffer_end - buffer); 562 // Pretend we saw just "1". 563 buffer[0] = '1'; 564 p = buffer + 1; 565 } 566 567 *p++ = 0; 568 token = stream->token; 569 token_type(token) = TOKEN_NUMBER; 570 token->number = xmemdup(buffer, p - buffer); 571 add_token(stream); 572 573 return next; 574 } 575 576 static int eat_string(int next, stream_t *stream, enum token_type type) 577 { 578 static char buffer[MAX_STRING]; 579 struct string *string; 580 struct token *token = stream->token; 581 int len = 0; 582 int escape; 583 int want_hex = 0; 584 char delim = type < TOKEN_STRING ? '\'' : '"'; 585 586 for (escape = 0; escape || next != delim; next = nextchar(stream)) { 587 if (len < MAX_STRING) 588 buffer[len] = next; 589 len++; 590 if (next == '\n') { 591 warning(stream_pos(stream), 592 "missing terminating %c character", delim); 593 /* assume delimiter is lost */ 594 break; 595 } 596 if (next == EOF) { 597 warning(stream_pos(stream), 598 "End of file in middle of string"); 599 return next; 600 } 601 if (!escape) { 602 if (want_hex && !(cclass[next + 1] & Hex)) 603 warning(stream_pos(stream), 604 "\\x used with no following hex digits"); 605 want_hex = 0; 606 escape = next == '\\'; 607 } else { 608 escape = 0; 609 want_hex = next == 'x'; 610 } 611 } 612 if (want_hex) 613 warning(stream_pos(stream), 614 "\\x used with no following hex digits"); 615 if (len > MAX_STRING) { 616 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING); 617 len = MAX_STRING; 618 } 619 if (delim == '\'' && len <= 4) { 620 if (len == 0) { 621 sparse_error(stream_pos(stream), 622 "empty character constant"); 623 return nextchar(stream); 624 } 625 token_type(token) = type + len; 626 memset(buffer + len, '\0', 4 - len); 627 memcpy(token->embedded, buffer, 4); 628 } else { 629 token_type(token) = type; 630 string = __alloc_string(len+1); 631 memcpy(string->data, buffer, len); 632 string->data[len] = '\0'; 633 string->length = len+1; 634 token->string = string; 635 } 636 637 /* Pass it on.. */ 638 token = stream->token; 639 add_token(stream); 640 return nextchar(stream); 641 } 642 643 static int drop_stream_eoln(stream_t *stream) 644 { 645 drop_token(stream); 646 for (;;) { 647 switch (nextchar(stream)) { 648 case EOF: 649 return EOF; 650 case '\n': 651 return nextchar(stream); 652 } 653 } 654 } 655 656 static int drop_stream_comment(stream_t *stream) 657 { 658 int newline; 659 int next; 660 drop_token(stream); 661 newline = stream->newline; 662 663 next = nextchar(stream); 664 for (;;) { 665 int curr = next; 666 if (curr == EOF) { 667 warning(stream_pos(stream), "End of file in the middle of a comment"); 668 return curr; 669 } 670 next = nextchar(stream); 671 if (curr == '*' && next == '/') 672 break; 673 } 674 stream->newline = newline; 675 return nextchar(stream); 676 } 677 678 unsigned char combinations[][4] = COMBINATION_STRINGS; 679 680 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE) 681 682 /* hash function for two-character punctuators - all give unique values */ 683 #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31) 684 685 /* 686 * note that we won't get false positives - special_hash(0,0) is 0 and 687 * entry 0 is filled (by +=), so all the missing ones are OK. 688 */ 689 static unsigned char hash_results[32][2] = { 690 #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1} 691 RES('+', '='), /* 00 */ 692 RES('/', '='), /* 01 */ 693 RES('^', '='), /* 05 */ 694 RES('&', '&'), /* 07 */ 695 RES('#', '#'), /* 08 */ 696 RES('<', '<'), /* 0a */ 697 RES('<', '='), /* 0c */ 698 RES('!', '='), /* 0e */ 699 RES('%', '='), /* 0f */ 700 RES('-', '-'), /* 10 */ 701 RES('-', '='), /* 11 */ 702 RES('-', '>'), /* 13 */ 703 RES('=', '='), /* 15 */ 704 RES('&', '='), /* 17 */ 705 RES('*', '='), /* 18 */ 706 RES('.', '.'), /* 1a */ 707 RES('+', '+'), /* 1b */ 708 RES('|', '='), /* 1c */ 709 RES('>', '='), /* 1d */ 710 RES('|', '|'), /* 1e */ 711 RES('>', '>') /* 1f */ 712 #undef RES 713 }; 714 static int code[32] = { 715 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value 716 CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */ 717 CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */ 718 CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */ 719 CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */ 720 CODE('#', '#', SPECIAL_HASHHASH), /* 08 */ 721 CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */ 722 CODE('<', '=', SPECIAL_LTE), /* 0c */ 723 CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */ 724 CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */ 725 CODE('-', '-', SPECIAL_DECREMENT), /* 10 */ 726 CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */ 727 CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */ 728 CODE('=', '=', SPECIAL_EQUAL), /* 15 */ 729 CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */ 730 CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */ 731 CODE('.', '.', SPECIAL_DOTDOT), /* 1a */ 732 CODE('+', '+', SPECIAL_INCREMENT), /* 1b */ 733 CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */ 734 CODE('>', '=', SPECIAL_GTE), /* 1d */ 735 CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */ 736 CODE('>', '>', SPECIAL_RIGHTSHIFT) /* 1f */ 737 #undef CODE 738 }; 739 740 static int get_one_special(int c, stream_t *stream) 741 { 742 struct token *token; 743 int next, value, i; 744 745 next = nextchar(stream); 746 747 /* 748 * Check for numbers, strings, character constants, and comments 749 */ 750 switch (c) { 751 case '.': 752 if (next >= '0' && next <= '9') 753 return get_one_number(c, next, stream); 754 break; 755 case '"': 756 return eat_string(next, stream, TOKEN_STRING); 757 case '\'': 758 return eat_string(next, stream, TOKEN_CHAR); 759 case '/': 760 if (next == '/') 761 return drop_stream_eoln(stream); 762 if (next == '*') 763 return drop_stream_comment(stream); 764 } 765 766 /* 767 * Check for combinations 768 */ 769 value = c; 770 if (cclass[next + 1] & ValidSecond) { 771 i = special_hash(c, next); 772 if (hash_results[i][0] == c && hash_results[i][1] == next) { 773 value = code[i]; 774 next = nextchar(stream); 775 if (value >= SPECIAL_LEFTSHIFT && 776 next == "==."[value - SPECIAL_LEFTSHIFT]) { 777 value += 3; 778 next = nextchar(stream); 779 } 780 } 781 } 782 783 /* Pass it on.. */ 784 token = stream->token; 785 token_type(token) = TOKEN_SPECIAL; 786 token->special = value; 787 add_token(stream); 788 return next; 789 } 790 791 #define IDENT_HASH_BITS (13) 792 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS) 793 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1) 794 795 #define ident_hash_init(c) (c) 796 #define ident_hash_add(oldhash,c) ((oldhash)*11 + (c)) 797 #define ident_hash_end(hash) ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK) 798 799 static struct ident *hash_table[IDENT_HASH_SIZE]; 800 static int ident_hit, ident_miss, idents; 801 802 void show_identifier_stats(void) 803 { 804 int i; 805 int distribution[100]; 806 807 fprintf(stderr, "identifiers: %d hits, %d misses\n", 808 ident_hit, ident_miss); 809 810 for (i = 0; i < 100; i++) 811 distribution[i] = 0; 812 813 for (i = 0; i < IDENT_HASH_SIZE; i++) { 814 struct ident * ident = hash_table[i]; 815 int count = 0; 816 817 while (ident) { 818 count++; 819 ident = ident->next; 820 } 821 if (count > 99) 822 count = 99; 823 distribution[count]++; 824 } 825 826 for (i = 0; i < 100; i++) { 827 if (distribution[i]) 828 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]); 829 } 830 } 831 832 struct ident *alloc_ident(const char *name, int len) 833 { 834 struct ident *ident = __alloc_ident(len); 835 ident->symbols = NULL; 836 ident->len = len; 837 ident->tainted = 0; 838 memcpy(ident->name, name, len); 839 return ident; 840 } 841 842 static struct ident * insert_hash(struct ident *ident, unsigned long hash) 843 { 844 ident->next = hash_table[hash]; 845 hash_table[hash] = ident; 846 ident_miss++; 847 return ident; 848 } 849 850 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash) 851 { 852 struct ident *ident; 853 struct ident **p; 854 855 p = &hash_table[hash]; 856 while ((ident = *p) != NULL) { 857 if (ident->len == (unsigned char) len) { 858 if (strncmp(name, ident->name, len) != 0) 859 goto next; 860 861 ident_hit++; 862 return ident; 863 } 864 next: 865 //misses++; 866 p = &ident->next; 867 } 868 ident = alloc_ident(name, len); 869 *p = ident; 870 ident->next = NULL; 871 ident_miss++; 872 idents++; 873 return ident; 874 } 875 876 static unsigned long hash_name(const char *name, int len) 877 { 878 unsigned long hash; 879 const unsigned char *p = (const unsigned char *)name; 880 881 hash = ident_hash_init(*p++); 882 while (--len) { 883 unsigned int i = *p++; 884 hash = ident_hash_add(hash, i); 885 } 886 return ident_hash_end(hash); 887 } 888 889 struct ident *hash_ident(struct ident *ident) 890 { 891 return insert_hash(ident, hash_name(ident->name, ident->len)); 892 } 893 894 struct ident *built_in_ident(const char *name) 895 { 896 int len = strlen(name); 897 return create_hashed_ident(name, len, hash_name(name, len)); 898 } 899 900 struct token *built_in_token(int stream, struct ident *ident) 901 { 902 struct token *token; 903 904 token = __alloc_token(0); 905 token->pos.stream = stream; 906 token_type(token) = TOKEN_IDENT; 907 token->ident = ident; 908 return token; 909 } 910 911 static int get_one_identifier(int c, stream_t *stream) 912 { 913 struct token *token; 914 struct ident *ident; 915 unsigned long hash; 916 char buf[256]; 917 int len = 1; 918 int next; 919 920 hash = ident_hash_init(c); 921 buf[0] = c; 922 for (;;) { 923 next = nextchar(stream); 924 if (!(cclass[next + 1] & (Letter | Digit))) 925 break; 926 if (len >= sizeof(buf)) 927 break; 928 hash = ident_hash_add(hash, next); 929 buf[len] = next; 930 len++; 931 }; 932 if (cclass[next + 1] & Quote) { 933 if (len == 1 && buf[0] == 'L') { 934 if (next == '\'') 935 return eat_string(nextchar(stream), stream, 936 TOKEN_WIDE_CHAR); 937 else 938 return eat_string(nextchar(stream), stream, 939 TOKEN_WIDE_STRING); 940 } 941 } 942 hash = ident_hash_end(hash); 943 ident = create_hashed_ident(buf, len, hash); 944 945 /* Pass it on.. */ 946 token = stream->token; 947 token_type(token) = TOKEN_IDENT; 948 token->ident = ident; 949 add_token(stream); 950 return next; 951 } 952 953 static int get_one_token(int c, stream_t *stream) 954 { 955 long class = cclass[c + 1]; 956 if (class & Digit) 957 return get_one_number(c, nextchar(stream), stream); 958 if (class & Letter) 959 return get_one_identifier(c, stream); 960 return get_one_special(c, stream); 961 } 962 963 static struct token *setup_stream(stream_t *stream, int idx, int fd, 964 unsigned char *buf, unsigned int buf_size) 965 { 966 struct token *begin; 967 968 stream->nr = idx; 969 stream->line = 1; 970 stream->newline = 1; 971 stream->whitespace = 0; 972 stream->pos = 0; 973 974 stream->token = NULL; 975 stream->fd = fd; 976 stream->offset = 0; 977 stream->size = buf_size; 978 stream->buffer = buf; 979 980 begin = alloc_token(stream); 981 token_type(begin) = TOKEN_STREAMBEGIN; 982 stream->tokenlist = &begin->next; 983 return begin; 984 } 985 986 static struct token *tokenize_stream(stream_t *stream) 987 { 988 int c = nextchar(stream); 989 while (c != EOF) { 990 if (!isspace(c)) { 991 struct token *token = alloc_token(stream); 992 stream->token = token; 993 stream->newline = 0; 994 stream->whitespace = 0; 995 c = get_one_token(c, stream); 996 continue; 997 } 998 stream->whitespace = 1; 999 c = nextchar(stream); 1000 } 1001 return mark_eof(stream); 1002 } 1003 1004 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken) 1005 { 1006 stream_t stream; 1007 struct token *begin; 1008 1009 begin = setup_stream(&stream, 0, -1, buffer, size); 1010 *endtoken = tokenize_stream(&stream); 1011 return begin; 1012 } 1013 1014 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path) 1015 { 1016 struct token *begin, *end; 1017 stream_t stream; 1018 unsigned char buffer[BUFSIZE]; 1019 int idx; 1020 1021 idx = init_stream(name, fd, next_path); 1022 if (idx < 0) { 1023 // info(endtoken->pos, "File %s is const", name); 1024 return endtoken; 1025 } 1026 1027 begin = setup_stream(&stream, idx, fd, buffer, 0); 1028 end = tokenize_stream(&stream); 1029 if (endtoken) 1030 end->next = endtoken; 1031 return begin; 1032 } 1033