1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1985 Sun Microsystems, Inc. 5 * Copyright (c) 1980, 1993 6 * The Regents of the University of California. All rights reserved. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#)lexi.c 8.1 (Berkeley) 6/6/93 34 * $FreeBSD: head/usr.bin/indent/lexi.c 337862 2018-08-15 18:19:45Z pstef $ 35 */ 36 37 /* 38 * Here we have the token scanner for indent. It scans off one token and puts 39 * it in the global variable "token". It returns a code, indicating the type 40 * of token scanned. 41 */ 42 43 #include <err.h> 44 #include <stdio.h> 45 #include <ctype.h> 46 #include <stdlib.h> 47 #include <string.h> 48 #include <sys/param.h> 49 50 #include "indent_globs.h" 51 #include "indent_codes.h" 52 #include "indent.h" 53 54 struct templ { 55 const char *rwd; 56 int rwcode; 57 }; 58 59 /* 60 * This table has to be sorted alphabetically, because it'll be used in binary 61 * search. For the same reason, string must be the first thing in struct templ. 62 */ 63 struct templ specials[] = 64 { 65 {"_Bool", 4}, 66 {"_Complex", 4}, 67 {"_Imaginary", 4}, 68 {"auto", 10}, 69 {"bool", 4}, 70 {"break", 9}, 71 {"case", 8}, 72 {"char", 4}, 73 {"complex", 4}, 74 {"const", 4}, 75 {"continue", 12}, 76 {"default", 8}, 77 {"do", 6}, 78 {"double", 4}, 79 {"else", 6}, 80 {"enum", 3}, 81 {"extern", 10}, 82 {"float", 4}, 83 {"for", 5}, 84 {"global", 4}, 85 {"goto", 9}, 86 {"if", 5}, 87 {"imaginary", 4}, 88 {"inline", 12}, 89 {"int", 4}, 90 {"long", 4}, 91 {"offsetof", 1}, 92 {"register", 10}, 93 {"restrict", 12}, 94 {"return", 9}, 95 {"short", 4}, 96 {"signed", 4}, 97 {"sizeof", 2}, 98 {"static", 10}, 99 {"struct", 3}, 100 {"switch", 7}, 101 {"typedef", 11}, 102 {"union", 3}, 103 {"unsigned", 4}, 104 {"void", 4}, 105 {"volatile", 4}, 106 {"while", 5} 107 }; 108 109 const char **typenames; 110 int typename_count; 111 int typename_top = -1; 112 113 /* 114 * The transition table below was rewritten by hand from lx's output, given 115 * the following definitions. lx is Katherine Flavel's lexer generator. 116 * 117 * O = /[0-7]/; D = /[0-9]/; NZ = /[1-9]/; 118 * H = /[a-f0-9]/i; B = /[0-1]/; HP = /0x/i; 119 * BP = /0b/i; E = /e[+\-]?/i D+; P = /p[+\-]?/i D+; 120 * FS = /[fl]/i; IS = /u/i /(l|L|ll|LL)/? | /(l|L|ll|LL)/ /u/i?; 121 * 122 * D+ E FS? -> $float; 123 * D* "." D+ E? FS? -> $float; 124 * D+ "." E? FS? -> $float; HP H+ IS? -> $int; 125 * HP H+ P FS? -> $float; NZ D* IS? -> $int; 126 * HP H* "." H+ P FS? -> $float; "0" O* IS? -> $int; 127 * HP H+ "." P FS -> $float; BP B+ IS? -> $int; 128 */ 129 static char const *table[] = { 130 /* examples: 131 00 132 s 0xx 133 t 00xaa 134 a 11 101100xxa.. 135 r 11ee0001101lbuuxx.a.pp 136 t.01.e+008bLuxll0Ll.aa.p+0 137 states: ABCDEFGHIJKLMNOPQRSTUVWXYZ */ 138 ['0'] = "CEIDEHHHIJQ U Q VUVVZZZ", 139 ['1'] = "DEIDEHHHIJQ U Q VUVVZZZ", 140 ['7'] = "DEIDEHHHIJ U VUVVZZZ", 141 ['9'] = "DEJDEHHHJJ U VUVVZZZ", 142 ['a'] = " U VUVV ", 143 ['b'] = " K U VUVV ", 144 ['e'] = " FFF FF U VUVV ", 145 ['f'] = " f f U VUVV f", 146 ['u'] = " MM M i iiM M ", 147 ['x'] = " N ", 148 ['p'] = " FFX ", 149 ['L'] = " LLf fL PR Li L f", 150 ['l'] = " OOf fO S P O i O f", 151 ['+'] = " G Y ", 152 ['.'] = "B EE EE T W ", 153 /* ABCDEFGHIJKLMNOPQRSTUVWXYZ */ 154 [0] = "uuiifuufiuuiiuiiiiiuiuuuuu", 155 }; 156 157 static int 158 strcmp_type(const void *e1, const void *e2) 159 { 160 return (strcmp(e1, *(const char * const *)e2)); 161 } 162 163 int 164 lexi(struct parser_state *state) 165 { 166 int unary_delim; /* this is set to 1 if the current token 167 * forces a following operator to be unary */ 168 int code; /* internal code to be returned */ 169 char qchar; /* the delimiter character for a string */ 170 171 e_token = s_token; /* point to start of place to save token */ 172 unary_delim = false; 173 state->col_1 = state->last_nl; /* tell world that this token started 174 * in column 1 iff the last thing 175 * scanned was a newline */ 176 state->last_nl = false; 177 178 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 179 state->col_1 = false; /* leading blanks imply token is not in column 180 * 1 */ 181 if (++buf_ptr >= buf_end) 182 fill_buffer(); 183 } 184 185 /* Scan an alphanumeric token */ 186 if (isalnum((unsigned char)*buf_ptr) || 187 *buf_ptr == '_' || *buf_ptr == '$' || 188 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) { 189 /* 190 * we have a character or number 191 */ 192 struct templ *p; 193 194 if (isdigit((unsigned char)*buf_ptr) || 195 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) { 196 char s; 197 unsigned char i; 198 199 for (s = 'A'; s != 'f' && s != 'i' && s != 'u'; ) { 200 i = (unsigned char)*buf_ptr; 201 if (i >= nitems(table) || table[i] == NULL || 202 table[i][s - 'A'] == ' ') { 203 s = table[0][s - 'A']; 204 break; 205 } 206 s = table[i][s - 'A']; 207 CHECK_SIZE_TOKEN(1); 208 *e_token++ = *buf_ptr++; 209 if (buf_ptr >= buf_end) 210 fill_buffer(); 211 } 212 /* s now indicates the type: f(loating), i(integer), u(nknown) */ 213 } 214 else 215 while (isalnum((unsigned char)*buf_ptr) || 216 *buf_ptr == BACKSLASH || 217 *buf_ptr == '_' || *buf_ptr == '$') { 218 /* fill_buffer() terminates buffer with newline */ 219 if (*buf_ptr == BACKSLASH) { 220 if (*(buf_ptr + 1) == '\n') { 221 buf_ptr += 2; 222 if (buf_ptr >= buf_end) 223 fill_buffer(); 224 } else 225 break; 226 } 227 CHECK_SIZE_TOKEN(1); 228 /* copy it over */ 229 *e_token++ = *buf_ptr++; 230 if (buf_ptr >= buf_end) 231 fill_buffer(); 232 } 233 *e_token = '\0'; 234 235 if (s_token[0] == 'L' && s_token[1] == '\0' && 236 (*buf_ptr == '"' || *buf_ptr == '\'')) 237 return (strpfx); 238 239 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 240 if (++buf_ptr >= buf_end) 241 fill_buffer(); 242 } 243 state->keyword = 0; 244 if (state->last_token == structure && !state->p_l_follow) { 245 /* if last token was 'struct' and we're not 246 * in parentheses, then this token 247 * should be treated as a declaration */ 248 state->last_u_d = true; 249 return (decl); 250 } 251 /* 252 * Operator after identifier is binary unless last token was 'struct' 253 */ 254 state->last_u_d = (state->last_token == structure); 255 256 p = bsearch(s_token, 257 specials, 258 sizeof(specials) / sizeof(specials[0]), 259 sizeof(specials[0]), 260 strcmp_type); 261 if (p == NULL) { /* not a special keyword... */ 262 char *u; 263 264 /* ... so maybe a type_t or a typedef */ 265 if ((opt.auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) && 266 strcmp(u, "_t") == 0) || (typename_top >= 0 && 267 bsearch(s_token, typenames, typename_top + 1, 268 sizeof(typenames[0]), strcmp_type))) { 269 state->keyword = 4; /* a type name */ 270 state->last_u_d = true; 271 goto found_typename; 272 } 273 } else { /* we have a keyword */ 274 state->keyword = p->rwcode; 275 state->last_u_d = true; 276 switch (p->rwcode) { 277 case 7: /* it is a switch */ 278 return (swstmt); 279 case 8: /* a case or default */ 280 return (casestmt); 281 282 case 3: /* a "struct" */ 283 /* FALLTHROUGH */ 284 case 4: /* one of the declaration keywords */ 285 found_typename: 286 if (state->p_l_follow) { 287 /* inside parens: cast, param list, offsetof or sizeof */ 288 state->cast_mask |= (1 << state->p_l_follow) & ~state->not_cast_mask; 289 } 290 if (state->last_token == period || state->last_token == unary_op) { 291 state->keyword = 0; 292 break; 293 } 294 if (p != NULL && p->rwcode == 3) 295 return (structure); 296 if (state->p_l_follow) 297 break; 298 return (decl); 299 300 case 5: /* if, while, for */ 301 return (sp_paren); 302 303 case 6: /* do, else */ 304 return (sp_nparen); 305 306 case 10: /* storage class specifier */ 307 return (storage); 308 309 case 11: /* typedef */ 310 return (type_def); 311 312 /* FALLTHROUGH */ 313 default: /* all others are treated like any other 314 * identifier */ 315 return (ident); 316 } /* end of switch */ 317 } /* end of if (found_it) */ 318 if (*buf_ptr == '(' && state->tos <= 1 && state->ind_level == 0 && 319 state->in_parameter_declaration == 0 && state->block_init == 0) { 320 char *tp = buf_ptr; 321 while (tp < buf_end) 322 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 323 goto not_proc; 324 strncpy(state->procname, token, sizeof state->procname - 1); 325 if (state->in_decl) 326 state->in_parameter_declaration = 1; 327 return (funcname); 328 not_proc:; 329 } 330 /* 331 * The following hack attempts to guess whether or not the current 332 * token is in fact a declaration keyword -- one that has been 333 * typedefd 334 */ 335 else if (!state->p_l_follow && !state->block_init && 336 !state->in_stmt && 337 ((*buf_ptr == '*' && buf_ptr[1] != '=') || 338 isalpha((unsigned char)*buf_ptr)) && 339 (state->last_token == semicolon || state->last_token == lbrace || 340 state->last_token == rbrace)) { 341 state->keyword = 4; /* a type name */ 342 state->last_u_d = true; 343 return decl; 344 } 345 if (state->last_token == decl) /* if this is a declared variable, 346 * then following sign is unary */ 347 state->last_u_d = true; /* will make "int a -1" work */ 348 return (ident); /* the ident is not in the list */ 349 } /* end of procesing for alpanum character */ 350 351 /* Scan a non-alphanumeric token */ 352 353 CHECK_SIZE_TOKEN(3); /* things like "<<=" */ 354 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 355 * moved here */ 356 *e_token = '\0'; 357 if (++buf_ptr >= buf_end) 358 fill_buffer(); 359 360 switch (*token) { 361 case '\n': 362 unary_delim = state->last_u_d; 363 state->last_nl = true; /* remember that we just had a newline */ 364 code = (had_eof ? 0 : newline); 365 366 /* 367 * if data has been exhausted, the newline is a dummy, and we should 368 * return code to stop 369 */ 370 break; 371 372 case '\'': /* start of quoted character */ 373 case '"': /* start of string */ 374 qchar = *token; 375 do { /* copy the string */ 376 while (1) { /* move one character or [/<char>]<char> */ 377 if (*buf_ptr == '\n') { 378 diag2(1, "Unterminated literal"); 379 goto stop_lit; 380 } 381 CHECK_SIZE_TOKEN(2); 382 *e_token = *buf_ptr++; 383 if (buf_ptr >= buf_end) 384 fill_buffer(); 385 if (*e_token == BACKSLASH) { /* if escape, copy extra char */ 386 if (*buf_ptr == '\n') /* check for escaped newline */ 387 ++line_no; 388 *++e_token = *buf_ptr++; 389 ++e_token; /* we must increment this again because we 390 * copied two chars */ 391 if (buf_ptr >= buf_end) 392 fill_buffer(); 393 } 394 else 395 break; /* we copied one character */ 396 } /* end of while (1) */ 397 } while (*e_token++ != qchar); 398 stop_lit: 399 code = ident; 400 break; 401 402 case ('('): 403 case ('['): 404 unary_delim = true; 405 code = lparen; 406 break; 407 408 case (')'): 409 case (']'): 410 code = rparen; 411 break; 412 413 case '#': 414 unary_delim = state->last_u_d; 415 code = preesc; 416 break; 417 418 case '?': 419 unary_delim = true; 420 code = question; 421 break; 422 423 case (':'): 424 code = colon; 425 unary_delim = true; 426 break; 427 428 case (';'): 429 unary_delim = true; 430 code = semicolon; 431 break; 432 433 case ('{'): 434 unary_delim = true; 435 436 /* 437 * if (state->in_or_st) state->block_init = 1; 438 */ 439 /* ? code = state->block_init ? lparen : lbrace; */ 440 code = lbrace; 441 break; 442 443 case ('}'): 444 unary_delim = true; 445 /* ? code = state->block_init ? rparen : rbrace; */ 446 code = rbrace; 447 break; 448 449 case 014: /* a form feed */ 450 unary_delim = state->last_u_d; 451 state->last_nl = true; /* remember this so we can set 'state->col_1' 452 * right */ 453 code = form_feed; 454 break; 455 456 case (','): 457 unary_delim = true; 458 code = comma; 459 break; 460 461 case '.': 462 unary_delim = false; 463 code = period; 464 break; 465 466 case '-': 467 case '+': /* check for -, +, --, ++ */ 468 code = (state->last_u_d ? unary_op : binary_op); 469 unary_delim = true; 470 471 if (*buf_ptr == token[0]) { 472 /* check for doubled character */ 473 *e_token++ = *buf_ptr++; 474 /* buffer overflow will be checked at end of loop */ 475 if (state->last_token == ident || state->last_token == rparen) { 476 code = (state->last_u_d ? unary_op : postop); 477 /* check for following ++ or -- */ 478 unary_delim = false; 479 } 480 } 481 else if (*buf_ptr == '=') 482 /* check for operator += */ 483 *e_token++ = *buf_ptr++; 484 else if (*buf_ptr == '>') { 485 /* check for operator -> */ 486 *e_token++ = *buf_ptr++; 487 unary_delim = false; 488 code = unary_op; 489 state->want_blank = false; 490 } 491 break; /* buffer overflow will be checked at end of 492 * switch */ 493 494 case '=': 495 if (state->in_or_st) 496 state->block_init = 1; 497 if (*buf_ptr == '=') {/* == */ 498 *e_token++ = '='; /* Flip =+ to += */ 499 buf_ptr++; 500 *e_token = 0; 501 } 502 code = binary_op; 503 unary_delim = true; 504 break; 505 /* can drop thru!!! */ 506 507 case '>': 508 case '<': 509 case '!': /* ops like <, <<, <=, !=, etc */ 510 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 511 *e_token++ = *buf_ptr; 512 if (++buf_ptr >= buf_end) 513 fill_buffer(); 514 } 515 if (*buf_ptr == '=') 516 *e_token++ = *buf_ptr++; 517 code = (state->last_u_d ? unary_op : binary_op); 518 unary_delim = true; 519 break; 520 521 case '*': 522 unary_delim = true; 523 if (!state->last_u_d) { 524 if (*buf_ptr == '=') 525 *e_token++ = *buf_ptr++; 526 code = binary_op; 527 break; 528 } 529 while (*buf_ptr == '*' || isspace((unsigned char)*buf_ptr)) { 530 if (*buf_ptr == '*') { 531 CHECK_SIZE_TOKEN(1); 532 *e_token++ = *buf_ptr; 533 } 534 if (++buf_ptr >= buf_end) 535 fill_buffer(); 536 } 537 if (ps.in_decl) { 538 char *tp = buf_ptr; 539 540 while (isalpha((unsigned char)*tp) || 541 isspace((unsigned char)*tp)) { 542 if (++tp >= buf_end) 543 fill_buffer(); 544 } 545 if (*tp == '(') 546 ps.procname[0] = ' '; 547 } 548 code = unary_op; 549 break; 550 551 default: 552 if (token[0] == '/' && *buf_ptr == '*') { 553 /* it is start of comment */ 554 *e_token++ = '*'; 555 556 if (++buf_ptr >= buf_end) 557 fill_buffer(); 558 559 code = comment; 560 unary_delim = state->last_u_d; 561 break; 562 } 563 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 564 /* 565 * handle ||, &&, etc, and also things as in int *****i 566 */ 567 CHECK_SIZE_TOKEN(1); 568 *e_token++ = *buf_ptr; 569 if (++buf_ptr >= buf_end) 570 fill_buffer(); 571 } 572 code = (state->last_u_d ? unary_op : binary_op); 573 unary_delim = true; 574 575 576 } /* end of switch */ 577 if (buf_ptr >= buf_end) /* check for input buffer empty */ 578 fill_buffer(); 579 state->last_u_d = unary_delim; 580 CHECK_SIZE_TOKEN(1); 581 *e_token = '\0'; /* null terminate the token */ 582 return (code); 583 } 584 585 /* Initialize constant transition table */ 586 void 587 init_constant_tt(void) 588 { 589 table['-'] = table['+']; 590 table['8'] = table['9']; 591 table['2'] = table['3'] = table['4'] = table['5'] = table['6'] = table['7']; 592 table['A'] = table['C'] = table['D'] = table['c'] = table['d'] = table['a']; 593 table['B'] = table['b']; 594 table['E'] = table['e']; 595 table['U'] = table['u']; 596 table['X'] = table['x']; 597 table['P'] = table['p']; 598 table['F'] = table['f']; 599 } 600 601 void 602 alloc_typenames(void) 603 { 604 605 typenames = (const char **)malloc(sizeof(typenames[0]) * 606 (typename_count = 16)); 607 if (typenames == NULL) 608 err(1, NULL); 609 } 610 611 void 612 add_typename(const char *key) 613 { 614 int comparison; 615 const char *copy; 616 617 if (typename_top + 1 >= typename_count) { 618 typenames = realloc((void *)typenames, 619 sizeof(typenames[0]) * (typename_count *= 2)); 620 if (typenames == NULL) 621 err(1, NULL); 622 } 623 if (typename_top == -1) 624 typenames[++typename_top] = copy = strdup(key); 625 else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) { 626 /* take advantage of sorted input */ 627 if (comparison == 0) /* remove duplicates */ 628 return; 629 typenames[++typename_top] = copy = strdup(key); 630 } 631 else { 632 int p; 633 634 for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++) 635 /* find place for the new key */; 636 if (comparison == 0) /* remove duplicates */ 637 return; 638 memmove(&typenames[p + 1], &typenames[p], 639 sizeof(typenames[0]) * (++typename_top - p)); 640 typenames[p] = copy = strdup(key); 641 } 642 643 if (copy == NULL) 644 err(1, NULL); 645 } 646