1 /* 2 * Copyright (c) 1985 Sun Microsystems, Inc. 3 * Copyright (c) 1980, 1993 4 * The Regents of the University of California. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 * 31 * @(#)lexi.c 8.1 (Berkeley) 6/6/93 32 * $FreeBSD: src/usr.bin/indent/lexi.c,v 1.21 2010/04/15 21:41:07 avg Exp $ 33 */ 34 35 /* 36 * Here we have the token scanner for indent. It scans off one token and puts 37 * it in the global variable "token". It returns a code, indicating the type 38 * of token scanned. 39 */ 40 41 #include <err.h> 42 #include <stdio.h> 43 #include <ctype.h> 44 #include <stdlib.h> 45 #include <string.h> 46 #include "indent_globs.h" 47 #include "indent_codes.h" 48 #include "indent.h" 49 50 #define alphanum 1 51 #define opchar 3 52 53 struct templ { 54 const char *rwd; 55 int rwcode; 56 }; 57 58 struct templ specials[1000] = 59 { 60 {"switch", 1}, 61 {"case", 2}, 62 {"break", 0}, 63 {"struct", 3}, 64 {"union", 3}, 65 {"enum", 3}, 66 {"default", 2}, 67 {"int", 4}, 68 {"char", 4}, 69 {"float", 4}, 70 {"double", 4}, 71 {"long", 4}, 72 {"short", 4}, 73 {"typdef", 4}, 74 {"unsigned", 4}, 75 {"register", 4}, 76 {"static", 4}, 77 {"global", 4}, 78 {"extern", 4}, 79 {"void", 4}, 80 {"const", 4}, 81 {"volatile", 4}, 82 {"goto", 0}, 83 {"return", 0}, 84 {"if", 5}, 85 {"while", 5}, 86 {"for", 5}, 87 {"else", 6}, 88 {"do", 6}, 89 {"sizeof", 7}, 90 {0, 0} 91 }; 92 93 char chartype[128] = 94 { /* this is used to facilitate the decision of 95 * what type (alphanumeric, operator) each 96 * character is */ 97 0, 0, 0, 0, 0, 0, 0, 0, 98 0, 0, 0, 0, 0, 0, 0, 0, 99 0, 0, 0, 0, 0, 0, 0, 0, 100 0, 0, 0, 0, 0, 0, 0, 0, 101 0, 3, 0, 0, 1, 3, 3, 0, 102 0, 0, 3, 3, 0, 3, 0, 3, 103 1, 1, 1, 1, 1, 1, 1, 1, 104 1, 1, 0, 0, 3, 3, 3, 3, 105 0, 1, 1, 1, 1, 1, 1, 1, 106 1, 1, 1, 1, 1, 1, 1, 1, 107 1, 1, 1, 1, 1, 1, 1, 1, 108 1, 1, 1, 0, 0, 0, 3, 1, 109 0, 1, 1, 1, 1, 1, 1, 1, 110 1, 1, 1, 1, 1, 1, 1, 1, 111 1, 1, 1, 1, 1, 1, 1, 1, 112 1, 1, 1, 0, 3, 0, 3, 0 113 }; 114 115 int 116 lexi(void) 117 { 118 int unary_delim; /* this is set to 1 if the current token 119 * forces a following operator to be unary */ 120 static int last_code; /* the last token type returned */ 121 static int l_struct; /* set to 1 if the last token was 'struct' */ 122 int code; /* internal code to be returned */ 123 char qchar; /* the delimiter character for a string */ 124 125 e_token = s_token; /* point to start of place to save token */ 126 unary_delim = false; 127 ps.col_1 = ps.last_nl; /* tell world that this token started in 128 * column 1 iff the last thing scanned was nl */ 129 ps.last_nl = false; 130 131 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 132 ps.col_1 = false; /* leading blanks imply token is not in column 133 * 1 */ 134 if (++buf_ptr >= buf_end) 135 fill_buffer(); 136 } 137 138 /* Scan an alphanumeric token */ 139 if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 140 /* 141 * we have a character or number 142 */ 143 const char *j; /* used for searching thru list of 144 * 145 * reserved words */ 146 struct templ *p; 147 148 if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 149 int seendot = 0, 150 seenexp = 0, 151 seensfx = 0; 152 if (*buf_ptr == '0' && 153 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) { 154 *e_token++ = *buf_ptr++; 155 *e_token++ = *buf_ptr++; 156 while (isxdigit(*buf_ptr)) { 157 CHECK_SIZE_TOKEN; 158 *e_token++ = *buf_ptr++; 159 } 160 } 161 else 162 while (1) { 163 if (*buf_ptr == '.') { 164 if (seendot) 165 break; 166 else 167 seendot++; 168 } 169 CHECK_SIZE_TOKEN; 170 *e_token++ = *buf_ptr++; 171 if (!isdigit(*buf_ptr) && *buf_ptr != '.') { 172 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp) 173 break; 174 else { 175 seenexp++; 176 seendot++; 177 CHECK_SIZE_TOKEN; 178 *e_token++ = *buf_ptr++; 179 if (*buf_ptr == '+' || *buf_ptr == '-') 180 *e_token++ = *buf_ptr++; 181 } 182 } 183 } 184 while (1) { 185 if (!(seensfx & 1) && 186 (*buf_ptr == 'U' || *buf_ptr == 'u')) { 187 CHECK_SIZE_TOKEN; 188 *e_token++ = *buf_ptr++; 189 seensfx |= 1; 190 continue; 191 } 192 if (!(seensfx & 2) && 193 (*buf_ptr == 'L' || *buf_ptr == 'l')) { 194 CHECK_SIZE_TOKEN; 195 if (buf_ptr[1] == buf_ptr[0]) 196 *e_token++ = *buf_ptr++; 197 *e_token++ = *buf_ptr++; 198 seensfx |= 2; 199 continue; 200 } 201 break; 202 } 203 } 204 else 205 while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) { 206 /* fill_buffer() terminates buffer with newline */ 207 if (*buf_ptr == BACKSLASH) { 208 if (*(buf_ptr + 1) == '\n') { 209 buf_ptr += 2; 210 if (buf_ptr >= buf_end) 211 fill_buffer(); 212 } else 213 break; 214 } 215 CHECK_SIZE_TOKEN; 216 /* copy it over */ 217 *e_token++ = *buf_ptr++; 218 if (buf_ptr >= buf_end) 219 fill_buffer(); 220 } 221 *e_token++ = '\0'; 222 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 223 if (++buf_ptr >= buf_end) 224 fill_buffer(); 225 } 226 ps.its_a_keyword = false; 227 ps.sizeof_keyword = false; 228 if (l_struct && !ps.p_l_follow) { 229 /* if last token was 'struct' and we're not 230 * in parentheses, then this token 231 * should be treated as a declaration */ 232 l_struct = false; 233 last_code = ident; 234 ps.last_u_d = true; 235 return (decl); 236 } 237 ps.last_u_d = l_struct; /* Operator after identifier is binary 238 * unless last token was 'struct' */ 239 l_struct = false; 240 last_code = ident; /* Remember that this is the code we will 241 * return */ 242 243 if (auto_typedefs) { 244 const char *q = s_token; 245 size_t q_len = strlen(q); 246 /* Check if we have an "_t" in the end */ 247 if (q_len > 2 && 248 (strcmp(q + q_len - 2, "_t") == 0)) { 249 ps.its_a_keyword = true; 250 ps.last_u_d = true; 251 goto found_auto_typedef; 252 } 253 } 254 255 /* 256 * This loop will check if the token is a keyword. 257 */ 258 for (p = specials; (j = p->rwd) != NULL; p++) { 259 const char *q = s_token; /* point at scanned token */ 260 if (*j++ != *q++ || *j++ != *q++) 261 continue; /* This test depends on the fact that 262 * identifiers are always at least 1 character 263 * long (ie. the first two bytes of the 264 * identifier are always meaningful) */ 265 if (q[-1] == 0) 266 break; /* If its a one-character identifier */ 267 while (*q++ == *j) 268 if (*j++ == 0) 269 goto found_keyword; /* I wish that C had a multi-level 270 * break... */ 271 } 272 if (p->rwd) { /* we have a keyword */ 273 found_keyword: 274 ps.its_a_keyword = true; 275 ps.last_u_d = true; 276 switch (p->rwcode) { 277 case 1: /* it is a switch */ 278 return (swstmt); 279 case 2: /* a case or default */ 280 return (casestmt); 281 282 case 3: /* a "struct" */ 283 /* 284 * Next time around, we will want to know that we have had a 285 * 'struct' 286 */ 287 l_struct = true; 288 /* FALLTHROUGH */ 289 290 case 4: /* one of the declaration keywords */ 291 found_auto_typedef: 292 if (ps.p_l_follow) { 293 ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask; 294 break; /* inside parens: cast, param list or sizeof */ 295 } 296 last_code = decl; 297 return (decl); 298 299 case 5: /* if, while, for */ 300 return (sp_paren); 301 302 case 6: /* do, else */ 303 return (sp_nparen); 304 305 case 7: 306 ps.sizeof_keyword = true; 307 /* FALLTHROUGH */ 308 default: /* all others are treated like any other 309 * identifier */ 310 return (ident); 311 } /* end of switch */ 312 } /* end of if (found_it) */ 313 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) { 314 char *tp = buf_ptr; 315 while (tp < buf_end) 316 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 317 goto not_proc; 318 strncpy(ps.procname, token, sizeof ps.procname - 1); 319 ps.in_parameter_declaration = 1; 320 rparen_count = 1; 321 not_proc:; 322 } 323 /* 324 * The following hack attempts to guess whether or not the current 325 * token is in fact a declaration keyword -- one that has been 326 * typedefd 327 */ 328 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_') 329 && !ps.p_l_follow 330 && !ps.block_init 331 && (ps.last_token == rparen || ps.last_token == semicolon || 332 ps.last_token == decl || 333 ps.last_token == lbrace || ps.last_token == rbrace)) { 334 ps.its_a_keyword = true; 335 ps.last_u_d = true; 336 last_code = decl; 337 return decl; 338 } 339 if (last_code == decl) /* if this is a declared variable, then 340 * following sign is unary */ 341 ps.last_u_d = true; /* will make "int a -1" work */ 342 last_code = ident; 343 return (ident); /* the ident is not in the list */ 344 } /* end of procesing for alpanum character */ 345 346 /* Scan a non-alphanumeric token */ 347 348 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 349 * moved here */ 350 *e_token = '\0'; 351 if (++buf_ptr >= buf_end) 352 fill_buffer(); 353 354 switch (*token) { 355 case '\n': 356 unary_delim = ps.last_u_d; 357 ps.last_nl = true; /* remember that we just had a newline */ 358 code = (had_eof ? 0 : newline); 359 360 /* 361 * if data has been exhausted, the newline is a dummy, and we should 362 * return code to stop 363 */ 364 break; 365 366 case '\'': /* start of quoted character */ 367 case '"': /* start of string */ 368 qchar = *token; 369 if (troff) { 370 e_token[-1] = '`'; 371 if (qchar == '"') 372 *e_token++ = '`'; 373 e_token = chfont(&bodyf, &stringf, e_token); 374 } 375 do { /* copy the string */ 376 while (1) { /* move one character or [/<char>]<char> */ 377 if (*buf_ptr == '\n') { 378 diag2(1, "Unterminated literal"); 379 goto stop_lit; 380 } 381 CHECK_SIZE_TOKEN; /* Only have to do this once in this loop, 382 * since CHECK_SIZE guarantees that there 383 * are at least 5 entries left */ 384 *e_token = *buf_ptr++; 385 if (buf_ptr >= buf_end) 386 fill_buffer(); 387 if (*e_token == BACKSLASH) { /* if escape, copy extra char */ 388 if (*buf_ptr == '\n') /* check for escaped newline */ 389 ++line_no; 390 if (troff) { 391 *++e_token = BACKSLASH; 392 if (*buf_ptr == BACKSLASH) 393 *++e_token = BACKSLASH; 394 } 395 *++e_token = *buf_ptr++; 396 ++e_token; /* we must increment this again because we 397 * copied two chars */ 398 if (buf_ptr >= buf_end) 399 fill_buffer(); 400 } 401 else 402 break; /* we copied one character */ 403 } /* end of while (1) */ 404 } while (*e_token++ != qchar); 405 if (troff) { 406 e_token = chfont(&stringf, &bodyf, e_token - 1); 407 if (qchar == '"') 408 *e_token++ = '\''; 409 } 410 stop_lit: 411 code = ident; 412 break; 413 414 case ('('): 415 case ('['): 416 unary_delim = true; 417 code = lparen; 418 break; 419 420 case (')'): 421 case (']'): 422 code = rparen; 423 break; 424 425 case '#': 426 unary_delim = ps.last_u_d; 427 code = preesc; 428 break; 429 430 case '?': 431 unary_delim = true; 432 code = question; 433 break; 434 435 case (':'): 436 code = colon; 437 unary_delim = true; 438 break; 439 440 case (';'): 441 unary_delim = true; 442 code = semicolon; 443 break; 444 445 case ('{'): 446 unary_delim = true; 447 448 /* 449 * if (ps.in_or_st) ps.block_init = 1; 450 */ 451 /* ? code = ps.block_init ? lparen : lbrace; */ 452 code = lbrace; 453 break; 454 455 case ('}'): 456 unary_delim = true; 457 /* ? code = ps.block_init ? rparen : rbrace; */ 458 code = rbrace; 459 break; 460 461 case 014: /* a form feed */ 462 unary_delim = ps.last_u_d; 463 ps.last_nl = true; /* remember this so we can set 'ps.col_1' 464 * right */ 465 code = form_feed; 466 break; 467 468 case (','): 469 unary_delim = true; 470 code = comma; 471 break; 472 473 case '.': 474 unary_delim = false; 475 code = period; 476 break; 477 478 case '-': 479 case '+': /* check for -, +, --, ++ */ 480 code = (ps.last_u_d ? unary_op : binary_op); 481 unary_delim = true; 482 483 if (*buf_ptr == token[0]) { 484 /* check for doubled character */ 485 *e_token++ = *buf_ptr++; 486 /* buffer overflow will be checked at end of loop */ 487 if (last_code == ident || last_code == rparen) { 488 code = (ps.last_u_d ? unary_op : postop); 489 /* check for following ++ or -- */ 490 unary_delim = false; 491 } 492 } 493 else if (*buf_ptr == '=') 494 /* check for operator += */ 495 *e_token++ = *buf_ptr++; 496 else if (*buf_ptr == '>') { 497 /* check for operator -> */ 498 *e_token++ = *buf_ptr++; 499 if (!pointer_as_binop) { 500 unary_delim = false; 501 code = unary_op; 502 ps.want_blank = false; 503 } 504 } 505 break; /* buffer overflow will be checked at end of 506 * switch */ 507 508 case '=': 509 if (ps.in_or_st) 510 ps.block_init = 1; 511 #ifdef undef 512 if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */ 513 e_token[-1] = *buf_ptr++; 514 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr) 515 *e_token++ = *buf_ptr++; 516 *e_token++ = '='; /* Flip =+ to += */ 517 *e_token = 0; 518 } 519 #else 520 if (*buf_ptr == '=') {/* == */ 521 *e_token++ = '='; /* Flip =+ to += */ 522 buf_ptr++; 523 *e_token = 0; 524 } 525 #endif 526 code = binary_op; 527 unary_delim = true; 528 break; 529 /* can drop thru!!! */ 530 531 case '>': 532 case '<': 533 case '!': /* ops like <, <<, <=, !=, etc */ 534 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 535 *e_token++ = *buf_ptr; 536 if (++buf_ptr >= buf_end) 537 fill_buffer(); 538 } 539 if (*buf_ptr == '=') 540 *e_token++ = *buf_ptr++; 541 code = (ps.last_u_d ? unary_op : binary_op); 542 unary_delim = true; 543 break; 544 545 default: 546 if (token[0] == '/' && *buf_ptr == '*') { 547 /* it is start of comment */ 548 *e_token++ = '*'; 549 550 if (++buf_ptr >= buf_end) 551 fill_buffer(); 552 553 code = comment; 554 unary_delim = ps.last_u_d; 555 break; 556 } 557 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 558 /* 559 * handle ||, &&, etc, and also things as in int *****i 560 */ 561 *e_token++ = *buf_ptr; 562 if (++buf_ptr >= buf_end) 563 fill_buffer(); 564 } 565 code = (ps.last_u_d ? unary_op : binary_op); 566 unary_delim = true; 567 568 569 } /* end of switch */ 570 if (code != newline) { 571 l_struct = false; 572 last_code = code; 573 } 574 if (buf_ptr >= buf_end) /* check for input buffer empty */ 575 fill_buffer(); 576 ps.last_u_d = unary_delim; 577 *e_token = '\0'; /* null terminate the token */ 578 return (code); 579 } 580 581 /* 582 * Add the given keyword to the keyword table, using val as the keyword type 583 */ 584 void 585 addkey(char *key, int val) 586 { 587 struct templ *p = specials; 588 while (p->rwd) 589 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0) 590 return; 591 else 592 p++; 593 if (p >= specials + sizeof specials / sizeof specials[0]) 594 return; /* For now, table overflows are silently 595 * ignored */ 596 p->rwd = key; 597 p->rwcode = val; 598 p[1].rwd = NULL; 599 p[1].rwcode = 0; 600 } 601