1 /* 2 * Copyright (c) 1985 Sun Microsystems, Inc. 3 * Copyright (c) 1980, 1993 4 * The Regents of the University of California. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. All advertising materials mentioning features or use of this software 16 * must display the following acknowledgement: 17 * This product includes software developed by the University of 18 * California, Berkeley and its contributors. 19 * 4. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)lexi.c 8.1 (Berkeley) 6/6/93 36 * $FreeBSD: src/usr.bin/indent/lexi.c,v 1.21 2010/04/15 21:41:07 avg Exp $ 37 */ 38 39 /* 40 * Here we have the token scanner for indent. It scans off one token and puts 41 * it in the global variable "token". It returns a code, indicating the type 42 * of token scanned. 43 */ 44 45 #include <err.h> 46 #include <stdio.h> 47 #include <ctype.h> 48 #include <stdlib.h> 49 #include <string.h> 50 #include "indent_globs.h" 51 #include "indent_codes.h" 52 #include "indent.h" 53 54 #define alphanum 1 55 #define opchar 3 56 57 struct templ { 58 const char *rwd; 59 int rwcode; 60 }; 61 62 struct templ specials[1000] = 63 { 64 {"switch", 1}, 65 {"case", 2}, 66 {"break", 0}, 67 {"struct", 3}, 68 {"union", 3}, 69 {"enum", 3}, 70 {"default", 2}, 71 {"int", 4}, 72 {"char", 4}, 73 {"float", 4}, 74 {"double", 4}, 75 {"long", 4}, 76 {"short", 4}, 77 {"typdef", 4}, 78 {"unsigned", 4}, 79 {"register", 4}, 80 {"static", 4}, 81 {"global", 4}, 82 {"extern", 4}, 83 {"void", 4}, 84 {"const", 4}, 85 {"volatile", 4}, 86 {"goto", 0}, 87 {"return", 0}, 88 {"if", 5}, 89 {"while", 5}, 90 {"for", 5}, 91 {"else", 6}, 92 {"do", 6}, 93 {"sizeof", 7}, 94 {0, 0} 95 }; 96 97 char chartype[128] = 98 { /* this is used to facilitate the decision of 99 * what type (alphanumeric, operator) each 100 * character is */ 101 0, 0, 0, 0, 0, 0, 0, 0, 102 0, 0, 0, 0, 0, 0, 0, 0, 103 0, 0, 0, 0, 0, 0, 0, 0, 104 0, 0, 0, 0, 0, 0, 0, 0, 105 0, 3, 0, 0, 1, 3, 3, 0, 106 0, 0, 3, 3, 0, 3, 0, 3, 107 1, 1, 1, 1, 1, 1, 1, 1, 108 1, 1, 0, 0, 3, 3, 3, 3, 109 0, 1, 1, 1, 1, 1, 1, 1, 110 1, 1, 1, 1, 1, 1, 1, 1, 111 1, 1, 1, 1, 1, 1, 1, 1, 112 1, 1, 1, 0, 0, 0, 3, 1, 113 0, 1, 1, 1, 1, 1, 1, 1, 114 1, 1, 1, 1, 1, 1, 1, 1, 115 1, 1, 1, 1, 1, 1, 1, 1, 116 1, 1, 1, 0, 3, 0, 3, 0 117 }; 118 119 int 120 lexi(void) 121 { 122 int unary_delim; /* this is set to 1 if the current token 123 * forces a following operator to be unary */ 124 static int last_code; /* the last token type returned */ 125 static int l_struct; /* set to 1 if the last token was 'struct' */ 126 int code; /* internal code to be returned */ 127 char qchar; /* the delimiter character for a string */ 128 129 e_token = s_token; /* point to start of place to save token */ 130 unary_delim = false; 131 ps.col_1 = ps.last_nl; /* tell world that this token started in 132 * column 1 iff the last thing scanned was nl */ 133 ps.last_nl = false; 134 135 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 136 ps.col_1 = false; /* leading blanks imply token is not in column 137 * 1 */ 138 if (++buf_ptr >= buf_end) 139 fill_buffer(); 140 } 141 142 /* Scan an alphanumeric token */ 143 if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 144 /* 145 * we have a character or number 146 */ 147 const char *j; /* used for searching thru list of 148 * 149 * reserved words */ 150 struct templ *p; 151 152 if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 153 int seendot = 0, 154 seenexp = 0, 155 seensfx = 0; 156 if (*buf_ptr == '0' && 157 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) { 158 *e_token++ = *buf_ptr++; 159 *e_token++ = *buf_ptr++; 160 while (isxdigit(*buf_ptr)) { 161 CHECK_SIZE_TOKEN; 162 *e_token++ = *buf_ptr++; 163 } 164 } 165 else 166 while (1) { 167 if (*buf_ptr == '.') { 168 if (seendot) 169 break; 170 else 171 seendot++; 172 } 173 CHECK_SIZE_TOKEN; 174 *e_token++ = *buf_ptr++; 175 if (!isdigit(*buf_ptr) && *buf_ptr != '.') { 176 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp) 177 break; 178 else { 179 seenexp++; 180 seendot++; 181 CHECK_SIZE_TOKEN; 182 *e_token++ = *buf_ptr++; 183 if (*buf_ptr == '+' || *buf_ptr == '-') 184 *e_token++ = *buf_ptr++; 185 } 186 } 187 } 188 while (1) { 189 if (!(seensfx & 1) && 190 (*buf_ptr == 'U' || *buf_ptr == 'u')) { 191 CHECK_SIZE_TOKEN; 192 *e_token++ = *buf_ptr++; 193 seensfx |= 1; 194 continue; 195 } 196 if (!(seensfx & 2) && 197 (*buf_ptr == 'L' || *buf_ptr == 'l')) { 198 CHECK_SIZE_TOKEN; 199 if (buf_ptr[1] == buf_ptr[0]) 200 *e_token++ = *buf_ptr++; 201 *e_token++ = *buf_ptr++; 202 seensfx |= 2; 203 continue; 204 } 205 break; 206 } 207 } 208 else 209 while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) { 210 /* fill_buffer() terminates buffer with newline */ 211 if (*buf_ptr == BACKSLASH) { 212 if (*(buf_ptr + 1) == '\n') { 213 buf_ptr += 2; 214 if (buf_ptr >= buf_end) 215 fill_buffer(); 216 } else 217 break; 218 } 219 CHECK_SIZE_TOKEN; 220 /* copy it over */ 221 *e_token++ = *buf_ptr++; 222 if (buf_ptr >= buf_end) 223 fill_buffer(); 224 } 225 *e_token++ = '\0'; 226 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 227 if (++buf_ptr >= buf_end) 228 fill_buffer(); 229 } 230 ps.its_a_keyword = false; 231 ps.sizeof_keyword = false; 232 if (l_struct && !ps.p_l_follow) { 233 /* if last token was 'struct' and we're not 234 * in parentheses, then this token 235 * should be treated as a declaration */ 236 l_struct = false; 237 last_code = ident; 238 ps.last_u_d = true; 239 return (decl); 240 } 241 ps.last_u_d = l_struct; /* Operator after identifier is binary 242 * unless last token was 'struct' */ 243 l_struct = false; 244 last_code = ident; /* Remember that this is the code we will 245 * return */ 246 247 if (auto_typedefs) { 248 const char *q = s_token; 249 size_t q_len = strlen(q); 250 /* Check if we have an "_t" in the end */ 251 if (q_len > 2 && 252 (strcmp(q + q_len - 2, "_t") == 0)) { 253 ps.its_a_keyword = true; 254 ps.last_u_d = true; 255 goto found_auto_typedef; 256 } 257 } 258 259 /* 260 * This loop will check if the token is a keyword. 261 */ 262 for (p = specials; (j = p->rwd) != NULL; p++) { 263 const char *q = s_token; /* point at scanned token */ 264 if (*j++ != *q++ || *j++ != *q++) 265 continue; /* This test depends on the fact that 266 * identifiers are always at least 1 character 267 * long (ie. the first two bytes of the 268 * identifier are always meaningful) */ 269 if (q[-1] == 0) 270 break; /* If its a one-character identifier */ 271 while (*q++ == *j) 272 if (*j++ == 0) 273 goto found_keyword; /* I wish that C had a multi-level 274 * break... */ 275 } 276 if (p->rwd) { /* we have a keyword */ 277 found_keyword: 278 ps.its_a_keyword = true; 279 ps.last_u_d = true; 280 switch (p->rwcode) { 281 case 1: /* it is a switch */ 282 return (swstmt); 283 case 2: /* a case or default */ 284 return (casestmt); 285 286 case 3: /* a "struct" */ 287 /* 288 * Next time around, we will want to know that we have had a 289 * 'struct' 290 */ 291 l_struct = true; 292 /* FALLTHROUGH */ 293 294 case 4: /* one of the declaration keywords */ 295 found_auto_typedef: 296 if (ps.p_l_follow) { 297 ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask; 298 break; /* inside parens: cast, param list or sizeof */ 299 } 300 last_code = decl; 301 return (decl); 302 303 case 5: /* if, while, for */ 304 return (sp_paren); 305 306 case 6: /* do, else */ 307 return (sp_nparen); 308 309 case 7: 310 ps.sizeof_keyword = true; 311 default: /* all others are treated like any other 312 * identifier */ 313 return (ident); 314 } /* end of switch */ 315 } /* end of if (found_it) */ 316 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) { 317 char *tp = buf_ptr; 318 while (tp < buf_end) 319 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 320 goto not_proc; 321 strncpy(ps.procname, token, sizeof ps.procname - 1); 322 ps.in_parameter_declaration = 1; 323 rparen_count = 1; 324 not_proc:; 325 } 326 /* 327 * The following hack attempts to guess whether or not the current 328 * token is in fact a declaration keyword -- one that has been 329 * typedefd 330 */ 331 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_') 332 && !ps.p_l_follow 333 && !ps.block_init 334 && (ps.last_token == rparen || ps.last_token == semicolon || 335 ps.last_token == decl || 336 ps.last_token == lbrace || ps.last_token == rbrace)) { 337 ps.its_a_keyword = true; 338 ps.last_u_d = true; 339 last_code = decl; 340 return decl; 341 } 342 if (last_code == decl) /* if this is a declared variable, then 343 * following sign is unary */ 344 ps.last_u_d = true; /* will make "int a -1" work */ 345 last_code = ident; 346 return (ident); /* the ident is not in the list */ 347 } /* end of procesing for alpanum character */ 348 349 /* Scan a non-alphanumeric token */ 350 351 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 352 * moved here */ 353 *e_token = '\0'; 354 if (++buf_ptr >= buf_end) 355 fill_buffer(); 356 357 switch (*token) { 358 case '\n': 359 unary_delim = ps.last_u_d; 360 ps.last_nl = true; /* remember that we just had a newline */ 361 code = (had_eof ? 0 : newline); 362 363 /* 364 * if data has been exhausted, the newline is a dummy, and we should 365 * return code to stop 366 */ 367 break; 368 369 case '\'': /* start of quoted character */ 370 case '"': /* start of string */ 371 qchar = *token; 372 if (troff) { 373 e_token[-1] = '`'; 374 if (qchar == '"') 375 *e_token++ = '`'; 376 e_token = chfont(&bodyf, &stringf, e_token); 377 } 378 do { /* copy the string */ 379 while (1) { /* move one character or [/<char>]<char> */ 380 if (*buf_ptr == '\n') { 381 diag2(1, "Unterminated literal"); 382 goto stop_lit; 383 } 384 CHECK_SIZE_TOKEN; /* Only have to do this once in this loop, 385 * since CHECK_SIZE guarantees that there 386 * are at least 5 entries left */ 387 *e_token = *buf_ptr++; 388 if (buf_ptr >= buf_end) 389 fill_buffer(); 390 if (*e_token == BACKSLASH) { /* if escape, copy extra char */ 391 if (*buf_ptr == '\n') /* check for escaped newline */ 392 ++line_no; 393 if (troff) { 394 *++e_token = BACKSLASH; 395 if (*buf_ptr == BACKSLASH) 396 *++e_token = BACKSLASH; 397 } 398 *++e_token = *buf_ptr++; 399 ++e_token; /* we must increment this again because we 400 * copied two chars */ 401 if (buf_ptr >= buf_end) 402 fill_buffer(); 403 } 404 else 405 break; /* we copied one character */ 406 } /* end of while (1) */ 407 } while (*e_token++ != qchar); 408 if (troff) { 409 e_token = chfont(&stringf, &bodyf, e_token - 1); 410 if (qchar == '"') 411 *e_token++ = '\''; 412 } 413 stop_lit: 414 code = ident; 415 break; 416 417 case ('('): 418 case ('['): 419 unary_delim = true; 420 code = lparen; 421 break; 422 423 case (')'): 424 case (']'): 425 code = rparen; 426 break; 427 428 case '#': 429 unary_delim = ps.last_u_d; 430 code = preesc; 431 break; 432 433 case '?': 434 unary_delim = true; 435 code = question; 436 break; 437 438 case (':'): 439 code = colon; 440 unary_delim = true; 441 break; 442 443 case (';'): 444 unary_delim = true; 445 code = semicolon; 446 break; 447 448 case ('{'): 449 unary_delim = true; 450 451 /* 452 * if (ps.in_or_st) ps.block_init = 1; 453 */ 454 /* ? code = ps.block_init ? lparen : lbrace; */ 455 code = lbrace; 456 break; 457 458 case ('}'): 459 unary_delim = true; 460 /* ? code = ps.block_init ? rparen : rbrace; */ 461 code = rbrace; 462 break; 463 464 case 014: /* a form feed */ 465 unary_delim = ps.last_u_d; 466 ps.last_nl = true; /* remember this so we can set 'ps.col_1' 467 * right */ 468 code = form_feed; 469 break; 470 471 case (','): 472 unary_delim = true; 473 code = comma; 474 break; 475 476 case '.': 477 unary_delim = false; 478 code = period; 479 break; 480 481 case '-': 482 case '+': /* check for -, +, --, ++ */ 483 code = (ps.last_u_d ? unary_op : binary_op); 484 unary_delim = true; 485 486 if (*buf_ptr == token[0]) { 487 /* check for doubled character */ 488 *e_token++ = *buf_ptr++; 489 /* buffer overflow will be checked at end of loop */ 490 if (last_code == ident || last_code == rparen) { 491 code = (ps.last_u_d ? unary_op : postop); 492 /* check for following ++ or -- */ 493 unary_delim = false; 494 } 495 } 496 else if (*buf_ptr == '=') 497 /* check for operator += */ 498 *e_token++ = *buf_ptr++; 499 else if (*buf_ptr == '>') { 500 /* check for operator -> */ 501 *e_token++ = *buf_ptr++; 502 if (!pointer_as_binop) { 503 unary_delim = false; 504 code = unary_op; 505 ps.want_blank = false; 506 } 507 } 508 break; /* buffer overflow will be checked at end of 509 * switch */ 510 511 case '=': 512 if (ps.in_or_st) 513 ps.block_init = 1; 514 #ifdef undef 515 if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */ 516 e_token[-1] = *buf_ptr++; 517 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr) 518 *e_token++ = *buf_ptr++; 519 *e_token++ = '='; /* Flip =+ to += */ 520 *e_token = 0; 521 } 522 #else 523 if (*buf_ptr == '=') {/* == */ 524 *e_token++ = '='; /* Flip =+ to += */ 525 buf_ptr++; 526 *e_token = 0; 527 } 528 #endif 529 code = binary_op; 530 unary_delim = true; 531 break; 532 /* can drop thru!!! */ 533 534 case '>': 535 case '<': 536 case '!': /* ops like <, <<, <=, !=, etc */ 537 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 538 *e_token++ = *buf_ptr; 539 if (++buf_ptr >= buf_end) 540 fill_buffer(); 541 } 542 if (*buf_ptr == '=') 543 *e_token++ = *buf_ptr++; 544 code = (ps.last_u_d ? unary_op : binary_op); 545 unary_delim = true; 546 break; 547 548 default: 549 if (token[0] == '/' && *buf_ptr == '*') { 550 /* it is start of comment */ 551 *e_token++ = '*'; 552 553 if (++buf_ptr >= buf_end) 554 fill_buffer(); 555 556 code = comment; 557 unary_delim = ps.last_u_d; 558 break; 559 } 560 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 561 /* 562 * handle ||, &&, etc, and also things as in int *****i 563 */ 564 *e_token++ = *buf_ptr; 565 if (++buf_ptr >= buf_end) 566 fill_buffer(); 567 } 568 code = (ps.last_u_d ? unary_op : binary_op); 569 unary_delim = true; 570 571 572 } /* end of switch */ 573 if (code != newline) { 574 l_struct = false; 575 last_code = code; 576 } 577 if (buf_ptr >= buf_end) /* check for input buffer empty */ 578 fill_buffer(); 579 ps.last_u_d = unary_delim; 580 *e_token = '\0'; /* null terminate the token */ 581 return (code); 582 } 583 584 /* 585 * Add the given keyword to the keyword table, using val as the keyword type 586 */ 587 void 588 addkey(char *key, int val) 589 { 590 struct templ *p = specials; 591 while (p->rwd) 592 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0) 593 return; 594 else 595 p++; 596 if (p >= specials + sizeof specials / sizeof specials[0]) 597 return; /* For now, table overflows are silently 598 * ignored */ 599 p->rwd = key; 600 p->rwcode = val; 601 p[1].rwd = NULL; 602 p[1].rwcode = 0; 603 } 604