1 /* 2 * Copyright (c) 1985 Sun Microsystems, Inc. 3 * Copyright (c) 1980, 1993 4 * The Regents of the University of California. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. All advertising materials mentioning features or use of this software 16 * must display the following acknowledgement: 17 * This product includes software developed by the University of 18 * California, Berkeley and its contributors. 19 * 4. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)lexi.c 8.1 (Berkeley) 6/6/93 36 * $FreeBSD: src/usr.bin/indent/lexi.c,v 1.19 2005/11/20 13:48:15 dds Exp $ 37 * $DragonFly: src/usr.bin/indent/lexi.c,v 1.3 2005/04/10 20:55:38 drhodus Exp $ 38 */ 39 40 /* 41 * Here we have the token scanner for indent. It scans off one token and puts 42 * it in the global variable "token". It returns a code, indicating the type 43 * of token scanned. 44 */ 45 46 #include <err.h> 47 #include <stdio.h> 48 #include <ctype.h> 49 #include <stdlib.h> 50 #include <string.h> 51 #include "indent_globs.h" 52 #include "indent_codes.h" 53 #include "indent.h" 54 55 #define alphanum 1 56 #define opchar 3 57 58 struct templ { 59 const char *rwd; 60 int rwcode; 61 }; 62 63 struct templ specials[1000] = 64 { 65 {"switch", 1}, 66 {"case", 2}, 67 {"break", 0}, 68 {"struct", 3}, 69 {"union", 3}, 70 {"enum", 3}, 71 {"default", 2}, 72 {"int", 4}, 73 {"char", 4}, 74 {"float", 4}, 75 {"double", 4}, 76 {"long", 4}, 77 {"short", 4}, 78 {"typdef", 4}, 79 {"unsigned", 4}, 80 {"register", 4}, 81 {"static", 4}, 82 {"global", 4}, 83 {"extern", 4}, 84 {"void", 4}, 85 {"const", 4}, 86 {"volatile", 4}, 87 {"goto", 0}, 88 {"return", 0}, 89 {"if", 5}, 90 {"while", 5}, 91 {"for", 5}, 92 {"else", 6}, 93 {"do", 6}, 94 {"sizeof", 7}, 95 {0, 0} 96 }; 97 98 char chartype[128] = 99 { /* this is used to facilitate the decision of 100 * what type (alphanumeric, operator) each 101 * character is */ 102 0, 0, 0, 0, 0, 0, 0, 0, 103 0, 0, 0, 0, 0, 0, 0, 0, 104 0, 0, 0, 0, 0, 0, 0, 0, 105 0, 0, 0, 0, 0, 0, 0, 0, 106 0, 3, 0, 0, 1, 3, 3, 0, 107 0, 0, 3, 3, 0, 3, 0, 3, 108 1, 1, 1, 1, 1, 1, 1, 1, 109 1, 1, 0, 0, 3, 3, 3, 3, 110 0, 1, 1, 1, 1, 1, 1, 1, 111 1, 1, 1, 1, 1, 1, 1, 1, 112 1, 1, 1, 1, 1, 1, 1, 1, 113 1, 1, 1, 0, 0, 0, 3, 1, 114 0, 1, 1, 1, 1, 1, 1, 1, 115 1, 1, 1, 1, 1, 1, 1, 1, 116 1, 1, 1, 1, 1, 1, 1, 1, 117 1, 1, 1, 0, 3, 0, 3, 0 118 }; 119 120 int 121 lexi(void) 122 { 123 int unary_delim; /* this is set to 1 if the current token 124 * forces a following operator to be unary */ 125 static int last_code; /* the last token type returned */ 126 static int l_struct; /* set to 1 if the last token was 'struct' */ 127 int code; /* internal code to be returned */ 128 char qchar; /* the delimiter character for a string */ 129 130 e_token = s_token; /* point to start of place to save token */ 131 unary_delim = false; 132 ps.col_1 = ps.last_nl; /* tell world that this token started in 133 * column 1 iff the last thing scanned was nl */ 134 ps.last_nl = false; 135 136 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 137 ps.col_1 = false; /* leading blanks imply token is not in column 138 * 1 */ 139 if (++buf_ptr >= buf_end) 140 fill_buffer(); 141 } 142 143 /* Scan an alphanumeric token */ 144 if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 145 /* 146 * we have a character or number 147 */ 148 const char *j; /* used for searching thru list of 149 * 150 * reserved words */ 151 struct templ *p; 152 153 if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 154 int seendot = 0, 155 seenexp = 0, 156 seensfx = 0; 157 if (*buf_ptr == '0' && 158 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) { 159 *e_token++ = *buf_ptr++; 160 *e_token++ = *buf_ptr++; 161 while (isxdigit(*buf_ptr)) { 162 CHECK_SIZE_TOKEN; 163 *e_token++ = *buf_ptr++; 164 } 165 } 166 else 167 while (1) { 168 if (*buf_ptr == '.') { 169 if (seendot) 170 break; 171 else 172 seendot++; 173 } 174 CHECK_SIZE_TOKEN; 175 *e_token++ = *buf_ptr++; 176 if (!isdigit(*buf_ptr) && *buf_ptr != '.') { 177 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp) 178 break; 179 else { 180 seenexp++; 181 seendot++; 182 CHECK_SIZE_TOKEN; 183 *e_token++ = *buf_ptr++; 184 if (*buf_ptr == '+' || *buf_ptr == '-') 185 *e_token++ = *buf_ptr++; 186 } 187 } 188 } 189 while (1) { 190 if (!(seensfx & 1) && 191 (*buf_ptr == 'U' || *buf_ptr == 'u')) { 192 CHECK_SIZE_TOKEN; 193 *e_token++ = *buf_ptr++; 194 seensfx |= 1; 195 continue; 196 } 197 if (!(seensfx & 2) && 198 (*buf_ptr == 'L' || *buf_ptr == 'l')) { 199 CHECK_SIZE_TOKEN; 200 if (buf_ptr[1] == buf_ptr[0]) 201 *e_token++ = *buf_ptr++; 202 *e_token++ = *buf_ptr++; 203 seensfx |= 2; 204 continue; 205 } 206 break; 207 } 208 } 209 else 210 while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) { 211 /* fill_buffer() terminates buffer with newline */ 212 if (*buf_ptr == BACKSLASH) { 213 if (*(buf_ptr + 1) == '\n') { 214 buf_ptr += 2; 215 if (buf_ptr >= buf_end) 216 fill_buffer(); 217 } else 218 break; 219 } 220 CHECK_SIZE_TOKEN; 221 /* copy it over */ 222 *e_token++ = *buf_ptr++; 223 if (buf_ptr >= buf_end) 224 fill_buffer(); 225 } 226 *e_token++ = '\0'; 227 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 228 if (++buf_ptr >= buf_end) 229 fill_buffer(); 230 } 231 ps.its_a_keyword = false; 232 ps.sizeof_keyword = false; 233 if (l_struct && !ps.p_l_follow) { 234 /* if last token was 'struct' and we're not 235 * in parentheses, then this token 236 * should be treated as a declaration */ 237 l_struct = false; 238 last_code = ident; 239 ps.last_u_d = true; 240 return (decl); 241 } 242 ps.last_u_d = l_struct; /* Operator after identifier is binary 243 * unless last token was 'struct' */ 244 l_struct = false; 245 last_code = ident; /* Remember that this is the code we will 246 * return */ 247 248 /* 249 * This loop will check if the token is a keyword. 250 */ 251 for (p = specials; (j = p->rwd) != 0; p++) { 252 const char *q = s_token; /* point at scanned token */ 253 if (*j++ != *q++ || *j++ != *q++) 254 continue; /* This test depends on the fact that 255 * identifiers are always at least 1 character 256 * long (ie. the first two bytes of the 257 * identifier are always meaningful) */ 258 if (q[-1] == 0) 259 break; /* If its a one-character identifier */ 260 while (*q++ == *j) 261 if (*j++ == 0) 262 goto found_keyword; /* I wish that C had a multi-level 263 * break... */ 264 } 265 if (p->rwd) { /* we have a keyword */ 266 found_keyword: 267 ps.its_a_keyword = true; 268 ps.last_u_d = true; 269 switch (p->rwcode) { 270 case 1: /* it is a switch */ 271 return (swstmt); 272 case 2: /* a case or default */ 273 return (casestmt); 274 275 case 3: /* a "struct" */ 276 /* 277 * Next time around, we will want to know that we have had a 278 * 'struct' 279 */ 280 l_struct = true; 281 /* FALLTHROUGH */ 282 283 case 4: /* one of the declaration keywords */ 284 if (ps.p_l_follow) { 285 ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask; 286 break; /* inside parens: cast, param list or sizeof */ 287 } 288 last_code = decl; 289 return (decl); 290 291 case 5: /* if, while, for */ 292 return (sp_paren); 293 294 case 6: /* do, else */ 295 return (sp_nparen); 296 297 case 7: 298 ps.sizeof_keyword = true; 299 default: /* all others are treated like any other 300 * identifier */ 301 return (ident); 302 } /* end of switch */ 303 } /* end of if (found_it) */ 304 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) { 305 char *tp = buf_ptr; 306 while (tp < buf_end) 307 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 308 goto not_proc; 309 strncpy(ps.procname, token, sizeof ps.procname - 1); 310 ps.in_parameter_declaration = 1; 311 rparen_count = 1; 312 not_proc:; 313 } 314 /* 315 * The following hack attempts to guess whether or not the current 316 * token is in fact a declaration keyword -- one that has been 317 * typedefd 318 */ 319 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_') 320 && !ps.p_l_follow 321 && !ps.block_init 322 && (ps.last_token == rparen || ps.last_token == semicolon || 323 ps.last_token == decl || 324 ps.last_token == lbrace || ps.last_token == rbrace)) { 325 ps.its_a_keyword = true; 326 ps.last_u_d = true; 327 last_code = decl; 328 return decl; 329 } 330 if (last_code == decl) /* if this is a declared variable, then 331 * following sign is unary */ 332 ps.last_u_d = true; /* will make "int a -1" work */ 333 last_code = ident; 334 return (ident); /* the ident is not in the list */ 335 } /* end of procesing for alpanum character */ 336 337 /* Scan a non-alphanumeric token */ 338 339 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 340 * moved here */ 341 *e_token = '\0'; 342 if (++buf_ptr >= buf_end) 343 fill_buffer(); 344 345 switch (*token) { 346 case '\n': 347 unary_delim = ps.last_u_d; 348 ps.last_nl = true; /* remember that we just had a newline */ 349 code = (had_eof ? 0 : newline); 350 351 /* 352 * if data has been exhausted, the newline is a dummy, and we should 353 * return code to stop 354 */ 355 break; 356 357 case '\'': /* start of quoted character */ 358 case '"': /* start of string */ 359 qchar = *token; 360 if (troff) { 361 e_token[-1] = '`'; 362 if (qchar == '"') 363 *e_token++ = '`'; 364 e_token = chfont(&bodyf, &stringf, e_token); 365 } 366 do { /* copy the string */ 367 while (1) { /* move one character or [/<char>]<char> */ 368 if (*buf_ptr == '\n') { 369 diag2(1, "Unterminated literal"); 370 goto stop_lit; 371 } 372 CHECK_SIZE_TOKEN; /* Only have to do this once in this loop, 373 * since CHECK_SIZE guarantees that there 374 * are at least 5 entries left */ 375 *e_token = *buf_ptr++; 376 if (buf_ptr >= buf_end) 377 fill_buffer(); 378 if (*e_token == BACKSLASH) { /* if escape, copy extra char */ 379 if (*buf_ptr == '\n') /* check for escaped newline */ 380 ++line_no; 381 if (troff) { 382 *++e_token = BACKSLASH; 383 if (*buf_ptr == BACKSLASH) 384 *++e_token = BACKSLASH; 385 } 386 *++e_token = *buf_ptr++; 387 ++e_token; /* we must increment this again because we 388 * copied two chars */ 389 if (buf_ptr >= buf_end) 390 fill_buffer(); 391 } 392 else 393 break; /* we copied one character */ 394 } /* end of while (1) */ 395 } while (*e_token++ != qchar); 396 if (troff) { 397 e_token = chfont(&stringf, &bodyf, e_token - 1); 398 if (qchar == '"') 399 *e_token++ = '\''; 400 } 401 stop_lit: 402 code = ident; 403 break; 404 405 case ('('): 406 case ('['): 407 unary_delim = true; 408 code = lparen; 409 break; 410 411 case (')'): 412 case (']'): 413 code = rparen; 414 break; 415 416 case '#': 417 unary_delim = ps.last_u_d; 418 code = preesc; 419 break; 420 421 case '?': 422 unary_delim = true; 423 code = question; 424 break; 425 426 case (':'): 427 code = colon; 428 unary_delim = true; 429 break; 430 431 case (';'): 432 unary_delim = true; 433 code = semicolon; 434 break; 435 436 case ('{'): 437 unary_delim = true; 438 439 /* 440 * if (ps.in_or_st) ps.block_init = 1; 441 */ 442 /* ? code = ps.block_init ? lparen : lbrace; */ 443 code = lbrace; 444 break; 445 446 case ('}'): 447 unary_delim = true; 448 /* ? code = ps.block_init ? rparen : rbrace; */ 449 code = rbrace; 450 break; 451 452 case 014: /* a form feed */ 453 unary_delim = ps.last_u_d; 454 ps.last_nl = true; /* remember this so we can set 'ps.col_1' 455 * right */ 456 code = form_feed; 457 break; 458 459 case (','): 460 unary_delim = true; 461 code = comma; 462 break; 463 464 case '.': 465 unary_delim = false; 466 code = period; 467 break; 468 469 case '-': 470 case '+': /* check for -, +, --, ++ */ 471 code = (ps.last_u_d ? unary_op : binary_op); 472 unary_delim = true; 473 474 if (*buf_ptr == token[0]) { 475 /* check for doubled character */ 476 *e_token++ = *buf_ptr++; 477 /* buffer overflow will be checked at end of loop */ 478 if (last_code == ident || last_code == rparen) { 479 code = (ps.last_u_d ? unary_op : postop); 480 /* check for following ++ or -- */ 481 unary_delim = false; 482 } 483 } 484 else if (*buf_ptr == '=') 485 /* check for operator += */ 486 *e_token++ = *buf_ptr++; 487 else if (*buf_ptr == '>') { 488 /* check for operator -> */ 489 *e_token++ = *buf_ptr++; 490 if (!pointer_as_binop) { 491 unary_delim = false; 492 code = unary_op; 493 ps.want_blank = false; 494 } 495 } 496 break; /* buffer overflow will be checked at end of 497 * switch */ 498 499 case '=': 500 if (ps.in_or_st) 501 ps.block_init = 1; 502 #ifdef undef 503 if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */ 504 e_token[-1] = *buf_ptr++; 505 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr) 506 *e_token++ = *buf_ptr++; 507 *e_token++ = '='; /* Flip =+ to += */ 508 *e_token = 0; 509 } 510 #else 511 if (*buf_ptr == '=') {/* == */ 512 *e_token++ = '='; /* Flip =+ to += */ 513 buf_ptr++; 514 *e_token = 0; 515 } 516 #endif 517 code = binary_op; 518 unary_delim = true; 519 break; 520 /* can drop thru!!! */ 521 522 case '>': 523 case '<': 524 case '!': /* ops like <, <<, <=, !=, etc */ 525 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 526 *e_token++ = *buf_ptr; 527 if (++buf_ptr >= buf_end) 528 fill_buffer(); 529 } 530 if (*buf_ptr == '=') 531 *e_token++ = *buf_ptr++; 532 code = (ps.last_u_d ? unary_op : binary_op); 533 unary_delim = true; 534 break; 535 536 default: 537 if (token[0] == '/' && *buf_ptr == '*') { 538 /* it is start of comment */ 539 *e_token++ = '*'; 540 541 if (++buf_ptr >= buf_end) 542 fill_buffer(); 543 544 code = comment; 545 unary_delim = ps.last_u_d; 546 break; 547 } 548 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 549 /* 550 * handle ||, &&, etc, and also things as in int *****i 551 */ 552 *e_token++ = *buf_ptr; 553 if (++buf_ptr >= buf_end) 554 fill_buffer(); 555 } 556 code = (ps.last_u_d ? unary_op : binary_op); 557 unary_delim = true; 558 559 560 } /* end of switch */ 561 if (code != newline) { 562 l_struct = false; 563 last_code = code; 564 } 565 if (buf_ptr >= buf_end) /* check for input buffer empty */ 566 fill_buffer(); 567 ps.last_u_d = unary_delim; 568 *e_token = '\0'; /* null terminate the token */ 569 return (code); 570 } 571 572 /* 573 * Add the given keyword to the keyword table, using val as the keyword type 574 */ 575 void 576 addkey(char *key, int val) 577 { 578 struct templ *p = specials; 579 while (p->rwd) 580 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0) 581 return; 582 else 583 p++; 584 if (p >= specials + sizeof specials / sizeof specials[0]) 585 return; /* For now, table overflows are silently 586 * ignored */ 587 p->rwd = key; 588 p->rwcode = val; 589 p[1].rwd = 0; 590 p[1].rwcode = 0; 591 } 592