1 /* $OpenBSD: lexi.c,v 1.15 2009/10/27 23:59:39 deraadt Exp $ */ 2 3 /* 4 * Copyright (c) 1980, 1993 5 * The Regents of the University of California. 6 * Copyright (c) 1976 Board of Trustees of the University of Illinois. 7 * Copyright (c) 1985 Sun Microsystems, Inc. 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * Here we have the token scanner for indent. It scans off one token and puts 37 * it in the global variable "token". It returns a code, indicating the type 38 * of token scanned. 39 */ 40 41 #include <stdio.h> 42 #include <ctype.h> 43 #include <stdlib.h> 44 #include <string.h> 45 #include <err.h> 46 #include "indent_globs.h" 47 #include "indent_codes.h" 48 49 #define alphanum 1 50 #define opchar 3 51 52 struct templ { 53 char *rwd; 54 int rwcode; 55 }; 56 57 struct templ specialsinit[] = { 58 { "switch", 1 }, 59 { "case", 2 }, 60 { "break", 0 }, 61 { "struct", 3 }, 62 { "union", 3 }, 63 { "enum", 3 }, 64 { "default", 2 }, 65 { "int", 4 }, 66 { "char", 4 }, 67 { "float", 4 }, 68 { "double", 4 }, 69 { "long", 4 }, 70 { "short", 4 }, 71 { "typdef", 4 }, 72 { "unsigned", 4 }, 73 { "register", 4 }, 74 { "static", 4 }, 75 { "global", 4 }, 76 { "extern", 4 }, 77 { "void", 4 }, 78 { "goto", 0 }, 79 { "return", 0 }, 80 { "if", 5 }, 81 { "while", 5 }, 82 { "for", 5 }, 83 { "else", 6 }, 84 { "do", 6 }, 85 { "sizeof", 7 }, 86 }; 87 88 struct templ *specials = specialsinit; 89 int nspecials = sizeof(specialsinit) / sizeof(specialsinit[0]); 90 int maxspecials; 91 92 char chartype[128] = 93 { /* this is used to facilitate the decision of 94 * what type (alphanumeric, operator) each 95 * character is */ 96 0, 0, 0, 0, 0, 0, 0, 0, 97 0, 0, 0, 0, 0, 0, 0, 0, 98 0, 0, 0, 0, 0, 0, 0, 0, 99 0, 0, 0, 0, 0, 0, 0, 0, 100 0, 3, 0, 0, 1, 3, 3, 0, 101 0, 0, 3, 3, 0, 3, 0, 3, 102 1, 1, 1, 1, 1, 1, 1, 1, 103 1, 1, 0, 0, 3, 3, 3, 3, 104 0, 1, 1, 1, 1, 1, 1, 1, 105 1, 1, 1, 1, 1, 1, 1, 1, 106 1, 1, 1, 1, 1, 1, 1, 1, 107 1, 1, 1, 0, 0, 0, 3, 1, 108 0, 1, 1, 1, 1, 1, 1, 1, 109 1, 1, 1, 1, 1, 1, 1, 1, 110 1, 1, 1, 1, 1, 1, 1, 1, 111 1, 1, 1, 0, 3, 0, 3, 0 112 }; 113 114 115 116 117 int 118 lexi(void) 119 { 120 int unary_delim; /* this is set to 1 if the current token 121 * forces a following operator to be unary */ 122 static int last_code; /* the last token type returned */ 123 static int l_struct; /* set to 1 if the last token was 'struct' */ 124 int code; /* internal code to be returned */ 125 char qchar; /* the delimiter character for a string */ 126 int i; 127 128 e_token = s_token; /* point to start of place to save token */ 129 unary_delim = false; 130 ps.col_1 = ps.last_nl; /* tell world that this token started in 131 * column 1 iff the last thing scanned was nl */ 132 ps.last_nl = false; 133 134 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 135 ps.col_1 = false; /* leading blanks imply token is not in column 136 * 1 */ 137 if (++buf_ptr >= buf_end) 138 fill_buffer(); 139 } 140 141 /* Scan an alphanumeric token */ 142 if (chartype[(int)*buf_ptr] == alphanum || 143 (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 144 /* 145 * we have a character or number 146 */ 147 char *j; /* used for searching thru list of 148 * reserved words */ 149 if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 150 int seendot = 0, 151 seenexp = 0, 152 seensfx = 0; 153 if (*buf_ptr == '0' && 154 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) { 155 *e_token++ = *buf_ptr++; 156 *e_token++ = *buf_ptr++; 157 while (isxdigit(*buf_ptr)) { 158 CHECK_SIZE_TOKEN; 159 *e_token++ = *buf_ptr++; 160 } 161 } 162 else 163 while (1) { 164 if (*buf_ptr == '.') { 165 if (seendot) 166 break; 167 else 168 seendot++; 169 } 170 CHECK_SIZE_TOKEN; 171 *e_token++ = *buf_ptr++; 172 if (!isdigit(*buf_ptr) && *buf_ptr != '.') { 173 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp) 174 break; 175 else { 176 seenexp++; 177 seendot++; 178 CHECK_SIZE_TOKEN; 179 *e_token++ = *buf_ptr++; 180 if (*buf_ptr == '+' || *buf_ptr == '-') 181 *e_token++ = *buf_ptr++; 182 } 183 } 184 } 185 while (1) { 186 if (!(seensfx & 1) && 187 (*buf_ptr == 'U' || *buf_ptr == 'u')) { 188 CHECK_SIZE_TOKEN; 189 *e_token++ = *buf_ptr++; 190 seensfx |= 1; 191 continue; 192 } 193 if (!(seensfx & 2) && 194 (*buf_ptr == 'L' || *buf_ptr == 'l')) { 195 CHECK_SIZE_TOKEN; 196 if (buf_ptr[1] == buf_ptr[0]) 197 *e_token++ = *buf_ptr++; 198 *e_token++ = *buf_ptr++; 199 seensfx |= 2; 200 continue; 201 } 202 break; 203 } 204 } 205 else 206 while (chartype[(int)*buf_ptr] == alphanum) { /* copy it over */ 207 CHECK_SIZE_TOKEN; 208 *e_token++ = *buf_ptr++; 209 if (buf_ptr >= buf_end) 210 fill_buffer(); 211 } 212 *e_token++ = '\0'; 213 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 214 if (++buf_ptr >= buf_end) 215 fill_buffer(); 216 } 217 ps.its_a_keyword = false; 218 ps.sizeof_keyword = false; 219 if (l_struct) { /* if last token was 'struct', then this token 220 * should be treated as a declaration */ 221 l_struct = false; 222 last_code = ident; 223 ps.last_u_d = true; 224 return (decl); 225 } 226 ps.last_u_d = false; /* Operator after identifier is binary */ 227 last_code = ident; /* Remember that this is the code we will 228 * return */ 229 230 /* 231 * This loop will check if the token is a keyword. 232 */ 233 for (i = 0; i < nspecials; i++) { 234 char *p = s_token; /* point at scanned token */ 235 j = specials[i].rwd; 236 if (*j++ != *p++ || *j++ != *p++) 237 continue; /* This test depends on the fact that 238 * identifiers are always at least 1 character 239 * long (ie. the first two bytes of the 240 * identifier are always meaningful) */ 241 if (p[-1] == 0) 242 break; /* If its a one-character identifier */ 243 while (*p++ == *j) 244 if (*j++ == 0) 245 goto found_keyword; /* I wish that C had a multi-level 246 * break... */ 247 } 248 if (i < nspecials) { /* we have a keyword */ 249 found_keyword: 250 ps.its_a_keyword = true; 251 ps.last_u_d = true; 252 switch (specials[i].rwcode) { 253 case 1: /* it is a switch */ 254 return (swstmt); 255 case 2: /* a case or default */ 256 return (casestmt); 257 258 case 3: /* a "struct" */ 259 if (ps.p_l_follow) 260 break; /* inside parens: cast */ 261 l_struct = true; 262 263 /* 264 * Next time around, we will want to know that we have had a 265 * 'struct' 266 */ 267 case 4: /* one of the declaration keywords */ 268 if (ps.p_l_follow) { 269 ps.cast_mask |= 1 << ps.p_l_follow; 270 break; /* inside parens: cast */ 271 } 272 last_code = decl; 273 return (decl); 274 275 case 5: /* if, while, for */ 276 return (sp_paren); 277 278 case 6: /* do, else */ 279 return (sp_nparen); 280 281 case 7: 282 ps.sizeof_keyword = true; 283 default: /* all others are treated like any other 284 * identifier */ 285 return (ident); 286 } /* end of switch */ 287 } /* end of if (found_it) */ 288 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) { 289 char *tp = buf_ptr; 290 while (tp < buf_end) 291 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 292 goto not_proc; 293 strlcpy(ps.procname, token, sizeof ps.procname); 294 ps.in_parameter_declaration = 1; 295 rparen_count = 1; 296 not_proc:; 297 } 298 /* 299 * The following hack attempts to guess whether or not the current 300 * token is in fact a declaration keyword -- one that has been 301 * typedefd 302 */ 303 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_') 304 && !ps.p_l_follow 305 && !ps.block_init 306 && (ps.last_token == rparen || ps.last_token == semicolon || 307 ps.last_token == decl || 308 ps.last_token == lbrace || ps.last_token == rbrace)) { 309 ps.its_a_keyword = true; 310 ps.last_u_d = true; 311 last_code = decl; 312 return decl; 313 } 314 if (last_code == decl) /* if this is a declared variable, then 315 * following sign is unary */ 316 ps.last_u_d = true; /* will make "int a -1" work */ 317 last_code = ident; 318 return (ident); /* the ident is not in the list */ 319 } /* end of procesing for alpanum character */ 320 321 /* Scan a non-alphanumeric token */ 322 323 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 324 * moved here */ 325 *e_token = '\0'; 326 if (++buf_ptr >= buf_end) 327 fill_buffer(); 328 329 switch (*token) { 330 case '\n': 331 unary_delim = ps.last_u_d; 332 ps.last_nl = true; /* remember that we just had a newline */ 333 code = (had_eof ? 0 : newline); 334 335 /* 336 * if data has been exausted, the newline is a dummy, and we should 337 * return code to stop 338 */ 339 break; 340 341 case '\'': /* start of quoted character */ 342 case '"': /* start of string */ 343 qchar = *token; 344 if (troff) { 345 e_token[-1] = '`'; 346 if (qchar == '"') 347 *e_token++ = '`'; 348 e_token = chfont(&bodyf, &stringf, e_token); 349 } 350 do { /* copy the string */ 351 while (1) { /* move one character or [/<char>]<char> */ 352 if (*buf_ptr == '\n') { 353 printf("%d: Unterminated literal\n", line_no); 354 goto stop_lit; 355 } 356 CHECK_SIZE_TOKEN; /* Only have to do this once in this loop, 357 * since CHECK_SIZE guarantees that there 358 * are at least 5 entries left */ 359 *e_token = *buf_ptr++; 360 if (buf_ptr >= buf_end) 361 fill_buffer(); 362 if (*e_token == BACKSLASH) { /* if escape, copy extra char */ 363 if (*buf_ptr == '\n') /* check for escaped newline */ 364 ++line_no; 365 if (troff) { 366 *++e_token = BACKSLASH; 367 if (*buf_ptr == BACKSLASH) 368 *++e_token = BACKSLASH; 369 } 370 *++e_token = *buf_ptr++; 371 ++e_token; /* we must increment this again because we 372 * copied two chars */ 373 if (buf_ptr >= buf_end) 374 fill_buffer(); 375 } 376 else 377 break; /* we copied one character */ 378 } /* end of while (1) */ 379 } while (*e_token++ != qchar); 380 if (troff) { 381 e_token = chfont(&stringf, &bodyf, e_token - 1); 382 if (qchar == '"') 383 *e_token++ = '\''; 384 } 385 stop_lit: 386 code = ident; 387 break; 388 389 case ('('): 390 case ('['): 391 unary_delim = true; 392 code = lparen; 393 break; 394 395 case (')'): 396 case (']'): 397 code = rparen; 398 break; 399 400 case '#': 401 unary_delim = ps.last_u_d; 402 code = preesc; 403 break; 404 405 case '?': 406 unary_delim = true; 407 code = question; 408 break; 409 410 case (':'): 411 code = colon; 412 unary_delim = true; 413 break; 414 415 case (';'): 416 unary_delim = true; 417 code = semicolon; 418 break; 419 420 case ('{'): 421 unary_delim = true; 422 423 /* 424 * if (ps.in_or_st) ps.block_init = 1; 425 */ 426 /* ? code = ps.block_init ? lparen : lbrace; */ 427 code = lbrace; 428 break; 429 430 case ('}'): 431 unary_delim = true; 432 /* ? code = ps.block_init ? rparen : rbrace; */ 433 code = rbrace; 434 break; 435 436 case 014: /* a form feed */ 437 unary_delim = ps.last_u_d; 438 ps.last_nl = true; /* remember this so we can set 'ps.col_1' 439 * right */ 440 code = form_feed; 441 break; 442 443 case (','): 444 unary_delim = true; 445 code = comma; 446 break; 447 448 case '.': 449 unary_delim = false; 450 code = period; 451 break; 452 453 case '-': 454 case '+': /* check for -, +, --, ++ */ 455 code = (ps.last_u_d ? unary_op : binary_op); 456 unary_delim = true; 457 458 if (*buf_ptr == token[0]) { 459 /* check for doubled character */ 460 *e_token++ = *buf_ptr++; 461 /* buffer overflow will be checked at end of loop */ 462 if (last_code == ident || last_code == rparen) { 463 code = (ps.last_u_d ? unary_op : postop); 464 /* check for following ++ or -- */ 465 unary_delim = false; 466 } 467 } 468 else if (*buf_ptr == '=') 469 /* check for operator += */ 470 *e_token++ = *buf_ptr++; 471 else if (*buf_ptr == '>') { 472 /* check for operator -> */ 473 *e_token++ = *buf_ptr++; 474 if (!pointer_as_binop) { 475 unary_delim = false; 476 code = unary_op; 477 ps.want_blank = false; 478 } 479 } 480 break; /* buffer overflow will be checked at end of 481 * switch */ 482 483 case '=': 484 if (ps.in_or_st) 485 ps.block_init = 1; 486 #ifdef undef 487 if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */ 488 e_token[-1] = *buf_ptr++; 489 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr) 490 *e_token++ = *buf_ptr++; 491 *e_token++ = '='; /* Flip =+ to += */ 492 *e_token = 0; 493 } 494 #else 495 if (*buf_ptr == '=') {/* == */ 496 *e_token++ = '='; /* Flip =+ to += */ 497 buf_ptr++; 498 *e_token = 0; 499 } 500 #endif 501 code = binary_op; 502 unary_delim = true; 503 break; 504 /* can drop thru!!! */ 505 506 case '>': 507 case '<': 508 case '!': /* ops like <, <<, <=, !=, etc */ 509 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 510 *e_token++ = *buf_ptr; 511 if (++buf_ptr >= buf_end) 512 fill_buffer(); 513 } 514 if (*buf_ptr == '=') 515 *e_token++ = *buf_ptr++; 516 code = (ps.last_u_d ? unary_op : binary_op); 517 unary_delim = true; 518 break; 519 520 default: 521 if (token[0] == '/' && *buf_ptr == '*') { 522 /* it is start of comment */ 523 *e_token++ = '*'; 524 525 if (++buf_ptr >= buf_end) 526 fill_buffer(); 527 528 code = comment; 529 unary_delim = ps.last_u_d; 530 break; 531 } 532 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 533 /* 534 * handle ||, &&, etc, and also things as in int *****i 535 */ 536 *e_token++ = *buf_ptr; 537 if (++buf_ptr >= buf_end) 538 fill_buffer(); 539 } 540 code = (ps.last_u_d ? unary_op : binary_op); 541 unary_delim = true; 542 543 544 } /* end of switch */ 545 if (code != newline) { 546 l_struct = false; 547 last_code = code; 548 } 549 if (buf_ptr >= buf_end) /* check for input buffer empty */ 550 fill_buffer(); 551 ps.last_u_d = unary_delim; 552 *e_token = '\0'; /* null terminate the token */ 553 return (code); 554 } 555 556 /* 557 * Add the given keyword to the keyword table, using val as the keyword type 558 */ 559 void 560 addkey(char *key, int val) 561 { 562 struct templ *p; 563 int i; 564 565 for (i = 0; i < nspecials; i++) { 566 p = &specials[i]; 567 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0) 568 return; 569 } 570 571 if (specials == specialsinit) { 572 /* 573 * Whoa. Must reallocate special table. 574 */ 575 nspecials = sizeof (specialsinit) / sizeof (specialsinit[0]); 576 maxspecials = nspecials + (nspecials >> 2); 577 specials = (struct templ *)calloc(maxspecials, sizeof specials[0]); 578 if (specials == NULL) 579 err(1, NULL); 580 memcpy(specials, specialsinit, sizeof specialsinit); 581 } else if (nspecials >= maxspecials) { 582 int newspecials = maxspecials + (maxspecials >> 2); 583 struct templ *specials2; 584 585 specials2 = realloc(specials, newspecials * sizeof specials[0]); 586 if (specials2 == NULL) 587 err(1, NULL); 588 specials = specials2; 589 maxspecials = newspecials; 590 } 591 592 p = &specials[nspecials]; 593 p->rwd = key; 594 p->rwcode = val; 595 nspecials++; 596 return; 597 } 598