1 /* $NetBSD: lexi.c,v 1.11 2002/05/26 22:53:38 wiz Exp $ */ 2 3 /* 4 * Copyright (c) 1980, 1993 5 * The Regents of the University of California. All rights reserved. 6 * Copyright (c) 1976 Board of Trustees of the University of Illinois. 7 * Copyright (c) 1985 Sun Microsystems, Inc. 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 */ 38 39 #include <sys/cdefs.h> 40 #ifndef lint 41 #if 0 42 static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93"; 43 #else 44 __RCSID("$NetBSD: lexi.c,v 1.11 2002/05/26 22:53:38 wiz Exp $"); 45 #endif 46 #endif /* not lint */ 47 48 /* 49 * Here we have the token scanner for indent. It scans off one token and puts 50 * it in the global variable "token". It returns a code, indicating the type 51 * of token scanned. 52 */ 53 54 #include <stdio.h> 55 #include <ctype.h> 56 #include <stdlib.h> 57 #include <string.h> 58 #include "indent_globs.h" 59 #include "indent_codes.h" 60 61 #define alphanum 1 62 #define opchar 3 63 64 struct templ { 65 char *rwd; 66 int rwcode; 67 }; 68 69 struct templ specials[1000] = 70 { 71 {"switch", 1}, 72 {"case", 2}, 73 {"break", 0}, 74 {"struct", 3}, 75 {"union", 3}, 76 {"enum", 3}, 77 {"default", 2}, 78 {"int", 4}, 79 {"char", 4}, 80 {"float", 4}, 81 {"double", 4}, 82 {"long", 4}, 83 {"short", 4}, 84 {"typdef", 4}, 85 {"unsigned", 4}, 86 {"register", 4}, 87 {"static", 4}, 88 {"global", 4}, 89 {"extern", 4}, 90 {"void", 4}, 91 {"goto", 0}, 92 {"return", 0}, 93 {"if", 5}, 94 {"while", 5}, 95 {"for", 5}, 96 {"else", 6}, 97 {"do", 6}, 98 {"sizeof", 7}, 99 {0, 0} 100 }; 101 102 char chartype[128] = 103 { /* this is used to facilitate the decision of 104 * what type (alphanumeric, operator) each 105 * character is */ 106 0, 0, 0, 0, 0, 0, 0, 0, 107 0, 0, 0, 0, 0, 0, 0, 0, 108 0, 0, 0, 0, 0, 0, 0, 0, 109 0, 0, 0, 0, 0, 0, 0, 0, 110 0, 3, 0, 0, 1, 3, 3, 0, 111 0, 0, 3, 3, 0, 3, 0, 3, 112 1, 1, 1, 1, 1, 1, 1, 1, 113 1, 1, 0, 0, 3, 3, 3, 3, 114 0, 1, 1, 1, 1, 1, 1, 1, 115 1, 1, 1, 1, 1, 1, 1, 1, 116 1, 1, 1, 1, 1, 1, 1, 1, 117 1, 1, 1, 0, 0, 0, 3, 1, 118 0, 1, 1, 1, 1, 1, 1, 1, 119 1, 1, 1, 1, 1, 1, 1, 1, 120 1, 1, 1, 1, 1, 1, 1, 1, 121 1, 1, 1, 0, 3, 0, 3, 0 122 }; 123 124 125 126 127 int 128 lexi(void) 129 { 130 int unary_delim; /* this is set to 1 if the current token 131 * 132 * forces a following operator to be unary */ 133 static int last_code; /* the last token type returned */ 134 static int l_struct; /* set to 1 if the last token was 'struct' */ 135 int code; /* internal code to be returned */ 136 char qchar; /* the delimiter character for a string */ 137 138 e_token = s_token; /* point to start of place to save token */ 139 unary_delim = false; 140 ps.col_1 = ps.last_nl; /* tell world that this token started in 141 * column 1 iff the last thing scanned was nl */ 142 ps.last_nl = false; 143 144 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 145 ps.col_1 = false; /* leading blanks imply token is not 146 * in column 1 */ 147 if (++buf_ptr >= buf_end) 148 fill_buffer(); 149 } 150 151 /* Scan an alphanumeric token */ 152 if (chartype[(int) *buf_ptr] == alphanum || 153 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) { 154 /* 155 * we have a character or number 156 */ 157 char *j; /* used for searching thru list of 158 * 159 * reserved words */ 160 struct templ *p; 161 162 if (isdigit((unsigned char)*buf_ptr) || 163 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) { 164 int seendot = 0, seenexp = 0, seensfx = 0; 165 if (*buf_ptr == '0' && 166 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) { 167 *e_token++ = *buf_ptr++; 168 *e_token++ = *buf_ptr++; 169 while (isxdigit((unsigned char)*buf_ptr)) { 170 CHECK_SIZE_TOKEN; 171 *e_token++ = *buf_ptr++; 172 } 173 } else { 174 while (1) { 175 if (*buf_ptr == '.') { 176 if (seendot) 177 break; 178 else 179 seendot++; 180 } 181 CHECK_SIZE_TOKEN; 182 *e_token++ = *buf_ptr++; 183 if (!isdigit((unsigned char)*buf_ptr) 184 && *buf_ptr != '.') { 185 if ((*buf_ptr != 'E' 186 && *buf_ptr != 'e') || seenexp) 187 break; 188 else { 189 seenexp++; 190 seendot++; 191 CHECK_SIZE_TOKEN; 192 *e_token++ = *buf_ptr++; 193 if (*buf_ptr == '+' || *buf_ptr == '-') 194 *e_token++ = *buf_ptr++; 195 } 196 } 197 } 198 } 199 if (*buf_ptr == 'F' || *buf_ptr == 'f') { 200 /* float constant */ 201 *e_token++ = *buf_ptr++; 202 } else { 203 /* integer constant */ 204 while (1) { 205 if (!(seensfx & 1) && 206 (*buf_ptr == 'U' || 207 *buf_ptr == 'u')) { 208 CHECK_SIZE_TOKEN; 209 *e_token++ = *buf_ptr++; 210 seensfx |= 1; 211 continue; 212 } 213 if (!(seensfx & 2) && 214 (*buf_ptr == 'L' || 215 *buf_ptr == 'l')) { 216 CHECK_SIZE_TOKEN; 217 if (buf_ptr[1] == buf_ptr[0]) 218 *e_token++ = *buf_ptr++; 219 *e_token++ = *buf_ptr++; 220 seensfx |= 2; 221 continue; 222 } 223 break; 224 } 225 } 226 } else 227 while (chartype[(int) *buf_ptr] == alphanum) { /* copy it over */ 228 CHECK_SIZE_TOKEN; 229 *e_token++ = *buf_ptr++; 230 if (buf_ptr >= buf_end) 231 fill_buffer(); 232 } 233 *e_token++ = '\0'; 234 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 235 if (++buf_ptr >= buf_end) 236 fill_buffer(); 237 } 238 ps.its_a_keyword = false; 239 ps.sizeof_keyword = false; 240 if (l_struct) { /* if last token was 'struct', then this token 241 * should be treated as a declaration */ 242 l_struct = false; 243 last_code = ident; 244 ps.last_u_d = true; 245 return (decl); 246 } 247 ps.last_u_d = false; /* Operator after indentifier is 248 * binary */ 249 last_code = ident; /* Remember that this is the code we 250 * will return */ 251 252 /* 253 * This loop will check if the token is a keyword. 254 */ 255 for (p = specials; (j = p->rwd) != 0; p++) { 256 char *p = s_token; /* point at scanned token */ 257 if (*j++ != *p++ || *j++ != *p++) 258 continue; /* This test depends on the 259 * fact that identifiers are 260 * always at least 1 character 261 * long (ie. the first two 262 * bytes of the identifier are 263 * always meaningful) */ 264 if (p[-1] == 0) 265 break; /* If its a one-character identifier */ 266 while (*p++ == *j) 267 if (*j++ == 0) 268 goto found_keyword; /* I wish that C had a 269 * multi-level break... */ 270 } 271 if (p->rwd) { /* we have a keyword */ 272 found_keyword: 273 ps.its_a_keyword = true; 274 ps.last_u_d = true; 275 switch (p->rwcode) { 276 case 1:/* it is a switch */ 277 return (swstmt); 278 case 2:/* a case or default */ 279 return (casestmt); 280 281 case 3:/* a "struct" */ 282 if (ps.p_l_follow) 283 break; /* inside parens: cast */ 284 l_struct = true; 285 286 /* 287 * Next time around, we will want to know that we have had a 288 * 'struct' 289 */ 290 case 4:/* one of the declaration keywords */ 291 if (ps.p_l_follow) { 292 ps.cast_mask |= 1 << ps.p_l_follow; 293 break; /* inside parens: cast */ 294 } 295 last_code = decl; 296 return (decl); 297 298 case 5:/* if, while, for */ 299 return (sp_paren); 300 301 case 6:/* do, else */ 302 return (sp_nparen); 303 304 case 7: 305 ps.sizeof_keyword = true; 306 default: /* all others are treated like any 307 * other identifier */ 308 return (ident); 309 } /* end of switch */ 310 } /* end of if (found_it) */ 311 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) { 312 char *tp = buf_ptr; 313 while (tp < buf_end) 314 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 315 goto not_proc; 316 strncpy(ps.procname, token, sizeof ps.procname - 1); 317 ps.in_parameter_declaration = 1; 318 rparen_count = 1; 319 not_proc: ; 320 } 321 /* 322 * The following hack attempts to guess whether or not the current 323 * token is in fact a declaration keyword -- one that has been 324 * typedefd 325 */ 326 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || 327 isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_') 328 && !ps.p_l_follow 329 && !ps.block_init 330 && (ps.last_token == rparen || ps.last_token == semicolon || 331 ps.last_token == decl || 332 ps.last_token == lbrace || ps.last_token == rbrace)) { 333 ps.its_a_keyword = true; 334 ps.last_u_d = true; 335 last_code = decl; 336 return decl; 337 } 338 if (last_code == decl) /* if this is a declared variable, 339 * then following sign is unary */ 340 ps.last_u_d = true; /* will make "int a -1" work */ 341 last_code = ident; 342 return (ident); /* the ident is not in the list */ 343 } /* end of procesing for alpanum character */ 344 /* Scan a non-alphanumeric token */ 345 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 346 * moved here */ 347 *e_token = '\0'; 348 if (++buf_ptr >= buf_end) 349 fill_buffer(); 350 351 switch (*token) { 352 case '\n': 353 unary_delim = ps.last_u_d; 354 ps.last_nl = true; /* remember that we just had a newline */ 355 code = (had_eof ? 0 : newline); 356 357 /* 358 * if data has been exausted, the newline is a dummy, and we should 359 * return code to stop 360 */ 361 break; 362 363 case '\'': /* start of quoted character */ 364 case '"': /* start of string */ 365 qchar = *token; 366 if (troff) { 367 e_token[-1] = '`'; 368 if (qchar == '"') 369 *e_token++ = '`'; 370 e_token = chfont(&bodyf, &stringf, e_token); 371 } 372 do { /* copy the string */ 373 while (1) { /* move one character or 374 * [/<char>]<char> */ 375 if (*buf_ptr == '\n') { 376 printf("%d: Unterminated literal\n", line_no); 377 goto stop_lit; 378 } 379 CHECK_SIZE_TOKEN; /* Only have to do this 380 * once in this loop, 381 * since CHECK_SIZE 382 * guarantees that there 383 * are at least 5 384 * entries left */ 385 *e_token = *buf_ptr++; 386 if (buf_ptr >= buf_end) 387 fill_buffer(); 388 if (*e_token == BACKSLASH) { /* if escape, copy extra 389 * char */ 390 if (*buf_ptr == '\n') /* check for escaped 391 * newline */ 392 ++line_no; 393 if (troff) { 394 *++e_token = BACKSLASH; 395 if (*buf_ptr == BACKSLASH) 396 *++e_token = BACKSLASH; 397 } 398 *++e_token = *buf_ptr++; 399 ++e_token; /* we must increment 400 * this again because we 401 * copied two chars */ 402 if (buf_ptr >= buf_end) 403 fill_buffer(); 404 } else 405 break; /* we copied one character */ 406 } /* end of while (1) */ 407 } while (*e_token++ != qchar); 408 if (troff) { 409 e_token = chfont(&stringf, &bodyf, e_token - 1); 410 if (qchar == '"') 411 *e_token++ = '\''; 412 } 413 stop_lit: 414 code = ident; 415 break; 416 417 case ('('): 418 case ('['): 419 unary_delim = true; 420 code = lparen; 421 break; 422 423 case (')'): 424 case (']'): 425 code = rparen; 426 break; 427 428 case '#': 429 unary_delim = ps.last_u_d; 430 code = preesc; 431 break; 432 433 case '?': 434 unary_delim = true; 435 code = question; 436 break; 437 438 case (':'): 439 code = colon; 440 unary_delim = true; 441 break; 442 443 case (';'): 444 unary_delim = true; 445 code = semicolon; 446 break; 447 448 case ('{'): 449 unary_delim = true; 450 451 /* 452 * if (ps.in_or_st) ps.block_init = 1; 453 */ 454 /* ? code = ps.block_init ? lparen : lbrace; */ 455 code = lbrace; 456 break; 457 458 case ('}'): 459 unary_delim = true; 460 /* ? code = ps.block_init ? rparen : rbrace; */ 461 code = rbrace; 462 break; 463 464 case 014: /* a form feed */ 465 unary_delim = ps.last_u_d; 466 ps.last_nl = true; /* remember this so we can set 467 * 'ps.col_1' right */ 468 code = form_feed; 469 break; 470 471 case (','): 472 unary_delim = true; 473 code = comma; 474 break; 475 476 case '.': 477 unary_delim = false; 478 code = period; 479 break; 480 481 case '-': 482 case '+': /* check for -, +, --, ++ */ 483 code = (ps.last_u_d ? unary_op : binary_op); 484 unary_delim = true; 485 486 if (*buf_ptr == token[0]) { 487 /* check for doubled character */ 488 *e_token++ = *buf_ptr++; 489 /* buffer overflow will be checked at end of loop */ 490 if (last_code == ident || last_code == rparen) { 491 code = (ps.last_u_d ? unary_op : postop); 492 /* check for following ++ or -- */ 493 unary_delim = false; 494 } 495 } else 496 if (*buf_ptr == '=') 497 /* check for operator += */ 498 *e_token++ = *buf_ptr++; 499 else 500 if (*buf_ptr == '>') { 501 /* check for operator -> */ 502 *e_token++ = *buf_ptr++; 503 if (!pointer_as_binop) { 504 unary_delim = false; 505 code = unary_op; 506 ps.want_blank = false; 507 } 508 } 509 break; /* buffer overflow will be checked at end of 510 * switch */ 511 512 case '=': 513 if (ps.in_or_st) 514 ps.block_init = 1; 515 #ifdef undef 516 if (chartype[*buf_ptr] == opchar) { /* we have two char 517 * assignment */ 518 e_token[-1] = *buf_ptr++; 519 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr) 520 *e_token++ = *buf_ptr++; 521 *e_token++ = '='; /* Flip =+ to += */ 522 *e_token = 0; 523 } 524 #else 525 if (*buf_ptr == '=') { /* == */ 526 *e_token++ = '='; /* Flip =+ to += */ 527 buf_ptr++; 528 *e_token = 0; 529 } 530 #endif 531 code = binary_op; 532 unary_delim = true; 533 break; 534 /* can drop thru!!! */ 535 536 case '>': 537 case '<': 538 case '!': /* ops like <, <<, <=, !=, etc */ 539 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 540 *e_token++ = *buf_ptr; 541 if (++buf_ptr >= buf_end) 542 fill_buffer(); 543 } 544 if (*buf_ptr == '=') 545 *e_token++ = *buf_ptr++; 546 code = (ps.last_u_d ? unary_op : binary_op); 547 unary_delim = true; 548 break; 549 550 default: 551 if (token[0] == '/' && *buf_ptr == '*') { 552 /* it is start of comment */ 553 *e_token++ = '*'; 554 555 if (++buf_ptr >= buf_end) 556 fill_buffer(); 557 558 code = comment; 559 unary_delim = ps.last_u_d; 560 break; 561 } 562 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 563 /* 564 * handle ||, &&, etc, and also things as in int *****i 565 */ 566 *e_token++ = *buf_ptr; 567 if (++buf_ptr >= buf_end) 568 fill_buffer(); 569 } 570 code = (ps.last_u_d ? unary_op : binary_op); 571 unary_delim = true; 572 573 574 } /* end of switch */ 575 if (code != newline) { 576 l_struct = false; 577 last_code = code; 578 } 579 if (buf_ptr >= buf_end) /* check for input buffer empty */ 580 fill_buffer(); 581 ps.last_u_d = unary_delim; 582 *e_token = '\0'; /* null terminate the token */ 583 return (code); 584 } 585 /* 586 * Add the given keyword to the keyword table, using val as the keyword type 587 */ 588 void 589 addkey(char *key, int val) 590 { 591 struct templ *p = specials; 592 while (p->rwd) 593 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0) 594 return; 595 else 596 p++; 597 if (p >= specials + sizeof specials / sizeof specials[0]) 598 return; /* For now, table overflows are silently 599 * ignored */ 600 p->rwd = key; 601 p->rwcode = val; 602 p[1].rwd = 0; 603 p[1].rwcode = 0; 604 } 605