1 /* $OpenBSD: lex.c,v 1.5 2001/09/08 00:12:40 millert Exp $ */ 2 /**************************************************************** 3 Copyright (C) Lucent Technologies 1997 4 All Rights Reserved 5 6 Permission to use, copy, modify, and distribute this software and 7 its documentation for any purpose and without fee is hereby 8 granted, provided that the above copyright notice appear in all 9 copies and that both that the copyright notice and this 10 permission notice and warranty disclaimer appear in supporting 11 documentation, and that the name Lucent Technologies or any of 12 its entities not be used in advertising or publicity pertaining 13 to distribution of the software without specific, written prior 14 permission. 15 16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 23 THIS SOFTWARE. 24 ****************************************************************/ 25 26 #include <stdio.h> 27 #include <stdlib.h> 28 #include <string.h> 29 #include <ctype.h> 30 #include "awk.h" 31 #include "ytab.h" 32 33 extern YYSTYPE yylval; 34 extern int infunc; 35 36 int lineno = 1; 37 int bracecnt = 0; 38 int brackcnt = 0; 39 int parencnt = 0; 40 41 typedef struct Keyword { 42 char *word; 43 int sub; 44 int type; 45 } Keyword; 46 47 Keyword keywords[] ={ /* keep sorted: binary searched */ 48 { "BEGIN", XBEGIN, XBEGIN }, 49 { "END", XEND, XEND }, 50 { "NF", VARNF, VARNF }, 51 { "atan2", FATAN, BLTIN }, 52 { "break", BREAK, BREAK }, 53 { "close", CLOSE, CLOSE }, 54 { "continue", CONTINUE, CONTINUE }, 55 { "cos", FCOS, BLTIN }, 56 { "delete", DELETE, DELETE }, 57 { "do", DO, DO }, 58 { "else", ELSE, ELSE }, 59 { "exit", EXIT, EXIT }, 60 { "exp", FEXP, BLTIN }, 61 { "fflush", FFLUSH, BLTIN }, 62 { "for", FOR, FOR }, 63 { "func", FUNC, FUNC }, 64 { "function", FUNC, FUNC }, 65 { "getline", GETLINE, GETLINE }, 66 { "gsub", GSUB, GSUB }, 67 { "if", IF, IF }, 68 { "in", IN, IN }, 69 { "index", INDEX, INDEX }, 70 { "int", FINT, BLTIN }, 71 { "length", FLENGTH, BLTIN }, 72 { "log", FLOG, BLTIN }, 73 { "match", MATCHFCN, MATCHFCN }, 74 { "next", NEXT, NEXT }, 75 { "nextfile", NEXTFILE, NEXTFILE }, 76 { "print", PRINT, PRINT }, 77 { "printf", PRINTF, PRINTF }, 78 { "rand", FRAND, BLTIN }, 79 { "return", RETURN, RETURN }, 80 { "sin", FSIN, BLTIN }, 81 { "split", SPLIT, SPLIT }, 82 { "sprintf", SPRINTF, SPRINTF }, 83 { "sqrt", FSQRT, BLTIN }, 84 { "srand", FSRAND, BLTIN }, 85 { "sub", SUB, SUB }, 86 { "substr", SUBSTR, SUBSTR }, 87 { "system", FSYSTEM, BLTIN }, 88 { "tolower", FTOLOWER, BLTIN }, 89 { "toupper", FTOUPPER, BLTIN }, 90 { "while", WHILE, WHILE }, 91 }; 92 93 #define DEBUG 94 #ifdef DEBUG 95 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 96 #else 97 #define RET(x) return(x) 98 #endif 99 100 int peek(void) 101 { 102 int c = input(); 103 unput(c); 104 return c; 105 } 106 107 int gettok(char **pbuf, int *psz) /* get next input token */ 108 { 109 int c; 110 char *buf = *pbuf; 111 int sz = *psz; 112 char *bp = buf; 113 114 c = input(); 115 if (c == 0) 116 return 0; 117 buf[0] = c; 118 buf[1] = 0; 119 if (!isalnum(c) && c != '.' && c != '_') 120 return c; 121 122 *bp++ = c; 123 if (isalpha(c) || c == '_') { /* it's a varname */ 124 for ( ; (c = input()) != 0; ) { 125 if (bp-buf >= sz) 126 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0)) 127 FATAL( "out of space for name %.10s...", buf ); 128 if (isalnum(c) || c == '_') 129 *bp++ = c; 130 else { 131 *bp = 0; 132 unput(c); 133 break; 134 } 135 } 136 *bp = 0; 137 } else { /* it's a number */ 138 char *rem; 139 /* read input until can't be a number */ 140 for ( ; (c = input()) != 0; ) { 141 if (bp-buf >= sz) 142 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0)) 143 FATAL( "out of space for number %.10s...", buf ); 144 if (isdigit(c) || c == 'e' || c == 'E' 145 || c == '.' || c == '+' || c == '-') 146 *bp++ = c; 147 else { 148 unput(c); 149 break; 150 } 151 } 152 *bp = 0; 153 strtod(buf, &rem); /* parse the number */ 154 unputstr(rem); /* put rest back for later */ 155 rem[0] = 0; 156 } 157 *pbuf = buf; 158 *psz = sz; 159 return buf[0]; 160 } 161 162 int word(char *); 163 int string(void); 164 int regexpr(void); 165 int sc = 0; /* 1 => return a } right now */ 166 int reg = 0; /* 1 => return a REGEXPR now */ 167 168 int yylex(void) 169 { 170 int c; 171 static char *buf = 0; 172 static int bufsize = 500; 173 174 if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL) 175 FATAL( "out of space in yylex" ); 176 if (sc) { 177 sc = 0; 178 RET('}'); 179 } 180 if (reg) { 181 reg = 0; 182 return regexpr(); 183 } 184 for (;;) { 185 c = gettok(&buf, &bufsize); 186 if (c == 0) 187 return 0; 188 if (isalpha(c) || c == '_') 189 return word(buf); 190 if (isdigit(c) || c == '.') { 191 yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab); 192 /* should this also have STR set? */ 193 RET(NUMBER); 194 } 195 196 yylval.i = c; 197 switch (c) { 198 case '\n': /* {EOL} */ 199 RET(NL); 200 case '\r': /* assume \n is coming */ 201 case ' ': /* {WS}+ */ 202 case '\t': 203 break; 204 case '#': /* #.* strip comments */ 205 while ((c = input()) != '\n' && c != 0) 206 ; 207 unput(c); 208 break; 209 case ';': 210 RET(';'); 211 case '\\': 212 if (peek() == '\n') { 213 input(); 214 } else if (peek() == '\r') { 215 input(); input(); /* \n */ 216 lineno++; 217 } else { 218 RET(c); 219 } 220 break; 221 case '&': 222 if (peek() == '&') { 223 input(); RET(AND); 224 } else 225 RET('&'); 226 case '|': 227 if (peek() == '|') { 228 input(); RET(BOR); 229 } else 230 RET('|'); 231 case '!': 232 if (peek() == '=') { 233 input(); yylval.i = NE; RET(NE); 234 } else if (peek() == '~') { 235 input(); yylval.i = NOTMATCH; RET(MATCHOP); 236 } else 237 RET(NOT); 238 case '~': 239 yylval.i = MATCH; 240 RET(MATCHOP); 241 case '<': 242 if (peek() == '=') { 243 input(); yylval.i = LE; RET(LE); 244 } else { 245 yylval.i = LT; RET(LT); 246 } 247 case '=': 248 if (peek() == '=') { 249 input(); yylval.i = EQ; RET(EQ); 250 } else { 251 yylval.i = ASSIGN; RET(ASGNOP); 252 } 253 case '>': 254 if (peek() == '=') { 255 input(); yylval.i = GE; RET(GE); 256 } else if (peek() == '>') { 257 input(); yylval.i = APPEND; RET(APPEND); 258 } else { 259 yylval.i = GT; RET(GT); 260 } 261 case '+': 262 if (peek() == '+') { 263 input(); yylval.i = INCR; RET(INCR); 264 } else if (peek() == '=') { 265 input(); yylval.i = ADDEQ; RET(ASGNOP); 266 } else 267 RET('+'); 268 case '-': 269 if (peek() == '-') { 270 input(); yylval.i = DECR; RET(DECR); 271 } else if (peek() == '=') { 272 input(); yylval.i = SUBEQ; RET(ASGNOP); 273 } else 274 RET('-'); 275 case '*': 276 if (peek() == '=') { /* *= */ 277 input(); yylval.i = MULTEQ; RET(ASGNOP); 278 } else if (peek() == '*') { /* ** or **= */ 279 input(); /* eat 2nd * */ 280 if (peek() == '=') { 281 input(); yylval.i = POWEQ; RET(ASGNOP); 282 } else { 283 RET(POWER); 284 } 285 } else 286 RET('*'); 287 case '/': 288 RET('/'); 289 case '%': 290 if (peek() == '=') { 291 input(); yylval.i = MODEQ; RET(ASGNOP); 292 } else 293 RET('%'); 294 case '^': 295 if (peek() == '=') { 296 input(); yylval.i = POWEQ; RET(ASGNOP); 297 } else 298 RET(POWER); 299 300 case '$': 301 /* BUG: awkward, if not wrong */ 302 c = gettok(&buf, &bufsize); 303 if (isalpha(c)) { 304 if (strcmp(buf, "NF") == 0) { /* very special */ 305 unputstr("(NF)"); 306 RET(INDIRECT); 307 } 308 c = peek(); 309 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 310 unputstr(buf); 311 RET(INDIRECT); 312 } 313 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 314 RET(IVAR); 315 } else { 316 unputstr(buf); 317 RET(INDIRECT); 318 } 319 320 case '}': 321 if (--bracecnt < 0) 322 SYNTAX( "extra }" ); 323 sc = 1; 324 RET(';'); 325 case ']': 326 if (--brackcnt < 0) 327 SYNTAX( "extra ]" ); 328 RET(']'); 329 case ')': 330 if (--parencnt < 0) 331 SYNTAX( "extra )" ); 332 RET(')'); 333 case '{': 334 bracecnt++; 335 RET('{'); 336 case '[': 337 brackcnt++; 338 RET('['); 339 case '(': 340 parencnt++; 341 RET('('); 342 343 case '"': 344 return string(); /* BUG: should be like tran.c ? */ 345 346 default: 347 RET(c); 348 } 349 } 350 } 351 352 int string(void) 353 { 354 int c, n; 355 char *s, *bp; 356 static char *buf = 0; 357 static int bufsz = 500; 358 359 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL) 360 FATAL("out of space for strings"); 361 for (bp = buf; (c = input()) != '"'; ) { 362 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0)) 363 FATAL("out of space for string %.10s...", buf); 364 switch (c) { 365 case '\n': 366 case '\r': 367 case 0: 368 SYNTAX( "non-terminated string %.10s...", buf ); 369 lineno++; 370 break; 371 case '\\': 372 c = input(); 373 switch (c) { 374 case '"': *bp++ = '"'; break; 375 case 'n': *bp++ = '\n'; break; 376 case 't': *bp++ = '\t'; break; 377 case 'f': *bp++ = '\f'; break; 378 case 'r': *bp++ = '\r'; break; 379 case 'b': *bp++ = '\b'; break; 380 case 'v': *bp++ = '\v'; break; 381 case 'a': *bp++ = '\007'; break; 382 case '\\': *bp++ = '\\'; break; 383 384 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 385 case '3': case '4': case '5': case '6': case '7': 386 n = c - '0'; 387 if ((c = peek()) >= '0' && c < '8') { 388 n = 8 * n + input() - '0'; 389 if ((c = peek()) >= '0' && c < '8') 390 n = 8 * n + input() - '0'; 391 } 392 *bp++ = n; 393 break; 394 395 case 'x': /* hex \x0-9a-fA-F + */ 396 { char xbuf[100], *px; 397 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) { 398 if (isdigit(c) 399 || (c >= 'a' && c <= 'f') 400 || (c >= 'A' && c <= 'F')) 401 *px++ = c; 402 else 403 break; 404 } 405 *px = 0; 406 unput(c); 407 sscanf(xbuf, "%x", &n); 408 *bp++ = n; 409 break; 410 } 411 412 default: 413 *bp++ = c; 414 break; 415 } 416 break; 417 default: 418 *bp++ = c; 419 break; 420 } 421 } 422 *bp = 0; 423 s = tostring(buf); 424 *bp++ = ' '; *bp++ = 0; 425 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 426 RET(STRING); 427 } 428 429 430 int binsearch(char *w, Keyword *kp, int n) 431 { 432 int cond, low, mid, high; 433 434 low = 0; 435 high = n - 1; 436 while (low <= high) { 437 mid = (low + high) / 2; 438 if ((cond = strcmp(w, kp[mid].word)) < 0) 439 high = mid - 1; 440 else if (cond > 0) 441 low = mid + 1; 442 else 443 return mid; 444 } 445 return -1; 446 } 447 448 int word(char *w) 449 { 450 Keyword *kp; 451 int c, n; 452 453 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 454 kp = keywords + n; 455 if (n != -1) { /* found in table */ 456 yylval.i = kp->sub; 457 switch (kp->type) { /* special handling */ 458 case FSYSTEM: 459 if (safe) 460 SYNTAX( "system is unsafe" ); 461 RET(kp->type); 462 case FUNC: 463 if (infunc) 464 SYNTAX( "illegal nested function" ); 465 RET(kp->type); 466 case RETURN: 467 if (!infunc) 468 SYNTAX( "return not in function" ); 469 RET(kp->type); 470 case VARNF: 471 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 472 RET(VARNF); 473 default: 474 RET(kp->type); 475 } 476 } 477 c = peek(); /* look for '(' */ 478 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 479 yylval.i = n; 480 RET(ARG); 481 } else { 482 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 483 if (c == '(') { 484 RET(CALL); 485 } else { 486 RET(VAR); 487 } 488 } 489 } 490 491 void startreg(void) /* next call to yyles will return a regular expression */ 492 { 493 reg = 1; 494 } 495 496 int regexpr(void) 497 { 498 int c; 499 static char *buf = 0; 500 static int bufsz = 500; 501 char *bp; 502 503 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL) 504 FATAL("out of space for rex expr"); 505 bp = buf; 506 for ( ; (c = input()) != '/' && c != 0; ) { 507 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0)) 508 FATAL("out of space for reg expr %.10s...", buf); 509 if (c == '\n') { 510 SYNTAX( "newline in regular expression %.10s...", buf ); 511 unput('\n'); 512 break; 513 } else if (c == '\\') { 514 *bp++ = '\\'; 515 *bp++ = input(); 516 } else { 517 *bp++ = c; 518 } 519 } 520 *bp = 0; 521 yylval.s = tostring(buf); 522 unput('/'); 523 RET(REGEXPR); 524 } 525 526 /* low-level lexical stuff, sort of inherited from lex */ 527 528 char ebuf[300]; 529 char *ep = ebuf; 530 char yysbuf[100]; /* pushback buffer */ 531 char *yysptr = yysbuf; 532 FILE *yyin = 0; 533 534 int input(void) /* get next lexical input character */ 535 { 536 int c; 537 extern char *lexprog; 538 539 if (yysptr > yysbuf) 540 c = *--yysptr; 541 else if (lexprog != NULL) { /* awk '...' */ 542 if ((c = *lexprog) != 0) 543 lexprog++; 544 } else /* awk -f ... */ 545 c = pgetc(); 546 if (c == '\n') 547 lineno++; 548 else if (c == EOF) 549 c = 0; 550 if (ep >= ebuf + sizeof ebuf) 551 ep = ebuf; 552 return *ep++ = c; 553 } 554 555 void unput(int c) /* put lexical character back on input */ 556 { 557 if (c == '\n') 558 lineno--; 559 if (yysptr >= yysbuf + sizeof(yysbuf)) 560 FATAL("pushed back too much: %.20s...", yysbuf); 561 *yysptr++ = c; 562 if (--ep < ebuf) 563 ep = ebuf + sizeof(ebuf) - 1; 564 } 565 566 void unputstr(char *s) /* put a string back on input */ 567 { 568 int i; 569 570 for (i = strlen(s)-1; i >= 0; i--) 571 unput(s[i]); 572 } 573