1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <string.h> 28 #include <ctype.h> 29 #include "awk.h" 30 #include "ytab.h" 31 32 extern YYSTYPE yylval; 33 extern bool infunc; 34 35 int lineno = 1; 36 int bracecnt = 0; 37 int brackcnt = 0; 38 int parencnt = 0; 39 40 typedef struct Keyword { 41 const char *word; 42 int sub; 43 int type; 44 } Keyword; 45 46 const Keyword keywords[] = { /* keep sorted: binary searched */ 47 { "BEGIN", XBEGIN, XBEGIN }, 48 { "END", XEND, XEND }, 49 { "NF", VARNF, VARNF }, 50 { "atan2", FATAN, BLTIN }, 51 { "break", BREAK, BREAK }, 52 { "close", CLOSE, CLOSE }, 53 { "continue", CONTINUE, CONTINUE }, 54 { "cos", FCOS, BLTIN }, 55 { "delete", DELETE, DELETE }, 56 { "do", DO, DO }, 57 { "else", ELSE, ELSE }, 58 { "exit", EXIT, EXIT }, 59 { "exp", FEXP, BLTIN }, 60 { "fflush", FFLUSH, BLTIN }, 61 { "for", FOR, FOR }, 62 { "func", FUNC, FUNC }, 63 { "function", FUNC, FUNC }, 64 { "getline", GETLINE, GETLINE }, 65 { "gsub", GSUB, GSUB }, 66 { "if", IF, IF }, 67 { "in", IN, IN }, 68 { "index", INDEX, INDEX }, 69 { "int", FINT, BLTIN }, 70 { "length", FLENGTH, BLTIN }, 71 { "log", FLOG, BLTIN }, 72 { "match", MATCHFCN, MATCHFCN }, 73 { "next", NEXT, NEXT }, 74 { "nextfile", NEXTFILE, NEXTFILE }, 75 { "print", PRINT, PRINT }, 76 { "printf", PRINTF, PRINTF }, 77 { "rand", FRAND, BLTIN }, 78 { "return", RETURN, RETURN }, 79 { "sin", FSIN, BLTIN }, 80 { "split", SPLIT, SPLIT }, 81 { "sprintf", SPRINTF, SPRINTF }, 82 { "sqrt", FSQRT, BLTIN }, 83 { "srand", FSRAND, BLTIN }, 84 { "sub", SUB, SUB }, 85 { "substr", SUBSTR, SUBSTR }, 86 { "system", FSYSTEM, BLTIN }, 87 { "tolower", FTOLOWER, BLTIN }, 88 { "toupper", FTOUPPER, BLTIN }, 89 { "while", WHILE, WHILE }, 90 }; 91 92 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 93 94 static int peek(void) 95 { 96 int c = input(); 97 unput(c); 98 return c; 99 } 100 101 static int gettok(char **pbuf, int *psz) /* get next input token */ 102 { 103 int c, retc; 104 char *buf = *pbuf; 105 int sz = *psz; 106 char *bp = buf; 107 108 c = input(); 109 if (c == 0) 110 return 0; 111 buf[0] = c; 112 buf[1] = 0; 113 if (!isalnum(c) && c != '.' && c != '_') 114 return c; 115 116 *bp++ = c; 117 if (isalpha(c) || c == '_') { /* it's a varname */ 118 for ( ; (c = input()) != 0; ) { 119 if (bp-buf >= sz) 120 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 121 FATAL( "out of space for name %.10s...", buf ); 122 if (isalnum(c) || c == '_') 123 *bp++ = c; 124 else { 125 *bp = 0; 126 unput(c); 127 break; 128 } 129 } 130 *bp = 0; 131 retc = 'a'; /* alphanumeric */ 132 } else { /* maybe it's a number, but could be . */ 133 char *rem; 134 /* read input until can't be a number */ 135 for ( ; (c = input()) != 0; ) { 136 if (bp-buf >= sz) 137 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 138 FATAL( "out of space for number %.10s...", buf ); 139 if (isdigit(c) || c == 'e' || c == 'E' 140 || c == '.' || c == '+' || c == '-') 141 *bp++ = c; 142 else { 143 unput(c); 144 break; 145 } 146 } 147 *bp = 0; 148 strtod(buf, &rem); /* parse the number */ 149 if (rem == buf) { /* it wasn't a valid number at all */ 150 buf[1] = 0; /* return one character as token */ 151 retc = buf[0]; /* character is its own type */ 152 unputstr(rem+1); /* put rest back for later */ 153 } else { /* some prefix was a number */ 154 unputstr(rem); /* put rest back for later */ 155 rem[0] = 0; /* truncate buf after number part */ 156 retc = '0'; /* type is number */ 157 } 158 } 159 *pbuf = buf; 160 *psz = sz; 161 return retc; 162 } 163 164 int word(char *); 165 int string(void); 166 int regexpr(void); 167 bool sc = false; /* true => return a } right now */ 168 bool reg = false; /* true => return a REGEXPR now */ 169 170 int yylex(void) 171 { 172 int c; 173 static char *buf = NULL; 174 static int bufsize = 5; /* BUG: setting this small causes core dump! */ 175 176 if (buf == NULL && (buf = malloc(bufsize)) == NULL) 177 FATAL( "out of space in yylex" ); 178 if (sc) { 179 sc = false; 180 RET('}'); 181 } 182 if (reg) { 183 reg = false; 184 return regexpr(); 185 } 186 for (;;) { 187 c = gettok(&buf, &bufsize); 188 if (c == 0) 189 return 0; 190 if (isalpha(c) || c == '_') 191 return word(buf); 192 if (isdigit(c)) { 193 char *cp = tostring(buf); 194 yylval.cp = setsymtab(buf, cp, atof(buf), CON|NUM, symtab); 195 free(cp); 196 /* should this also have STR set? */ 197 RET(NUMBER); 198 } 199 200 yylval.i = c; 201 switch (c) { 202 case '\n': /* {EOL} */ 203 lineno++; 204 RET(NL); 205 case '\r': /* assume \n is coming */ 206 case ' ': /* {WS}+ */ 207 case '\t': 208 break; 209 case '#': /* #.* strip comments */ 210 while ((c = input()) != '\n' && c != 0) 211 ; 212 unput(c); 213 /* 214 * Next line is a hack, itcompensates for 215 * unput's treatment of \n. 216 */ 217 lineno++; 218 break; 219 case ';': 220 RET(';'); 221 case '\\': 222 if (peek() == '\n') { 223 input(); 224 lineno++; 225 } else if (peek() == '\r') { 226 input(); input(); /* \n */ 227 lineno++; 228 } else { 229 RET(c); 230 } 231 break; 232 case '&': 233 if (peek() == '&') { 234 input(); RET(AND); 235 } else 236 RET('&'); 237 case '|': 238 if (peek() == '|') { 239 input(); RET(BOR); 240 } else 241 RET('|'); 242 case '!': 243 if (peek() == '=') { 244 input(); yylval.i = NE; RET(NE); 245 } else if (peek() == '~') { 246 input(); yylval.i = NOTMATCH; RET(MATCHOP); 247 } else 248 RET(NOT); 249 case '~': 250 yylval.i = MATCH; 251 RET(MATCHOP); 252 case '<': 253 if (peek() == '=') { 254 input(); yylval.i = LE; RET(LE); 255 } else { 256 yylval.i = LT; RET(LT); 257 } 258 case '=': 259 if (peek() == '=') { 260 input(); yylval.i = EQ; RET(EQ); 261 } else { 262 yylval.i = ASSIGN; RET(ASGNOP); 263 } 264 case '>': 265 if (peek() == '=') { 266 input(); yylval.i = GE; RET(GE); 267 } else if (peek() == '>') { 268 input(); yylval.i = APPEND; RET(APPEND); 269 } else { 270 yylval.i = GT; RET(GT); 271 } 272 case '+': 273 if (peek() == '+') { 274 input(); yylval.i = INCR; RET(INCR); 275 } else if (peek() == '=') { 276 input(); yylval.i = ADDEQ; RET(ASGNOP); 277 } else 278 RET('+'); 279 case '-': 280 if (peek() == '-') { 281 input(); yylval.i = DECR; RET(DECR); 282 } else if (peek() == '=') { 283 input(); yylval.i = SUBEQ; RET(ASGNOP); 284 } else 285 RET('-'); 286 case '*': 287 if (peek() == '=') { /* *= */ 288 input(); yylval.i = MULTEQ; RET(ASGNOP); 289 } else if (peek() == '*') { /* ** or **= */ 290 input(); /* eat 2nd * */ 291 if (peek() == '=') { 292 input(); yylval.i = POWEQ; RET(ASGNOP); 293 } else { 294 RET(POWER); 295 } 296 } else 297 RET('*'); 298 case '/': 299 RET('/'); 300 case '%': 301 if (peek() == '=') { 302 input(); yylval.i = MODEQ; RET(ASGNOP); 303 } else 304 RET('%'); 305 case '^': 306 if (peek() == '=') { 307 input(); yylval.i = POWEQ; RET(ASGNOP); 308 } else 309 RET(POWER); 310 311 case '$': 312 /* BUG: awkward, if not wrong */ 313 c = gettok(&buf, &bufsize); 314 if (isalpha(c)) { 315 if (strcmp(buf, "NF") == 0) { /* very special */ 316 unputstr("(NF)"); 317 RET(INDIRECT); 318 } 319 c = peek(); 320 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 321 unputstr(buf); 322 RET(INDIRECT); 323 } 324 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 325 RET(IVAR); 326 } else if (c == 0) { /* */ 327 SYNTAX( "unexpected end of input after $" ); 328 RET(';'); 329 } else { 330 unputstr(buf); 331 RET(INDIRECT); 332 } 333 334 case '}': 335 if (--bracecnt < 0) 336 SYNTAX( "extra }" ); 337 sc = true; 338 RET(';'); 339 case ']': 340 if (--brackcnt < 0) 341 SYNTAX( "extra ]" ); 342 RET(']'); 343 case ')': 344 if (--parencnt < 0) 345 SYNTAX( "extra )" ); 346 RET(')'); 347 case '{': 348 bracecnt++; 349 RET('{'); 350 case '[': 351 brackcnt++; 352 RET('['); 353 case '(': 354 parencnt++; 355 RET('('); 356 357 case '"': 358 return string(); /* BUG: should be like tran.c ? */ 359 360 default: 361 RET(c); 362 } 363 } 364 } 365 366 int string(void) 367 { 368 int c, n; 369 char *s, *bp; 370 static char *buf = NULL; 371 static int bufsz = 500; 372 373 if (buf == NULL && (buf = malloc(bufsz)) == NULL) 374 FATAL("out of space for strings"); 375 for (bp = buf; (c = input()) != '"'; ) { 376 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string")) 377 FATAL("out of space for string %.10s...", buf); 378 switch (c) { 379 case '\n': 380 case '\r': 381 case 0: 382 *bp = '\0'; 383 SYNTAX( "non-terminated string %.10s...", buf ); 384 if (c == 0) /* hopeless */ 385 FATAL( "giving up" ); 386 lineno++; 387 break; 388 case '\\': 389 c = input(); 390 switch (c) { 391 case '\n': break; 392 case '"': *bp++ = '"'; break; 393 case 'n': *bp++ = '\n'; break; 394 case 't': *bp++ = '\t'; break; 395 case 'f': *bp++ = '\f'; break; 396 case 'r': *bp++ = '\r'; break; 397 case 'b': *bp++ = '\b'; break; 398 case 'v': *bp++ = '\v'; break; 399 case 'a': *bp++ = '\a'; break; 400 case '\\': *bp++ = '\\'; break; 401 402 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 403 case '3': case '4': case '5': case '6': case '7': 404 n = c - '0'; 405 if ((c = peek()) >= '0' && c < '8') { 406 n = 8 * n + input() - '0'; 407 if ((c = peek()) >= '0' && c < '8') 408 n = 8 * n + input() - '0'; 409 } 410 *bp++ = n; 411 break; 412 413 case 'x': /* hex \x0-9a-fA-F + */ 414 { char xbuf[100], *px; 415 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) { 416 if (isdigit(c) 417 || (c >= 'a' && c <= 'f') 418 || (c >= 'A' && c <= 'F')) 419 *px++ = c; 420 else 421 break; 422 } 423 *px = 0; 424 unput(c); 425 sscanf(xbuf, "%x", (unsigned int *) &n); 426 *bp++ = n; 427 break; 428 } 429 430 default: 431 *bp++ = c; 432 break; 433 } 434 break; 435 default: 436 *bp++ = c; 437 break; 438 } 439 } 440 *bp = 0; 441 s = tostring(buf); 442 *bp++ = ' '; *bp++ = '\0'; 443 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 444 free(s); 445 RET(STRING); 446 } 447 448 449 static int binsearch(char *w, const Keyword *kp, int n) 450 { 451 int cond, low, mid, high; 452 453 low = 0; 454 high = n - 1; 455 while (low <= high) { 456 mid = (low + high) / 2; 457 if ((cond = strcmp(w, kp[mid].word)) < 0) 458 high = mid - 1; 459 else if (cond > 0) 460 low = mid + 1; 461 else 462 return mid; 463 } 464 return -1; 465 } 466 467 int word(char *w) 468 { 469 const Keyword *kp; 470 int c, n; 471 472 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 473 if (n != -1) { /* found in table */ 474 kp = keywords + n; 475 yylval.i = kp->sub; 476 switch (kp->type) { /* special handling */ 477 case BLTIN: 478 if (kp->sub == FSYSTEM && safe) 479 SYNTAX( "system is unsafe" ); 480 RET(kp->type); 481 case FUNC: 482 if (infunc) 483 SYNTAX( "illegal nested function" ); 484 RET(kp->type); 485 case RETURN: 486 if (!infunc) 487 SYNTAX( "return not in function" ); 488 RET(kp->type); 489 case VARNF: 490 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 491 RET(VARNF); 492 default: 493 RET(kp->type); 494 } 495 } 496 c = peek(); /* look for '(' */ 497 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 498 yylval.i = n; 499 RET(ARG); 500 } else { 501 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 502 if (c == '(') { 503 RET(CALL); 504 } else { 505 RET(VAR); 506 } 507 } 508 } 509 510 void startreg(void) /* next call to yylex will return a regular expression */ 511 { 512 reg = true; 513 } 514 515 int regexpr(void) 516 { 517 int c; 518 static char *buf = NULL; 519 static int bufsz = 500; 520 char *bp; 521 522 if (buf == NULL && (buf = malloc(bufsz)) == NULL) 523 FATAL("out of space for rex expr"); 524 bp = buf; 525 for ( ; (c = input()) != '/' && c != 0; ) { 526 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr")) 527 FATAL("out of space for reg expr %.10s...", buf); 528 if (c == '\n') { 529 *bp = '\0'; 530 SYNTAX( "newline in regular expression %.10s...", buf ); 531 unput('\n'); 532 break; 533 } else if (c == '\\') { 534 *bp++ = '\\'; 535 *bp++ = input(); 536 } else { 537 *bp++ = c; 538 } 539 } 540 *bp = 0; 541 if (c == 0) 542 SYNTAX("non-terminated regular expression %.10s...", buf); 543 yylval.s = tostring(buf); 544 unput('/'); 545 RET(REGEXPR); 546 } 547 548 /* low-level lexical stuff, sort of inherited from lex */ 549 550 char ebuf[300]; 551 char *ep = ebuf; 552 char yysbuf[100]; /* pushback buffer */ 553 char *yysptr = yysbuf; 554 FILE *yyin = NULL; 555 556 int input(void) /* get next lexical input character */ 557 { 558 int c; 559 extern char *lexprog; 560 561 if (yysptr > yysbuf) 562 c = (uschar)*--yysptr; 563 else if (lexprog != NULL) { /* awk '...' */ 564 if ((c = (uschar)*lexprog) != 0) 565 lexprog++; 566 } else /* awk -f ... */ 567 c = pgetc(); 568 if (c == EOF) 569 c = 0; 570 if (ep >= ebuf + sizeof ebuf) 571 ep = ebuf; 572 *ep = c; 573 if (c != 0) { 574 ep++; 575 } 576 return (c); 577 } 578 579 void unput(int c) /* put lexical character back on input */ 580 { 581 if (c == '\n') 582 lineno--; 583 if (yysptr >= yysbuf + sizeof(yysbuf)) 584 FATAL("pushed back too much: %.20s...", yysbuf); 585 *yysptr++ = c; 586 if (--ep < ebuf) 587 ep = ebuf + sizeof(ebuf) - 1; 588 } 589 590 void unputstr(const char *s) /* put a string back on input */ 591 { 592 int i; 593 594 for (i = strlen(s)-1; i >= 0; i--) 595 unput(s[i]); 596 } 597