1 /* $OpenBSD: lex.c,v 1.25 2020/07/30 17:45:44 millert Exp $ */ 2 /**************************************************************** 3 Copyright (C) Lucent Technologies 1997 4 All Rights Reserved 5 6 Permission to use, copy, modify, and distribute this software and 7 its documentation for any purpose and without fee is hereby 8 granted, provided that the above copyright notice appear in all 9 copies and that both that the copyright notice and this 10 permission notice and warranty disclaimer appear in supporting 11 documentation, and that the name Lucent Technologies or any of 12 its entities not be used in advertising or publicity pertaining 13 to distribution of the software without specific, written prior 14 permission. 15 16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 23 THIS SOFTWARE. 24 ****************************************************************/ 25 26 #include <stdio.h> 27 #include <stdlib.h> 28 #include <string.h> 29 #include <ctype.h> 30 #include "awk.h" 31 #include "awkgram.tab.h" 32 33 extern YYSTYPE yylval; 34 extern bool infunc; 35 36 int lineno = 1; 37 int bracecnt = 0; 38 int brackcnt = 0; 39 int parencnt = 0; 40 41 typedef struct Keyword { 42 const char *word; 43 int sub; 44 int type; 45 } Keyword; 46 47 const Keyword keywords[] = { /* keep sorted: binary searched */ 48 { "BEGIN", XBEGIN, XBEGIN }, 49 { "END", XEND, XEND }, 50 { "NF", VARNF, VARNF }, 51 { "and", FAND, BLTIN }, 52 { "atan2", FATAN, BLTIN }, 53 { "break", BREAK, BREAK }, 54 { "close", CLOSE, CLOSE }, 55 { "compl", FCOMPL, BLTIN }, 56 { "continue", CONTINUE, CONTINUE }, 57 { "cos", FCOS, BLTIN }, 58 { "delete", DELETE, DELETE }, 59 { "do", DO, DO }, 60 { "else", ELSE, ELSE }, 61 { "exit", EXIT, EXIT }, 62 { "exp", FEXP, BLTIN }, 63 { "fflush", FFLUSH, BLTIN }, 64 { "for", FOR, FOR }, 65 { "func", FUNC, FUNC }, 66 { "function", FUNC, FUNC }, 67 { "gensub", GENSUB, GENSUB }, 68 { "getline", GETLINE, GETLINE }, 69 { "gsub", GSUB, GSUB }, 70 { "if", IF, IF }, 71 { "in", IN, IN }, 72 { "index", INDEX, INDEX }, 73 { "int", FINT, BLTIN }, 74 { "length", FLENGTH, BLTIN }, 75 { "log", FLOG, BLTIN }, 76 { "lshift", FLSHIFT, BLTIN }, 77 { "match", MATCHFCN, MATCHFCN }, 78 { "next", NEXT, NEXT }, 79 { "nextfile", NEXTFILE, NEXTFILE }, 80 { "or", FFOR, BLTIN }, 81 { "print", PRINT, PRINT }, 82 { "printf", PRINTF, PRINTF }, 83 { "rand", FRAND, BLTIN }, 84 { "return", RETURN, RETURN }, 85 { "rshift", FRSHIFT, BLTIN }, 86 { "sin", FSIN, BLTIN }, 87 { "split", SPLIT, SPLIT }, 88 { "sprintf", SPRINTF, SPRINTF }, 89 { "sqrt", FSQRT, BLTIN }, 90 { "srand", FSRAND, BLTIN }, 91 { "strftime", FSTRFTIME, BLTIN }, 92 { "sub", SUB, SUB }, 93 { "substr", SUBSTR, SUBSTR }, 94 { "system", FSYSTEM, BLTIN }, 95 { "systime", FSYSTIME, BLTIN }, 96 { "tolower", FTOLOWER, BLTIN }, 97 { "toupper", FTOUPPER, BLTIN }, 98 { "while", WHILE, WHILE }, 99 { "xor", FXOR, BLTIN }, 100 }; 101 102 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 103 104 static int peek(void) 105 { 106 int c = input(); 107 unput(c); 108 return c; 109 } 110 111 static int gettok(char **pbuf, int *psz) /* get next input token */ 112 { 113 int c, retc; 114 char *buf = *pbuf; 115 int sz = *psz; 116 char *bp = buf; 117 118 c = input(); 119 if (c == 0) 120 return 0; 121 buf[0] = c; 122 buf[1] = 0; 123 if (!isalnum(c) && c != '.' && c != '_') 124 return c; 125 126 *bp++ = c; 127 if (isalpha(c) || c == '_') { /* it's a varname */ 128 for ( ; (c = input()) != 0; ) { 129 if (bp-buf >= sz) 130 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 131 FATAL( "out of space for name %.10s...", buf ); 132 if (isalnum(c) || c == '_') 133 *bp++ = c; 134 else { 135 *bp = 0; 136 unput(c); 137 break; 138 } 139 } 140 *bp = 0; 141 retc = 'a'; /* alphanumeric */ 142 } else { /* maybe it's a number, but could be . */ 143 char *rem; 144 /* read input until can't be a number */ 145 for ( ; (c = input()) != 0; ) { 146 if (bp-buf >= sz) 147 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 148 FATAL( "out of space for number %.10s...", buf ); 149 if (isdigit(c) || c == 'e' || c == 'E' 150 || c == '.' || c == '+' || c == '-') 151 *bp++ = c; 152 else { 153 unput(c); 154 break; 155 } 156 } 157 *bp = 0; 158 strtod(buf, &rem); /* parse the number */ 159 if (rem == buf) { /* it wasn't a valid number at all */ 160 buf[1] = 0; /* return one character as token */ 161 retc = (uschar)buf[0]; /* character is its own type */ 162 unputstr(rem+1); /* put rest back for later */ 163 } else { /* some prefix was a number */ 164 unputstr(rem); /* put rest back for later */ 165 rem[0] = 0; /* truncate buf after number part */ 166 retc = '0'; /* type is number */ 167 } 168 } 169 *pbuf = buf; 170 *psz = sz; 171 return retc; 172 } 173 174 int word(char *); 175 int string(void); 176 int regexpr(void); 177 bool sc = false; /* true => return a } right now */ 178 bool reg = false; /* true => return a REGEXPR now */ 179 180 int yylex(void) 181 { 182 int c; 183 static char *buf = NULL; 184 static int bufsize = 5; /* BUG: setting this small causes core dump! */ 185 186 if (buf == NULL && (buf = malloc(bufsize)) == NULL) 187 FATAL( "out of space in yylex" ); 188 if (sc) { 189 sc = false; 190 RET('}'); 191 } 192 if (reg) { 193 reg = false; 194 return regexpr(); 195 } 196 for (;;) { 197 c = gettok(&buf, &bufsize); 198 if (c == 0) 199 return 0; 200 if (isalpha(c) || c == '_') 201 return word(buf); 202 if (isdigit(c)) { 203 char *cp = tostring(buf); 204 yylval.cp = setsymtab(buf, cp, atof(buf), CON|NUM, symtab); 205 free(cp); 206 /* should this also have STR set? */ 207 RET(NUMBER); 208 } 209 210 yylval.i = c; 211 switch (c) { 212 case '\n': /* {EOL} */ 213 lineno++; 214 RET(NL); 215 case '\r': /* assume \n is coming */ 216 case ' ': /* {WS}+ */ 217 case '\t': 218 break; 219 case '#': /* #.* strip comments */ 220 while ((c = input()) != '\n' && c != 0) 221 ; 222 unput(c); 223 /* 224 * Next line is a hack, itcompensates for 225 * unput's treatment of \n. 226 */ 227 lineno++; 228 break; 229 case ';': 230 RET(';'); 231 case '\\': 232 if (peek() == '\n') { 233 input(); 234 lineno++; 235 } else if (peek() == '\r') { 236 input(); input(); /* \n */ 237 lineno++; 238 } else { 239 RET(c); 240 } 241 break; 242 case '&': 243 if (peek() == '&') { 244 input(); RET(AND); 245 } else 246 RET('&'); 247 case '|': 248 if (peek() == '|') { 249 input(); RET(BOR); 250 } else 251 RET('|'); 252 case '!': 253 if (peek() == '=') { 254 input(); yylval.i = NE; RET(NE); 255 } else if (peek() == '~') { 256 input(); yylval.i = NOTMATCH; RET(MATCHOP); 257 } else 258 RET(NOT); 259 case '~': 260 yylval.i = MATCH; 261 RET(MATCHOP); 262 case '<': 263 if (peek() == '=') { 264 input(); yylval.i = LE; RET(LE); 265 } else { 266 yylval.i = LT; RET(LT); 267 } 268 case '=': 269 if (peek() == '=') { 270 input(); yylval.i = EQ; RET(EQ); 271 } else { 272 yylval.i = ASSIGN; RET(ASGNOP); 273 } 274 case '>': 275 if (peek() == '=') { 276 input(); yylval.i = GE; RET(GE); 277 } else if (peek() == '>') { 278 input(); yylval.i = APPEND; RET(APPEND); 279 } else { 280 yylval.i = GT; RET(GT); 281 } 282 case '+': 283 if (peek() == '+') { 284 input(); yylval.i = INCR; RET(INCR); 285 } else if (peek() == '=') { 286 input(); yylval.i = ADDEQ; RET(ASGNOP); 287 } else 288 RET('+'); 289 case '-': 290 if (peek() == '-') { 291 input(); yylval.i = DECR; RET(DECR); 292 } else if (peek() == '=') { 293 input(); yylval.i = SUBEQ; RET(ASGNOP); 294 } else 295 RET('-'); 296 case '*': 297 if (peek() == '=') { /* *= */ 298 input(); yylval.i = MULTEQ; RET(ASGNOP); 299 } else if (peek() == '*') { /* ** or **= */ 300 input(); /* eat 2nd * */ 301 if (peek() == '=') { 302 input(); yylval.i = POWEQ; RET(ASGNOP); 303 } else { 304 RET(POWER); 305 } 306 } else 307 RET('*'); 308 case '/': 309 RET('/'); 310 case '%': 311 if (peek() == '=') { 312 input(); yylval.i = MODEQ; RET(ASGNOP); 313 } else 314 RET('%'); 315 case '^': 316 if (peek() == '=') { 317 input(); yylval.i = POWEQ; RET(ASGNOP); 318 } else 319 RET(POWER); 320 321 case '$': 322 /* BUG: awkward, if not wrong */ 323 c = gettok(&buf, &bufsize); 324 if (isalpha(c)) { 325 if (strcmp(buf, "NF") == 0) { /* very special */ 326 unputstr("(NF)"); 327 RET(INDIRECT); 328 } 329 c = peek(); 330 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 331 unputstr(buf); 332 RET(INDIRECT); 333 } 334 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 335 RET(IVAR); 336 } else if (c == 0) { /* */ 337 SYNTAX( "unexpected end of input after $" ); 338 RET(';'); 339 } else { 340 unputstr(buf); 341 RET(INDIRECT); 342 } 343 344 case '}': 345 if (--bracecnt < 0) 346 SYNTAX( "extra }" ); 347 sc = true; 348 RET(';'); 349 case ']': 350 if (--brackcnt < 0) 351 SYNTAX( "extra ]" ); 352 RET(']'); 353 case ')': 354 if (--parencnt < 0) 355 SYNTAX( "extra )" ); 356 RET(')'); 357 case '{': 358 bracecnt++; 359 RET('{'); 360 case '[': 361 brackcnt++; 362 RET('['); 363 case '(': 364 parencnt++; 365 RET('('); 366 367 case '"': 368 return string(); /* BUG: should be like tran.c ? */ 369 370 default: 371 RET(c); 372 } 373 } 374 } 375 376 int string(void) 377 { 378 int c, n; 379 char *s, *bp; 380 static char *buf = NULL; 381 static int bufsz = 500; 382 383 if (buf == NULL && (buf = malloc(bufsz)) == NULL) 384 FATAL("out of space for strings"); 385 for (bp = buf; (c = input()) != '"'; ) { 386 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string")) 387 FATAL("out of space for string %.10s...", buf); 388 switch (c) { 389 case '\n': 390 case '\r': 391 case 0: 392 *bp = '\0'; 393 SYNTAX( "non-terminated string %.10s...", buf ); 394 if (c == 0) /* hopeless */ 395 FATAL( "giving up" ); 396 lineno++; 397 break; 398 case '\\': 399 c = input(); 400 switch (c) { 401 case '\n': break; 402 case '"': *bp++ = '"'; break; 403 case 'n': *bp++ = '\n'; break; 404 case 't': *bp++ = '\t'; break; 405 case 'f': *bp++ = '\f'; break; 406 case 'r': *bp++ = '\r'; break; 407 case 'b': *bp++ = '\b'; break; 408 case 'v': *bp++ = '\v'; break; 409 case 'a': *bp++ = '\a'; break; 410 case '\\': *bp++ = '\\'; break; 411 412 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 413 case '3': case '4': case '5': case '6': case '7': 414 n = c - '0'; 415 if ((c = peek()) >= '0' && c < '8') { 416 n = 8 * n + input() - '0'; 417 if ((c = peek()) >= '0' && c < '8') 418 n = 8 * n + input() - '0'; 419 } 420 *bp++ = n; 421 break; 422 423 case 'x': /* hex \x0-9a-fA-F + */ 424 { char xbuf[100], *px; 425 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) { 426 if (isdigit(c) 427 || (c >= 'a' && c <= 'f') 428 || (c >= 'A' && c <= 'F')) 429 *px++ = c; 430 else 431 break; 432 } 433 *px = 0; 434 unput(c); 435 sscanf(xbuf, "%x", (unsigned int *) &n); 436 *bp++ = n; 437 break; 438 } 439 440 default: 441 *bp++ = c; 442 break; 443 } 444 break; 445 default: 446 *bp++ = c; 447 break; 448 } 449 } 450 *bp = 0; 451 s = tostring(buf); 452 *bp++ = ' '; *bp++ = '\0'; 453 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 454 free(s); 455 RET(STRING); 456 } 457 458 459 static int binsearch(char *w, const Keyword *kp, int n) 460 { 461 int cond, low, mid, high; 462 463 low = 0; 464 high = n - 1; 465 while (low <= high) { 466 mid = (low + high) / 2; 467 if ((cond = strcmp(w, kp[mid].word)) < 0) 468 high = mid - 1; 469 else if (cond > 0) 470 low = mid + 1; 471 else 472 return mid; 473 } 474 return -1; 475 } 476 477 int word(char *w) 478 { 479 const Keyword *kp; 480 int c, n; 481 482 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 483 if (n != -1) { /* found in table */ 484 kp = keywords + n; 485 yylval.i = kp->sub; 486 switch (kp->type) { /* special handling */ 487 case BLTIN: 488 if (kp->sub == FSYSTEM && safe) 489 SYNTAX( "system is unsafe" ); 490 RET(kp->type); 491 case FUNC: 492 if (infunc) 493 SYNTAX( "illegal nested function" ); 494 RET(kp->type); 495 case RETURN: 496 if (!infunc) 497 SYNTAX( "return not in function" ); 498 RET(kp->type); 499 case VARNF: 500 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 501 RET(VARNF); 502 default: 503 RET(kp->type); 504 } 505 } 506 c = peek(); /* look for '(' */ 507 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 508 yylval.i = n; 509 RET(ARG); 510 } else { 511 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 512 if (c == '(') { 513 RET(CALL); 514 } else { 515 RET(VAR); 516 } 517 } 518 } 519 520 void startreg(void) /* next call to yylex will return a regular expression */ 521 { 522 reg = true; 523 } 524 525 int regexpr(void) 526 { 527 int c, openclass = 0; 528 static char *buf = NULL; 529 static int bufsz = 500; 530 char *bp, *cstart; 531 532 if (buf == NULL && (buf = malloc(bufsz)) == NULL) 533 FATAL("out of space for rex expr"); 534 bp = buf; 535 for ( ; ((c = input()) != '/' || openclass > 0) && c != 0; ) { 536 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr")) 537 FATAL("out of space for reg expr %.10s...", buf); 538 if (c == '\n') { 539 *bp = '\0'; 540 SYNTAX( "newline in regular expression %.10s...", buf ); 541 unput('\n'); 542 break; 543 } else if (c == '\\') { 544 *bp++ = '\\'; 545 *bp++ = input(); 546 } else { 547 /* 548 * POSIX requires a slash in a regexp to be escaped, 549 * other awks don't require it to be escaped inside 550 * a character class. 551 */ 552 if (!do_posix) { 553 if (c == '[') { 554 int nextc = peek(); 555 if (openclass == 0 || nextc == ':' || 556 nextc == '.' || nextc == '=') { 557 if (++openclass == 1) 558 cstart = bp; 559 } 560 } else if (c == ']' && openclass > 0) { 561 /* 562 * A ']' as the first char in a 563 * class is treated literally. 564 */ 565 if (cstart != bp - 1 && 566 (cstart != bp - 2 || bp[-1] != '^')) 567 openclass--; 568 } 569 } 570 *bp++ = c; 571 } 572 } 573 *bp = 0; 574 if (c == 0) 575 SYNTAX("non-terminated regular expression %.10s...", buf); 576 yylval.s = tostring(buf); 577 unput('/'); 578 RET(REGEXPR); 579 } 580 581 /* low-level lexical stuff, sort of inherited from lex */ 582 583 char ebuf[300]; 584 char *ep = ebuf; 585 char yysbuf[100]; /* pushback buffer */ 586 char *yysptr = yysbuf; 587 FILE *yyin = NULL; 588 589 int input(void) /* get next lexical input character */ 590 { 591 int c; 592 extern char *lexprog; 593 594 if (yysptr > yysbuf) 595 c = (uschar)*--yysptr; 596 else if (lexprog != NULL) { /* awk '...' */ 597 if ((c = (uschar)*lexprog) != 0) 598 lexprog++; 599 } else /* awk -f ... */ 600 c = pgetc(); 601 if (c == EOF) 602 c = 0; 603 if (ep >= ebuf + sizeof ebuf) 604 ep = ebuf; 605 *ep = c; 606 if (c != 0) { 607 ep++; 608 } 609 return (c); 610 } 611 612 void unput(int c) /* put lexical character back on input */ 613 { 614 if (c == '\n') 615 lineno--; 616 if (yysptr >= yysbuf + sizeof(yysbuf)) 617 FATAL("pushed back too much: %.20s...", yysbuf); 618 *yysptr++ = c; 619 if (--ep < ebuf) 620 ep = ebuf + sizeof(ebuf) - 1; 621 } 622 623 void unputstr(const char *s) /* put a string back on input */ 624 { 625 int i; 626 627 for (i = strlen(s)-1; i >= 0; i--) 628 unput(s[i]); 629 } 630