1 /* $OpenBSD: lex.c,v 1.27 2020/12/09 20:00:11 millert Exp $ */ 2 /**************************************************************** 3 Copyright (C) Lucent Technologies 1997 4 All Rights Reserved 5 6 Permission to use, copy, modify, and distribute this software and 7 its documentation for any purpose and without fee is hereby 8 granted, provided that the above copyright notice appear in all 9 copies and that both that the copyright notice and this 10 permission notice and warranty disclaimer appear in supporting 11 documentation, and that the name Lucent Technologies or any of 12 its entities not be used in advertising or publicity pertaining 13 to distribution of the software without specific, written prior 14 permission. 15 16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 23 THIS SOFTWARE. 24 ****************************************************************/ 25 26 #include <stdio.h> 27 #include <stdlib.h> 28 #include <string.h> 29 #include <ctype.h> 30 #include "awk.h" 31 #include "awkgram.tab.h" 32 33 extern YYSTYPE yylval; 34 extern bool infunc; 35 36 int lineno = 1; 37 int bracecnt = 0; 38 int brackcnt = 0; 39 int parencnt = 0; 40 41 typedef struct Keyword { 42 const char *word; 43 int sub; 44 int type; 45 } Keyword; 46 47 const Keyword keywords[] = { /* keep sorted: binary searched */ 48 { "BEGIN", XBEGIN, XBEGIN }, 49 { "END", XEND, XEND }, 50 { "NF", VARNF, VARNF }, 51 { "and", FAND, BLTIN }, 52 { "atan2", FATAN, BLTIN }, 53 { "break", BREAK, BREAK }, 54 { "close", CLOSE, CLOSE }, 55 { "compl", FCOMPL, BLTIN }, 56 { "continue", CONTINUE, CONTINUE }, 57 { "cos", FCOS, BLTIN }, 58 { "delete", DELETE, DELETE }, 59 { "do", DO, DO }, 60 { "else", ELSE, ELSE }, 61 { "exit", EXIT, EXIT }, 62 { "exp", FEXP, BLTIN }, 63 { "fflush", FFLUSH, BLTIN }, 64 { "for", FOR, FOR }, 65 { "func", FUNC, FUNC }, 66 { "function", FUNC, FUNC }, 67 { "gensub", GENSUB, GENSUB }, 68 { "getline", GETLINE, GETLINE }, 69 { "gsub", GSUB, GSUB }, 70 { "if", IF, IF }, 71 { "in", IN, IN }, 72 { "index", INDEX, INDEX }, 73 { "int", FINT, BLTIN }, 74 { "length", FLENGTH, BLTIN }, 75 { "log", FLOG, BLTIN }, 76 { "lshift", FLSHIFT, BLTIN }, 77 { "match", MATCHFCN, MATCHFCN }, 78 { "mktime", FMKTIME, BLTIN }, 79 { "next", NEXT, NEXT }, 80 { "nextfile", NEXTFILE, NEXTFILE }, 81 { "or", FFOR, BLTIN }, 82 { "print", PRINT, PRINT }, 83 { "printf", PRINTF, PRINTF }, 84 { "rand", FRAND, BLTIN }, 85 { "return", RETURN, RETURN }, 86 { "rshift", FRSHIFT, BLTIN }, 87 { "sin", FSIN, BLTIN }, 88 { "split", SPLIT, SPLIT }, 89 { "sprintf", SPRINTF, SPRINTF }, 90 { "sqrt", FSQRT, BLTIN }, 91 { "srand", FSRAND, BLTIN }, 92 { "strftime", FSTRFTIME, BLTIN }, 93 { "sub", SUB, SUB }, 94 { "substr", SUBSTR, SUBSTR }, 95 { "system", FSYSTEM, BLTIN }, 96 { "systime", FSYSTIME, BLTIN }, 97 { "tolower", FTOLOWER, BLTIN }, 98 { "toupper", FTOUPPER, BLTIN }, 99 { "while", WHILE, WHILE }, 100 { "xor", FXOR, BLTIN }, 101 }; 102 103 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 104 105 static int peek(void) 106 { 107 int c = input(); 108 unput(c); 109 return c; 110 } 111 112 static int gettok(char **pbuf, int *psz) /* get next input token */ 113 { 114 int c, retc; 115 char *buf = *pbuf; 116 int sz = *psz; 117 char *bp = buf; 118 119 c = input(); 120 if (c == 0) 121 return 0; 122 buf[0] = c; 123 buf[1] = 0; 124 if (!isalnum(c) && c != '.' && c != '_') 125 return c; 126 127 *bp++ = c; 128 if (isalpha(c) || c == '_') { /* it's a varname */ 129 for ( ; (c = input()) != 0; ) { 130 if (bp-buf >= sz) 131 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 132 FATAL( "out of space for name %.10s...", buf ); 133 if (isalnum(c) || c == '_') 134 *bp++ = c; 135 else { 136 *bp = 0; 137 unput(c); 138 break; 139 } 140 } 141 *bp = 0; 142 retc = 'a'; /* alphanumeric */ 143 } else { /* maybe it's a number, but could be . */ 144 char *rem; 145 /* read input until can't be a number */ 146 for ( ; (c = input()) != 0; ) { 147 if (bp-buf >= sz) 148 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 149 FATAL( "out of space for number %.10s...", buf ); 150 if (isdigit(c) || c == 'e' || c == 'E' 151 || c == '.' || c == '+' || c == '-') 152 *bp++ = c; 153 else { 154 unput(c); 155 break; 156 } 157 } 158 *bp = 0; 159 strtod(buf, &rem); /* parse the number */ 160 if (rem == buf) { /* it wasn't a valid number at all */ 161 buf[1] = 0; /* return one character as token */ 162 retc = (uschar)buf[0]; /* character is its own type */ 163 unputstr(rem+1); /* put rest back for later */ 164 } else { /* some prefix was a number */ 165 unputstr(rem); /* put rest back for later */ 166 rem[0] = 0; /* truncate buf after number part */ 167 retc = '0'; /* type is number */ 168 } 169 } 170 *pbuf = buf; 171 *psz = sz; 172 return retc; 173 } 174 175 int word(char *); 176 int string(void); 177 int regexpr(void); 178 bool sc = false; /* true => return a } right now */ 179 bool reg = false; /* true => return a REGEXPR now */ 180 181 int yylex(void) 182 { 183 int c; 184 static char *buf = NULL; 185 static int bufsize = 5; /* BUG: setting this small causes core dump! */ 186 187 if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL) 188 FATAL( "out of space in yylex" ); 189 if (sc) { 190 sc = false; 191 RET('}'); 192 } 193 if (reg) { 194 reg = false; 195 return regexpr(); 196 } 197 for (;;) { 198 c = gettok(&buf, &bufsize); 199 if (c == 0) 200 return 0; 201 if (isalpha(c) || c == '_') 202 return word(buf); 203 if (isdigit(c)) { 204 char *cp = tostring(buf); 205 double result; 206 207 if (is_number(cp, & result)) 208 yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab); 209 else 210 yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab); 211 free(cp); 212 /* should this also have STR set? */ 213 RET(NUMBER); 214 } 215 216 yylval.i = c; 217 switch (c) { 218 case '\n': /* {EOL} */ 219 lineno++; 220 RET(NL); 221 case '\r': /* assume \n is coming */ 222 case ' ': /* {WS}+ */ 223 case '\t': 224 break; 225 case '#': /* #.* strip comments */ 226 while ((c = input()) != '\n' && c != 0) 227 ; 228 unput(c); 229 /* 230 * Next line is a hack, itcompensates for 231 * unput's treatment of \n. 232 */ 233 lineno++; 234 break; 235 case ';': 236 RET(';'); 237 case '\\': 238 if (peek() == '\n') { 239 input(); 240 lineno++; 241 } else if (peek() == '\r') { 242 input(); input(); /* \n */ 243 lineno++; 244 } else { 245 RET(c); 246 } 247 break; 248 case '&': 249 if (peek() == '&') { 250 input(); RET(AND); 251 } else 252 RET('&'); 253 case '|': 254 if (peek() == '|') { 255 input(); RET(BOR); 256 } else 257 RET('|'); 258 case '!': 259 if (peek() == '=') { 260 input(); yylval.i = NE; RET(NE); 261 } else if (peek() == '~') { 262 input(); yylval.i = NOTMATCH; RET(MATCHOP); 263 } else 264 RET(NOT); 265 case '~': 266 yylval.i = MATCH; 267 RET(MATCHOP); 268 case '<': 269 if (peek() == '=') { 270 input(); yylval.i = LE; RET(LE); 271 } else { 272 yylval.i = LT; RET(LT); 273 } 274 case '=': 275 if (peek() == '=') { 276 input(); yylval.i = EQ; RET(EQ); 277 } else { 278 yylval.i = ASSIGN; RET(ASGNOP); 279 } 280 case '>': 281 if (peek() == '=') { 282 input(); yylval.i = GE; RET(GE); 283 } else if (peek() == '>') { 284 input(); yylval.i = APPEND; RET(APPEND); 285 } else { 286 yylval.i = GT; RET(GT); 287 } 288 case '+': 289 if (peek() == '+') { 290 input(); yylval.i = INCR; RET(INCR); 291 } else if (peek() == '=') { 292 input(); yylval.i = ADDEQ; RET(ASGNOP); 293 } else 294 RET('+'); 295 case '-': 296 if (peek() == '-') { 297 input(); yylval.i = DECR; RET(DECR); 298 } else if (peek() == '=') { 299 input(); yylval.i = SUBEQ; RET(ASGNOP); 300 } else 301 RET('-'); 302 case '*': 303 if (peek() == '=') { /* *= */ 304 input(); yylval.i = MULTEQ; RET(ASGNOP); 305 } else if (peek() == '*') { /* ** or **= */ 306 input(); /* eat 2nd * */ 307 if (peek() == '=') { 308 input(); yylval.i = POWEQ; RET(ASGNOP); 309 } else { 310 RET(POWER); 311 } 312 } else 313 RET('*'); 314 case '/': 315 RET('/'); 316 case '%': 317 if (peek() == '=') { 318 input(); yylval.i = MODEQ; RET(ASGNOP); 319 } else 320 RET('%'); 321 case '^': 322 if (peek() == '=') { 323 input(); yylval.i = POWEQ; RET(ASGNOP); 324 } else 325 RET(POWER); 326 327 case '$': 328 /* BUG: awkward, if not wrong */ 329 c = gettok(&buf, &bufsize); 330 if (isalpha(c)) { 331 if (strcmp(buf, "NF") == 0) { /* very special */ 332 unputstr("(NF)"); 333 RET(INDIRECT); 334 } 335 c = peek(); 336 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 337 unputstr(buf); 338 RET(INDIRECT); 339 } 340 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 341 RET(IVAR); 342 } else if (c == 0) { /* */ 343 SYNTAX( "unexpected end of input after $" ); 344 RET(';'); 345 } else { 346 unputstr(buf); 347 RET(INDIRECT); 348 } 349 350 case '}': 351 if (--bracecnt < 0) 352 SYNTAX( "extra }" ); 353 sc = true; 354 RET(';'); 355 case ']': 356 if (--brackcnt < 0) 357 SYNTAX( "extra ]" ); 358 RET(']'); 359 case ')': 360 if (--parencnt < 0) 361 SYNTAX( "extra )" ); 362 RET(')'); 363 case '{': 364 bracecnt++; 365 RET('{'); 366 case '[': 367 brackcnt++; 368 RET('['); 369 case '(': 370 parencnt++; 371 RET('('); 372 373 case '"': 374 return string(); /* BUG: should be like tran.c ? */ 375 376 default: 377 RET(c); 378 } 379 } 380 } 381 382 int string(void) 383 { 384 int c, n; 385 char *s, *bp; 386 static char *buf = NULL; 387 static int bufsz = 500; 388 389 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 390 FATAL("out of space for strings"); 391 for (bp = buf; (c = input()) != '"'; ) { 392 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string")) 393 FATAL("out of space for string %.10s...", buf); 394 switch (c) { 395 case '\n': 396 case '\r': 397 case 0: 398 *bp = '\0'; 399 SYNTAX( "non-terminated string %.10s...", buf ); 400 if (c == 0) /* hopeless */ 401 FATAL( "giving up" ); 402 lineno++; 403 break; 404 case '\\': 405 c = input(); 406 switch (c) { 407 case '\n': break; 408 case '"': *bp++ = '"'; break; 409 case 'n': *bp++ = '\n'; break; 410 case 't': *bp++ = '\t'; break; 411 case 'f': *bp++ = '\f'; break; 412 case 'r': *bp++ = '\r'; break; 413 case 'b': *bp++ = '\b'; break; 414 case 'v': *bp++ = '\v'; break; 415 case 'a': *bp++ = '\a'; break; 416 case '\\': *bp++ = '\\'; break; 417 418 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 419 case '3': case '4': case '5': case '6': case '7': 420 n = c - '0'; 421 if ((c = peek()) >= '0' && c < '8') { 422 n = 8 * n + input() - '0'; 423 if ((c = peek()) >= '0' && c < '8') 424 n = 8 * n + input() - '0'; 425 } 426 *bp++ = n; 427 break; 428 429 case 'x': /* hex \x0-9a-fA-F + */ 430 { char xbuf[100], *px; 431 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) { 432 if (isdigit(c) 433 || (c >= 'a' && c <= 'f') 434 || (c >= 'A' && c <= 'F')) 435 *px++ = c; 436 else 437 break; 438 } 439 *px = 0; 440 unput(c); 441 sscanf(xbuf, "%x", (unsigned int *) &n); 442 *bp++ = n; 443 break; 444 } 445 446 default: 447 *bp++ = c; 448 break; 449 } 450 break; 451 default: 452 *bp++ = c; 453 break; 454 } 455 } 456 *bp = 0; 457 s = tostring(buf); 458 *bp++ = ' '; *bp++ = '\0'; 459 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 460 free(s); 461 RET(STRING); 462 } 463 464 465 static int binsearch(char *w, const Keyword *kp, int n) 466 { 467 int cond, low, mid, high; 468 469 low = 0; 470 high = n - 1; 471 while (low <= high) { 472 mid = (low + high) / 2; 473 if ((cond = strcmp(w, kp[mid].word)) < 0) 474 high = mid - 1; 475 else if (cond > 0) 476 low = mid + 1; 477 else 478 return mid; 479 } 480 return -1; 481 } 482 483 int word(char *w) 484 { 485 const Keyword *kp; 486 int c, n; 487 488 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 489 if (n != -1) { /* found in table */ 490 kp = keywords + n; 491 yylval.i = kp->sub; 492 switch (kp->type) { /* special handling */ 493 case BLTIN: 494 if (kp->sub == FSYSTEM && safe) 495 SYNTAX( "system is unsafe" ); 496 RET(kp->type); 497 case FUNC: 498 if (infunc) 499 SYNTAX( "illegal nested function" ); 500 RET(kp->type); 501 case RETURN: 502 if (!infunc) 503 SYNTAX( "return not in function" ); 504 RET(kp->type); 505 case VARNF: 506 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 507 RET(VARNF); 508 default: 509 RET(kp->type); 510 } 511 } 512 c = peek(); /* look for '(' */ 513 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 514 yylval.i = n; 515 RET(ARG); 516 } else { 517 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 518 if (c == '(') { 519 RET(CALL); 520 } else { 521 RET(VAR); 522 } 523 } 524 } 525 526 void startreg(void) /* next call to yylex will return a regular expression */ 527 { 528 reg = true; 529 } 530 531 int regexpr(void) 532 { 533 int c, openclass = 0; 534 static char *buf = NULL; 535 static int bufsz = 500; 536 char *bp, *cstart; 537 538 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 539 FATAL("out of space for rex expr"); 540 bp = buf; 541 for ( ; ((c = input()) != '/' || openclass > 0) && c != 0; ) { 542 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr")) 543 FATAL("out of space for reg expr %.10s...", buf); 544 if (c == '\n') { 545 *bp = '\0'; 546 SYNTAX( "newline in regular expression %.10s...", buf ); 547 unput('\n'); 548 break; 549 } else if (c == '\\') { 550 *bp++ = '\\'; 551 *bp++ = input(); 552 } else { 553 /* 554 * POSIX requires a slash in a regexp to be escaped, 555 * other awks don't require it to be escaped inside 556 * a character class. 557 */ 558 if (!do_posix) { 559 if (c == '[') { 560 int nextc = peek(); 561 if (openclass == 0 || nextc == ':' || 562 nextc == '.' || nextc == '=') { 563 if (++openclass == 1) 564 cstart = bp; 565 } 566 } else if (c == ']' && openclass > 0) { 567 /* 568 * A ']' as the first char in a 569 * class is treated literally. 570 */ 571 if (cstart != bp - 1 && 572 (cstart != bp - 2 || bp[-1] != '^')) 573 openclass--; 574 } 575 } 576 *bp++ = c; 577 } 578 } 579 *bp = 0; 580 if (c == 0) 581 SYNTAX("non-terminated regular expression %.10s...", buf); 582 yylval.s = tostring(buf); 583 unput('/'); 584 RET(REGEXPR); 585 } 586 587 /* low-level lexical stuff, sort of inherited from lex */ 588 589 char ebuf[300]; 590 char *ep = ebuf; 591 char yysbuf[100]; /* pushback buffer */ 592 char *yysptr = yysbuf; 593 FILE *yyin = NULL; 594 595 int input(void) /* get next lexical input character */ 596 { 597 int c; 598 extern char *lexprog; 599 600 if (yysptr > yysbuf) 601 c = (uschar)*--yysptr; 602 else if (lexprog != NULL) { /* awk '...' */ 603 if ((c = (uschar)*lexprog) != 0) 604 lexprog++; 605 } else /* awk -f ... */ 606 c = pgetc(); 607 if (c == EOF) 608 c = 0; 609 if (ep >= ebuf + sizeof ebuf) 610 ep = ebuf; 611 *ep = c; 612 if (c != 0) { 613 ep++; 614 } 615 return (c); 616 } 617 618 void unput(int c) /* put lexical character back on input */ 619 { 620 if (c == '\n') 621 lineno--; 622 if (yysptr >= yysbuf + sizeof(yysbuf)) 623 FATAL("pushed back too much: %.20s...", yysbuf); 624 *yysptr++ = c; 625 if (--ep < ebuf) 626 ep = ebuf + sizeof(ebuf) - 1; 627 } 628 629 void unputstr(const char *s) /* put a string back on input */ 630 { 631 int i; 632 633 for (i = strlen(s)-1; i >= 0; i--) 634 unput(s[i]); 635 } 636