1 /* $OpenBSD: lex.c,v 1.34 2024/06/03 00:58:04 millert Exp $ */
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <ctype.h>
30 #include "awk.h"
31 #include "awkgram.tab.h"
32
extern YYSTYPE	yylval;	/* semantic value of the token being returned; set by the lexer */
extern bool	infunc;	/* defined elsewhere; word() tests it for the "nested function" / "return not in function" checks */

int	lineno	= 1;	/* current source line; bumped on '\n', decremented by unput('\n') */
int	bracecnt = 0;	/* '{' seen minus '}' seen; yylex reports "extra }" when it goes negative */
int	brackcnt  = 0;	/* same bookkeeping for '[' and ']' */
int	parencnt = 0;	/* same bookkeeping for '(' and ')' */
40
/* One entry of the reserved-word / builtin table searched by word(). */
typedef struct Keyword {
	const char *word;	/* spelling as it appears in the program text */
	int	sub;		/* value word() stores in yylval.i for this word */
	int	type;		/* token code word() returns to the parser */
} Keyword;
46
/*
 * Reserved words and builtin function names.
 * binsearch() compares with strcmp(), so entries must stay in strcmp()
 * (byte) order: upper-case words sort before lower-case ones.
 * Builtins share the token type BLTIN and are told apart by .sub.
 */
const Keyword keywords[] = {	/* keep sorted: binary searched */
	{ "BEGIN",	XBEGIN,		XBEGIN },
	{ "END",	XEND,		XEND },
	{ "NF",		VARNF,		VARNF },
	{ "and",	FAND,		BLTIN },
	{ "atan2",	FATAN,		BLTIN },
	{ "break",	BREAK,		BREAK },
	{ "close",	CLOSE,		CLOSE },
	{ "compl",	FCOMPL,		BLTIN },
	{ "continue",	CONTINUE,	CONTINUE },
	{ "cos",	FCOS,		BLTIN },
	{ "delete",	DELETE,		DELETE },
	{ "do",		DO,		DO },
	{ "else",	ELSE,		ELSE },
	{ "exit",	EXIT,		EXIT },
	{ "exp",	FEXP,		BLTIN },
	{ "fflush",	FFLUSH,		BLTIN },
	{ "for",	FOR,		FOR },
	{ "func",	FUNC,		FUNC },
	{ "function",	FUNC,		FUNC },
	{ "gensub",	GENSUB,		GENSUB },
	{ "getline",	GETLINE,	GETLINE },
	{ "gsub",	GSUB,		GSUB },
	{ "if",		IF,		IF },
	{ "in",		IN,		IN },
	{ "index",	INDEX,		INDEX },
	{ "int",	FINT,		BLTIN },
	{ "length",	FLENGTH,	BLTIN },
	{ "log",	FLOG,		BLTIN },
	{ "lshift",	FLSHIFT,	BLTIN },
	{ "match",	MATCHFCN,	MATCHFCN },
	{ "mktime",	FMKTIME,	BLTIN },
	{ "next",	NEXT,		NEXT },
	{ "nextfile",	NEXTFILE,	NEXTFILE },
	{ "or",		FFOR,		BLTIN },
	{ "print",	PRINT,		PRINT },
	{ "printf",	PRINTF,		PRINTF },
	{ "rand",	FRAND,		BLTIN },
	{ "return",	RETURN,		RETURN },
	{ "rshift",	FRSHIFT,	BLTIN },
	{ "sin",	FSIN,		BLTIN },
	{ "split",	SPLIT,		SPLIT },
	{ "sprintf",	SPRINTF,	SPRINTF },
	{ "sqrt",	FSQRT,		BLTIN },
	{ "srand",	FSRAND,		BLTIN },
	{ "strftime",	FSTRFTIME,	BLTIN },
	{ "sub",	SUB,		SUB },
	{ "substr",	SUBSTR,		SUBSTR },
	{ "system",	FSYSTEM,	BLTIN },
	{ "systime",	FSYSTIME,	BLTIN },
	{ "tolower",	FTOLOWER,	BLTIN },
	{ "toupper",	FTOUPPER,	BLTIN },
	{ "while",	WHILE,		WHILE },
	{ "xor",	FXOR,		BLTIN },
};
102
/*
 * RET(x): return token x from the enclosing function; when dbg is set,
 * first print the token's name (via tokname) on stdout.
 * Expands to a brace block, not a single statement.
 */
#define	RET(x)	{ if(dbg)printf("lex %s\n", tokname(x)); return(x); }
104
/*
 * peek: report the next input character without consuming it.
 * The character is fetched with input() and handed straight back with
 * unput(), so the following input() call sees it again.
 */
static int peek(void)
{
	const int next = input();

	unput(next);
	return next;
}
111
gettok(char ** pbuf,int * psz)112 static int gettok(char **pbuf, int *psz) /* get next input token */
113 {
114 int c, retc;
115 char *buf = *pbuf;
116 int sz = *psz;
117 char *bp = buf;
118
119 c = input();
120 if (c == 0)
121 return 0;
122 buf[0] = c;
123 buf[1] = 0;
124 if (!isalnum(c) && c != '.' && c != '_')
125 return c;
126
127 *bp++ = c;
128 if (isalpha(c) || c == '_') { /* it's a varname */
129 for ( ; (c = input()) != 0; ) {
130 if (bp-buf >= sz)
131 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
132 FATAL( "out of space for name %.10s...", buf );
133 if (isalnum(c) || c == '_')
134 *bp++ = c;
135 else {
136 *bp = 0;
137 unput(c);
138 break;
139 }
140 }
141 *bp = 0;
142 retc = 'a'; /* alphanumeric */
143 } else { /* maybe it's a number, but could be . */
144 char *rem;
145 /* read input until can't be a number */
146 for ( ; (c = input()) != 0; ) {
147 if (bp-buf >= sz)
148 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
149 FATAL( "out of space for number %.10s...", buf );
150 if (isdigit(c) || c == 'e' || c == 'E'
151 || c == '.' || c == '+' || c == '-')
152 *bp++ = c;
153 else {
154 unput(c);
155 break;
156 }
157 }
158 *bp = 0;
159 strtod(buf, &rem); /* parse the number */
160 if (rem == buf) { /* it wasn't a valid number at all */
161 buf[1] = 0; /* return one character as token */
162 retc = (uschar)buf[0]; /* character is its own type */
163 unputstr(rem+1); /* put rest back for later */
164 } else { /* some prefix was a number */
165 unputstr(rem); /* put rest back for later */
166 rem[0] = 0; /* truncate buf after number part */
167 retc = '0'; /* type is number */
168 }
169 }
170 *pbuf = buf;
171 *psz = sz;
172 return retc;
173 }
174
/* forward declarations of the helpers yylex() dispatches to */
int	word(char *);
int	string(void);
int	regexpr(void);
bool	sc	= false;	/* true => return a } right now */
bool	reg	= false;	/* true => return a REGEXPR now */
180
yylex(void)181 int yylex(void)
182 {
183 int c;
184 static char *buf = NULL;
185 static int bufsize = 5; /* BUG: setting this small causes core dump! */
186
187 if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
188 FATAL( "out of space in yylex" );
189 if (sc) {
190 sc = false;
191 RET('}');
192 }
193 if (reg) {
194 reg = false;
195 return regexpr();
196 }
197 for (;;) {
198 c = gettok(&buf, &bufsize);
199 if (c == 0)
200 return 0;
201 if (isalpha(c) || c == '_')
202 return word(buf);
203 if (isdigit(c)) {
204 char *cp = tostring(buf);
205 double result;
206
207 if (is_number(cp, & result))
208 yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
209 else
210 yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
211 free(cp);
212 /* should this also have STR set? */
213 RET(NUMBER);
214 }
215
216 yylval.i = c;
217 switch (c) {
218 case '\n': /* {EOL} */
219 lineno++;
220 RET(NL);
221 case '\r': /* assume \n is coming */
222 case ' ': /* {WS}+ */
223 case '\t':
224 break;
225 case '#': /* #.* strip comments */
226 while ((c = input()) != '\n' && c != 0)
227 ;
228 unput(c);
229 /*
230 * Next line is a hack, it compensates for
231 * unput's treatment of \n.
232 */
233 lineno++;
234 break;
235 case ';':
236 RET(';');
237 case '\\':
238 if (peek() == '\n') {
239 input();
240 lineno++;
241 } else if (peek() == '\r') {
242 input(); input(); /* \n */
243 lineno++;
244 } else {
245 RET(c);
246 }
247 break;
248 case '&':
249 if (peek() == '&') {
250 input(); RET(AND);
251 } else
252 RET('&');
253 case '|':
254 if (peek() == '|') {
255 input(); RET(BOR);
256 } else
257 RET('|');
258 case '!':
259 if (peek() == '=') {
260 input(); yylval.i = NE; RET(NE);
261 } else if (peek() == '~') {
262 input(); yylval.i = NOTMATCH; RET(MATCHOP);
263 } else
264 RET(NOT);
265 case '~':
266 yylval.i = MATCH;
267 RET(MATCHOP);
268 case '<':
269 if (peek() == '=') {
270 input(); yylval.i = LE; RET(LE);
271 } else {
272 yylval.i = LT; RET(LT);
273 }
274 case '=':
275 if (peek() == '=') {
276 input(); yylval.i = EQ; RET(EQ);
277 } else {
278 yylval.i = ASSIGN; RET(ASGNOP);
279 }
280 case '>':
281 if (peek() == '=') {
282 input(); yylval.i = GE; RET(GE);
283 } else if (peek() == '>') {
284 input(); yylval.i = APPEND; RET(APPEND);
285 } else {
286 yylval.i = GT; RET(GT);
287 }
288 case '+':
289 if (peek() == '+') {
290 input(); yylval.i = INCR; RET(INCR);
291 } else if (peek() == '=') {
292 input(); yylval.i = ADDEQ; RET(ASGNOP);
293 } else
294 RET('+');
295 case '-':
296 if (peek() == '-') {
297 input(); yylval.i = DECR; RET(DECR);
298 } else if (peek() == '=') {
299 input(); yylval.i = SUBEQ; RET(ASGNOP);
300 } else
301 RET('-');
302 case '*':
303 if (peek() == '=') { /* *= */
304 input(); yylval.i = MULTEQ; RET(ASGNOP);
305 } else if (peek() == '*') { /* ** or **= */
306 input(); /* eat 2nd * */
307 if (peek() == '=') {
308 input(); yylval.i = POWEQ; RET(ASGNOP);
309 } else {
310 RET(POWER);
311 }
312 } else
313 RET('*');
314 case '/':
315 RET('/');
316 case '%':
317 if (peek() == '=') {
318 input(); yylval.i = MODEQ; RET(ASGNOP);
319 } else
320 RET('%');
321 case '^':
322 if (peek() == '=') {
323 input(); yylval.i = POWEQ; RET(ASGNOP);
324 } else
325 RET(POWER);
326
327 case '$':
328 /* BUG: awkward, if not wrong */
329 c = gettok(&buf, &bufsize);
330 if (isalpha(c)) {
331 if (strcmp(buf, "NF") == 0) { /* very special */
332 unputstr("(NF)");
333 RET(INDIRECT);
334 }
335 c = peek();
336 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
337 unputstr(buf);
338 RET(INDIRECT);
339 }
340 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
341 RET(IVAR);
342 } else if (c == 0) { /* */
343 SYNTAX( "unexpected end of input after $" );
344 RET(';');
345 } else {
346 unputstr(buf);
347 RET(INDIRECT);
348 }
349
350 case '}':
351 if (--bracecnt < 0)
352 SYNTAX( "extra }" );
353 sc = true;
354 RET(';');
355 case ']':
356 if (--brackcnt < 0)
357 SYNTAX( "extra ]" );
358 RET(']');
359 case ')':
360 if (--parencnt < 0)
361 SYNTAX( "extra )" );
362 RET(')');
363 case '{':
364 bracecnt++;
365 RET('{');
366 case '[':
367 brackcnt++;
368 RET('[');
369 case '(':
370 parencnt++;
371 RET('(');
372
373 case '"':
374 return string(); /* BUG: should be like tran.c ? */
375
376 default:
377 RET(c);
378 }
379 }
380 }
381
string(void)382 int string(void)
383 {
384 int c, n;
385 char *s, *bp;
386 static char *buf = NULL;
387 static int bufsz = 500;
388
389 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
390 FATAL("out of space for strings");
391 for (bp = buf; (c = input()) != '"'; ) {
392 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
393 FATAL("out of space for string %.10s...", buf);
394 switch (c) {
395 case '\n':
396 case '\r':
397 case 0:
398 *bp = '\0';
399 SYNTAX( "non-terminated string %.10s...", buf );
400 if (c == 0) /* hopeless */
401 FATAL( "giving up" );
402 lineno++;
403 break;
404 case '\\':
405 c = input();
406 switch (c) {
407 case '\n': break;
408 case '"': *bp++ = '"'; break;
409 case 'n': *bp++ = '\n'; break;
410 case 't': *bp++ = '\t'; break;
411 case 'f': *bp++ = '\f'; break;
412 case 'r': *bp++ = '\r'; break;
413 case 'b': *bp++ = '\b'; break;
414 case 'v': *bp++ = '\v'; break;
415 case 'a': *bp++ = '\a'; break;
416 case '\\': *bp++ = '\\'; break;
417
418 case '0': case '1': case '2': /* octal: \d \dd \ddd */
419 case '3': case '4': case '5': case '6': case '7':
420 n = c - '0';
421 if ((c = peek()) >= '0' && c < '8') {
422 n = 8 * n + input() - '0';
423 if ((c = peek()) >= '0' && c < '8')
424 n = 8 * n + input() - '0';
425 }
426 *bp++ = n;
427 break;
428
429 case 'x': /* hex \x0-9a-fA-F (exactly two) */
430 {
431 int i;
432
433 if (!isxdigit(peek())) {
434 unput(c);
435 break;
436 }
437 n = 0;
438 for (i = 0; i < 2; i++) {
439 c = input();
440 if (c == 0)
441 break;
442 if (isxdigit(c)) {
443 c = tolower(c);
444 n *= 16;
445 if (isdigit(c))
446 n += (c - '0');
447 else
448 n += 10 + (c - 'a');
449 } else {
450 unput(c);
451 break;
452 }
453 }
454 if (i)
455 *bp++ = n;
456 break;
457 }
458
459 case 'u': /* utf \u0-9a-fA-F (1..8) */
460 {
461 int i;
462
463 n = 0;
464 for (i = 0; i < 8; i++) {
465 c = input();
466 if (!isxdigit(c) || c == 0)
467 break;
468 c = tolower(c);
469 n *= 16;
470 if (isdigit(c))
471 n += (c - '0');
472 else
473 n += 10 + (c - 'a');
474 }
475 unput(c);
476 bp += runetochar(bp, n);
477 break;
478 }
479
480 default:
481 *bp++ = c;
482 break;
483 }
484 break;
485 default:
486 *bp++ = c;
487 break;
488 }
489 }
490 *bp = 0;
491 s = tostring(buf);
492 *bp++ = ' '; *bp++ = '\0';
493 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
494 free(s);
495 RET(STRING);
496 }
497
498
binsearch(char * w,const Keyword * kp,int n)499 static int binsearch(char *w, const Keyword *kp, int n)
500 {
501 int cond, low, mid, high;
502
503 low = 0;
504 high = n - 1;
505 while (low <= high) {
506 mid = (low + high) / 2;
507 if ((cond = strcmp(w, kp[mid].word)) < 0)
508 high = mid - 1;
509 else if (cond > 0)
510 low = mid + 1;
511 else
512 return mid;
513 }
514 return -1;
515 }
516
word(char * w)517 int word(char *w)
518 {
519 const Keyword *kp;
520 int c, n;
521
522 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
523 if (n != -1) { /* found in table */
524 kp = keywords + n;
525 yylval.i = kp->sub;
526 switch (kp->type) { /* special handling */
527 case BLTIN:
528 if (kp->sub == FSYSTEM && safe)
529 SYNTAX( "system is unsafe" );
530 RET(kp->type);
531 case FUNC:
532 if (infunc)
533 SYNTAX( "illegal nested function" );
534 RET(kp->type);
535 case RETURN:
536 if (!infunc)
537 SYNTAX( "return not in function" );
538 RET(kp->type);
539 case VARNF:
540 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
541 RET(VARNF);
542 default:
543 RET(kp->type);
544 }
545 }
546 c = peek(); /* look for '(' */
547 if (c != '(' && infunc && (n=isarg(w)) >= 0) {
548 yylval.i = n;
549 RET(ARG);
550 } else {
551 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
552 if (c == '(') {
553 RET(CALL);
554 } else {
555 RET(VAR);
556 }
557 }
558 }
559
startreg(void)560 void startreg(void) /* next call to yylex will return a regular expression */
561 {
562 reg = true;
563 }
564
regexpr(void)565 int regexpr(void)
566 {
567 int c, openclass = 0;
568 static char *buf = NULL;
569 static int bufsz = 500;
570 char *bp, *cstart;
571
572 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
573 FATAL("out of space for reg expr");
574 bp = buf;
575 for ( ; ((c = input()) != '/' || openclass > 0) && c != 0; ) {
576 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
577 FATAL("out of space for reg expr %.10s...", buf);
578 if (c == '\n') {
579 *bp = '\0';
580 SYNTAX( "newline in regular expression %.10s...", buf );
581 unput('\n');
582 break;
583 } else if (c == '\\') {
584 *bp++ = '\\';
585 *bp++ = input();
586 } else {
587 /*
588 * POSIX requires a slash in a regexp to be escaped,
589 * other awks don't require it to be escaped inside
590 * a character class.
591 */
592 if (!do_posix) {
593 if (c == '[') {
594 int nextc = peek();
595 if (openclass == 0 || nextc == ':' ||
596 nextc == '.' || nextc == '=') {
597 if (++openclass == 1)
598 cstart = bp;
599 }
600 } else if (c == ']' && openclass > 0) {
601 /*
602 * A ']' as the first char in a
603 * class is treated literally.
604 */
605 if (cstart != bp - 1 &&
606 (cstart != bp - 2 || bp[-1] != '^'))
607 openclass--;
608 }
609 }
610 *bp++ = c;
611 }
612 }
613 *bp = 0;
614 if (c == 0)
615 SYNTAX("non-terminated regular expression %.10s...", buf);
616 yylval.s = tostring(buf);
617 unput('/');
618 RET(REGEXPR);
619 }
620
621 /* low-level lexical stuff, sort of inherited from lex */
622
char	ebuf[300];		/* circular record of characters recently returned by input() */
char	*ep = ebuf;		/* next slot in ebuf; input() advances it, unput() steps it back */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the last pushed-back character (stack top) */
FILE	*yyin = NULL;		/* not referenced by the functions in this part of the file */
628
input(void)629 int input(void) /* get next lexical input character */
630 {
631 int c;
632 extern char *lexprog;
633
634 if (yysptr > yysbuf)
635 c = (uschar)*--yysptr;
636 else if (lexprog != NULL) { /* awk '...' */
637 if ((c = (uschar)*lexprog) != 0)
638 lexprog++;
639 } else /* awk -f ... */
640 c = pgetc();
641 if (c == EOF)
642 c = 0;
643 if (ep >= ebuf + sizeof ebuf)
644 ep = ebuf;
645 *ep = c;
646 if (c != 0) {
647 ep++;
648 }
649 return (c);
650 }
651
unput(int c)652 void unput(int c) /* put lexical character back on input */
653 {
654 if (c == '\n')
655 lineno--;
656 if (yysptr >= yysbuf + sizeof(yysbuf))
657 FATAL("pushed back too much: %.20s...", yysbuf);
658 *yysptr++ = c;
659 if (--ep < ebuf)
660 ep = ebuf + sizeof(ebuf) - 1;
661 }
662
/*
 * unputstr: push the whole string s back onto the input.
 * Characters are handed to unput() last-to-first, so that input()
 * returns them again in their original order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p = s + strlen(s);

	while (p > s) {
		p--;
		unput(*p);
	}
}
670