xref: /openbsd/usr.bin/awk/lex.c (revision fa9a5497)
1 /*	$OpenBSD: lex.c,v 1.34 2024/06/03 00:58:04 millert Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <ctype.h>
30 #include "awk.h"
31 #include "awkgram.tab.h"
32 
33 extern YYSTYPE	yylval;	/* semantic value passed to the parser; filled in by yylex(), string(), word() and regexpr() */
34 extern bool	infunc;	/* defined elsewhere; yylex() and word() test it for ARG lookup and for the return/nested-function checks */
35 
36 int	lineno	= 1;	/* current line: incremented when the lexer consumes '\n', decremented when unput() pushes one back */
37 int	bracecnt = 0;	/* '{' seen minus '}' seen; yylex() reports "extra }" when it goes negative */
38 int	brackcnt  = 0;	/* '[' seen minus ']' seen; yylex() reports "extra ]" when it goes negative */
39 int	parencnt = 0;	/* '(' seen minus ')' seen; yylex() reports "extra )" when it goes negative */
40 
/* One row of the reserved-word table that word() looks names up in. */
41 typedef struct Keyword {
42 	const char *word;	/* spelling; the key binsearch() compares with strcmp() */
43 	int	sub;	/* value word() stores in yylval.i when this row matches */
44 	int	type;	/* token code word() returns for this row */
45 } Keyword;
46 
/*
 * Reserved words and built-in function names recognised by word().
 * Each row is { spelling, value stored in yylval.i, token returned }.
 * binsearch() orders rows with strcmp(), so new rows must be inserted
 * in strcmp() order (which is why the upper-case names come first).
 */
47 const Keyword keywords[] = {	/* keep sorted: binary searched */
48 	{ "BEGIN",	XBEGIN,		XBEGIN },
49 	{ "END",	XEND,		XEND },
50 	{ "NF",		VARNF,		VARNF },
51 	{ "and",	FAND,		BLTIN },
52 	{ "atan2",	FATAN,		BLTIN },
53 	{ "break",	BREAK,		BREAK },
54 	{ "close",	CLOSE,		CLOSE },
55 	{ "compl",	FCOMPL,		BLTIN },
56 	{ "continue",	CONTINUE,	CONTINUE },
57 	{ "cos",	FCOS,		BLTIN },
58 	{ "delete",	DELETE,		DELETE },
59 	{ "do",		DO,		DO },
60 	{ "else",	ELSE,		ELSE },
61 	{ "exit",	EXIT,		EXIT },
62 	{ "exp",	FEXP,		BLTIN },
63 	{ "fflush",	FFLUSH,		BLTIN },
64 	{ "for",	FOR,		FOR },
65 	{ "func",	FUNC,		FUNC },
66 	{ "function",	FUNC,		FUNC },
67 	{ "gensub",	GENSUB,		GENSUB },
68 	{ "getline",	GETLINE,	GETLINE },
69 	{ "gsub",	GSUB,		GSUB },
70 	{ "if",		IF,		IF },
71 	{ "in",		IN,		IN },
72 	{ "index",	INDEX,		INDEX },
73 	{ "int",	FINT,		BLTIN },
74 	{ "length",	FLENGTH,	BLTIN },
75 	{ "log",	FLOG,		BLTIN },
76 	{ "lshift",	FLSHIFT,	BLTIN },
77 	{ "match",	MATCHFCN,	MATCHFCN },
78 	{ "mktime",	FMKTIME,	BLTIN },
79 	{ "next",	NEXT,		NEXT },
80 	{ "nextfile",	NEXTFILE,	NEXTFILE },
81 	{ "or",		FFOR,		BLTIN },
82 	{ "print",	PRINT,		PRINT },
83 	{ "printf",	PRINTF,		PRINTF },
84 	{ "rand",	FRAND,		BLTIN },
85 	{ "return",	RETURN,		RETURN },
86 	{ "rshift",	FRSHIFT,	BLTIN },
87 	{ "sin",	FSIN,		BLTIN },
88 	{ "split",	SPLIT,		SPLIT },
89 	{ "sprintf",	SPRINTF,	SPRINTF },
90 	{ "sqrt",	FSQRT,		BLTIN },
91 	{ "srand",	FSRAND,		BLTIN },
92 	{ "strftime",	FSTRFTIME,	BLTIN },
93 	{ "sub",	SUB,		SUB },
94 	{ "substr",	SUBSTR,		SUBSTR },
95 	{ "system",	FSYSTEM,	BLTIN },
96 	{ "systime",	FSYSTIME,	BLTIN },
97 	{ "tolower",	FTOLOWER,	BLTIN },
98 	{ "toupper",	FTOUPPER,	BLTIN },
99 	{ "while",	WHILE,		WHILE },
100 	{ "xor",	FXOR,		BLTIN },
101 };
102 
/*
 * RET(x): return token x from the enclosing function, first printing
 * "lex <name>" (name obtained from tokname) when dbg is set.
 *
 * The body is wrapped in do { } while (0) so that "RET(x);" is exactly
 * one statement and therefore safe as the unbraced arm of an if/else.
 * x is expanded twice when dbg is set, so it must have no side effects.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
104 
/*
 * peek - report the next input character without consuming it.
 *
 * Reads one character with input() and immediately hands it back with
 * unput(), so the following input() call returns the same value.
 */
static int peek(void)
{
	const int ahead = input();

	unput(ahead);
	return ahead;
}
111 
gettok(char ** pbuf,int * psz)112 static int gettok(char **pbuf, int *psz)	/* get next input token */
113 {
114 	int c, retc;
115 	char *buf = *pbuf;
116 	int sz = *psz;
117 	char *bp = buf;
118 
119 	c = input();
120 	if (c == 0)
121 		return 0;
122 	buf[0] = c;
123 	buf[1] = 0;
124 	if (!isalnum(c) && c != '.' && c != '_')
125 		return c;
126 
127 	*bp++ = c;
128 	if (isalpha(c) || c == '_') {	/* it's a varname */
129 		for ( ; (c = input()) != 0; ) {
130 			if (bp-buf >= sz)
131 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
132 					FATAL( "out of space for name %.10s...", buf );
133 			if (isalnum(c) || c == '_')
134 				*bp++ = c;
135 			else {
136 				*bp = 0;
137 				unput(c);
138 				break;
139 			}
140 		}
141 		*bp = 0;
142 		retc = 'a';	/* alphanumeric */
143 	} else {	/* maybe it's a number, but could be . */
144 		char *rem;
145 		/* read input until can't be a number */
146 		for ( ; (c = input()) != 0; ) {
147 			if (bp-buf >= sz)
148 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
149 					FATAL( "out of space for number %.10s...", buf );
150 			if (isdigit(c) || c == 'e' || c == 'E'
151 			  || c == '.' || c == '+' || c == '-')
152 				*bp++ = c;
153 			else {
154 				unput(c);
155 				break;
156 			}
157 		}
158 		*bp = 0;
159 		strtod(buf, &rem);	/* parse the number */
160 		if (rem == buf) {	/* it wasn't a valid number at all */
161 			buf[1] = 0;	/* return one character as token */
162 			retc = (uschar)buf[0];	/* character is its own type */
163 			unputstr(rem+1); /* put rest back for later */
164 		} else {	/* some prefix was a number */
165 			unputstr(rem);	/* put rest back for later */
166 			rem[0] = 0;	/* truncate buf after number part */
167 			retc = '0';	/* type is number */
168 		}
169 	}
170 	*pbuf = buf;
171 	*psz = sz;
172 	return retc;
173 }
174 
175 int	word(char *);	/* classify a name: keyword, ARG, CALL or VAR */
176 int	string(void);	/* scan the rest of a "..." literal; returns STRING */
177 int	regexpr(void);	/* scan a regular expression up to its closing '/'; returns REGEXPR */
178 bool	sc	= false;	/* true => return a } right now */
179 bool	reg	= false;	/* true => return a REGEXPR now */
180 
yylex(void)181 int yylex(void)
182 {
183 	int c;
184 	static char *buf = NULL;
185 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
186 
187 	if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
188 		FATAL( "out of space in yylex" );
189 	if (sc) {
190 		sc = false;
191 		RET('}');
192 	}
193 	if (reg) {
194 		reg = false;
195 		return regexpr();
196 	}
197 	for (;;) {
198 		c = gettok(&buf, &bufsize);
199 		if (c == 0)
200 			return 0;
201 		if (isalpha(c) || c == '_')
202 			return word(buf);
203 		if (isdigit(c)) {
204 			char *cp = tostring(buf);
205 			double result;
206 
207 			if (is_number(cp, & result))
208 				yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
209 			else
210 				yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
211 			free(cp);
212 			/* should this also have STR set? */
213 			RET(NUMBER);
214 		}
215 
216 		yylval.i = c;
217 		switch (c) {
218 		case '\n':	/* {EOL} */
219 			lineno++;
220 			RET(NL);
221 		case '\r':	/* assume \n is coming */
222 		case ' ':	/* {WS}+ */
223 		case '\t':
224 			break;
225 		case '#':	/* #.* strip comments */
226 			while ((c = input()) != '\n' && c != 0)
227 				;
228 			unput(c);
229 			/*
230 			 * Next line is a hack, it compensates for
231 			 * unput's treatment of \n.
232 			 */
233 			lineno++;
234 			break;
235 		case ';':
236 			RET(';');
237 		case '\\':
238 			if (peek() == '\n') {
239 				input();
240 				lineno++;
241 			} else if (peek() == '\r') {
242 				input(); input();	/* \n */
243 				lineno++;
244 			} else {
245 				RET(c);
246 			}
247 			break;
248 		case '&':
249 			if (peek() == '&') {
250 				input(); RET(AND);
251 			} else
252 				RET('&');
253 		case '|':
254 			if (peek() == '|') {
255 				input(); RET(BOR);
256 			} else
257 				RET('|');
258 		case '!':
259 			if (peek() == '=') {
260 				input(); yylval.i = NE; RET(NE);
261 			} else if (peek() == '~') {
262 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
263 			} else
264 				RET(NOT);
265 		case '~':
266 			yylval.i = MATCH;
267 			RET(MATCHOP);
268 		case '<':
269 			if (peek() == '=') {
270 				input(); yylval.i = LE; RET(LE);
271 			} else {
272 				yylval.i = LT; RET(LT);
273 			}
274 		case '=':
275 			if (peek() == '=') {
276 				input(); yylval.i = EQ; RET(EQ);
277 			} else {
278 				yylval.i = ASSIGN; RET(ASGNOP);
279 			}
280 		case '>':
281 			if (peek() == '=') {
282 				input(); yylval.i = GE; RET(GE);
283 			} else if (peek() == '>') {
284 				input(); yylval.i = APPEND; RET(APPEND);
285 			} else {
286 				yylval.i = GT; RET(GT);
287 			}
288 		case '+':
289 			if (peek() == '+') {
290 				input(); yylval.i = INCR; RET(INCR);
291 			} else if (peek() == '=') {
292 				input(); yylval.i = ADDEQ; RET(ASGNOP);
293 			} else
294 				RET('+');
295 		case '-':
296 			if (peek() == '-') {
297 				input(); yylval.i = DECR; RET(DECR);
298 			} else if (peek() == '=') {
299 				input(); yylval.i = SUBEQ; RET(ASGNOP);
300 			} else
301 				RET('-');
302 		case '*':
303 			if (peek() == '=') {	/* *= */
304 				input(); yylval.i = MULTEQ; RET(ASGNOP);
305 			} else if (peek() == '*') {	/* ** or **= */
306 				input();	/* eat 2nd * */
307 				if (peek() == '=') {
308 					input(); yylval.i = POWEQ; RET(ASGNOP);
309 				} else {
310 					RET(POWER);
311 				}
312 			} else
313 				RET('*');
314 		case '/':
315 			RET('/');
316 		case '%':
317 			if (peek() == '=') {
318 				input(); yylval.i = MODEQ; RET(ASGNOP);
319 			} else
320 				RET('%');
321 		case '^':
322 			if (peek() == '=') {
323 				input(); yylval.i = POWEQ; RET(ASGNOP);
324 			} else
325 				RET(POWER);
326 
327 		case '$':
328 			/* BUG: awkward, if not wrong */
329 			c = gettok(&buf, &bufsize);
330 			if (isalpha(c)) {
331 				if (strcmp(buf, "NF") == 0) {	/* very special */
332 					unputstr("(NF)");
333 					RET(INDIRECT);
334 				}
335 				c = peek();
336 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
337 					unputstr(buf);
338 					RET(INDIRECT);
339 				}
340 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
341 				RET(IVAR);
342 			} else if (c == 0) {	/*  */
343 				SYNTAX( "unexpected end of input after $" );
344 				RET(';');
345 			} else {
346 				unputstr(buf);
347 				RET(INDIRECT);
348 			}
349 
350 		case '}':
351 			if (--bracecnt < 0)
352 				SYNTAX( "extra }" );
353 			sc = true;
354 			RET(';');
355 		case ']':
356 			if (--brackcnt < 0)
357 				SYNTAX( "extra ]" );
358 			RET(']');
359 		case ')':
360 			if (--parencnt < 0)
361 				SYNTAX( "extra )" );
362 			RET(')');
363 		case '{':
364 			bracecnt++;
365 			RET('{');
366 		case '[':
367 			brackcnt++;
368 			RET('[');
369 		case '(':
370 			parencnt++;
371 			RET('(');
372 
373 		case '"':
374 			return string();	/* BUG: should be like tran.c ? */
375 
376 		default:
377 			RET(c);
378 		}
379 	}
380 }
381 
string(void)382 int string(void)
383 {
384 	int c, n;
385 	char *s, *bp;
386 	static char *buf = NULL;
387 	static int bufsz = 500;
388 
389 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
390 		FATAL("out of space for strings");
391 	for (bp = buf; (c = input()) != '"'; ) {
392 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
393 			FATAL("out of space for string %.10s...", buf);
394 		switch (c) {
395 		case '\n':
396 		case '\r':
397 		case 0:
398 			*bp = '\0';
399 			SYNTAX( "non-terminated string %.10s...", buf );
400 			if (c == 0)	/* hopeless */
401 				FATAL( "giving up" );
402 			lineno++;
403 			break;
404 		case '\\':
405 			c = input();
406 			switch (c) {
407 			case '\n': break;
408 			case '"': *bp++ = '"'; break;
409 			case 'n': *bp++ = '\n'; break;
410 			case 't': *bp++ = '\t'; break;
411 			case 'f': *bp++ = '\f'; break;
412 			case 'r': *bp++ = '\r'; break;
413 			case 'b': *bp++ = '\b'; break;
414 			case 'v': *bp++ = '\v'; break;
415 			case 'a': *bp++ = '\a'; break;
416 			case '\\': *bp++ = '\\'; break;
417 
418 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
419 			case '3': case '4': case '5': case '6': case '7':
420 				n = c - '0';
421 				if ((c = peek()) >= '0' && c < '8') {
422 					n = 8 * n + input() - '0';
423 					if ((c = peek()) >= '0' && c < '8')
424 						n = 8 * n + input() - '0';
425 				}
426 				*bp++ = n;
427 				break;
428 
429 			case 'x':	/* hex  \x0-9a-fA-F (exactly two) */
430 			    {
431 				int i;
432 
433 				if (!isxdigit(peek())) {
434 					unput(c);
435 					break;
436 				}
437 				n = 0;
438 				for (i = 0; i < 2; i++) {
439 					c = input();
440 					if (c == 0)
441 						break;
442 					if (isxdigit(c)) {
443 						c = tolower(c);
444 						n *= 16;
445 						if (isdigit(c))
446 							n += (c - '0');
447 						else
448 							n += 10 + (c - 'a');
449 					} else {
450 						unput(c);
451 						break;
452 					}
453 				}
454 				if (i)
455 					*bp++ = n;
456 				break;
457 			    }
458 
459 			case 'u':	/* utf  \u0-9a-fA-F (1..8) */
460 			    {
461 				int i;
462 
463 				n = 0;
464 				for (i = 0; i < 8; i++) {
465 					c = input();
466 					if (!isxdigit(c) || c == 0)
467 						break;
468 					c = tolower(c);
469 					n *= 16;
470 					if (isdigit(c))
471 						n += (c - '0');
472 					else
473 						n += 10 + (c - 'a');
474 				}
475 				unput(c);
476 				bp += runetochar(bp, n);
477 				break;
478 			    }
479 
480 			default:
481 				*bp++ = c;
482 				break;
483 			}
484 			break;
485 		default:
486 			*bp++ = c;
487 			break;
488 		}
489 	}
490 	*bp = 0;
491 	s = tostring(buf);
492 	*bp++ = ' '; *bp++ = '\0';
493 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
494 	free(s);
495 	RET(STRING);
496 }
497 
498 
binsearch(char * w,const Keyword * kp,int n)499 static int binsearch(char *w, const Keyword *kp, int n)
500 {
501 	int cond, low, mid, high;
502 
503 	low = 0;
504 	high = n - 1;
505 	while (low <= high) {
506 		mid = (low + high) / 2;
507 		if ((cond = strcmp(w, kp[mid].word)) < 0)
508 			high = mid - 1;
509 		else if (cond > 0)
510 			low = mid + 1;
511 		else
512 			return mid;
513 	}
514 	return -1;
515 }
516 
word(char * w)517 int word(char *w)
518 {
519 	const Keyword *kp;
520 	int c, n;
521 
522 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
523 	if (n != -1) {	/* found in table */
524 		kp = keywords + n;
525 		yylval.i = kp->sub;
526 		switch (kp->type) {	/* special handling */
527 		case BLTIN:
528 			if (kp->sub == FSYSTEM && safe)
529 				SYNTAX( "system is unsafe" );
530 			RET(kp->type);
531 		case FUNC:
532 			if (infunc)
533 				SYNTAX( "illegal nested function" );
534 			RET(kp->type);
535 		case RETURN:
536 			if (!infunc)
537 				SYNTAX( "return not in function" );
538 			RET(kp->type);
539 		case VARNF:
540 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
541 			RET(VARNF);
542 		default:
543 			RET(kp->type);
544 		}
545 	}
546 	c = peek();	/* look for '(' */
547 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
548 		yylval.i = n;
549 		RET(ARG);
550 	} else {
551 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
552 		if (c == '(') {
553 			RET(CALL);
554 		} else {
555 			RET(VAR);
556 		}
557 	}
558 }
559 
startreg(void)560 void startreg(void)	/* next call to yylex will return a regular expression */
561 {
562 	reg = true;
563 }
564 
regexpr(void)565 int regexpr(void)
566 {
567 	int c, openclass = 0;
568 	static char *buf = NULL;
569 	static int bufsz = 500;
570 	char *bp, *cstart;
571 
572 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
573 		FATAL("out of space for reg expr");
574 	bp = buf;
575 	for ( ; ((c = input()) != '/' || openclass > 0) && c != 0; ) {
576 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
577 			FATAL("out of space for reg expr %.10s...", buf);
578 		if (c == '\n') {
579 			*bp = '\0';
580 			SYNTAX( "newline in regular expression %.10s...", buf );
581 			unput('\n');
582 			break;
583 		} else if (c == '\\') {
584 			*bp++ = '\\';
585 			*bp++ = input();
586 		} else {
587 			/*
588 			 * POSIX requires a slash in a regexp to be escaped,
589 			 * other awks don't require it to be escaped inside
590 			 * a character class.
591 			 */
592 			if (!do_posix) {
593 				if (c == '[') {
594 					int nextc = peek();
595 					if (openclass == 0 || nextc == ':' ||
596 					    nextc == '.' || nextc == '=') {
597 						if (++openclass == 1)
598 							cstart = bp;
599 					}
600 				} else if (c == ']' && openclass > 0) {
601 					/*
602 					 * A ']' as the first char in a
603 					 * class is treated literally.
604 					 */
605 					if (cstart != bp - 1 &&
606 					    (cstart != bp - 2 || bp[-1] != '^'))
607 						openclass--;
608 				}
609 			}
610 			*bp++ = c;
611 		}
612 	}
613 	*bp = 0;
614 	if (c == 0)
615 		SYNTAX("non-terminated regular expression %.10s...", buf);
616 	yylval.s = tostring(buf);
617 	unput('/');
618 	RET(REGEXPR);
619 }
620 
621 /* low-level lexical stuff, sort of inherited from lex */
622 
623 char	ebuf[300];	/* ring of the characters most recently returned by input() */
624 char	*ep = ebuf;	/* next slot in ebuf: advanced by input(), stepped back by unput(), wraps at both ends */
625 char	yysbuf[100];	/* pushback buffer */
626 char	*yysptr = yysbuf;	/* one past the newest pushed-back character; equal to yysbuf when nothing is pending */
627 FILE	*yyin = NULL;	/* not referenced elsewhere in this file */
628 
input(void)629 int input(void)	/* get next lexical input character */
630 {
631 	int c;
632 	extern char *lexprog;
633 
634 	if (yysptr > yysbuf)
635 		c = (uschar)*--yysptr;
636 	else if (lexprog != NULL) {	/* awk '...' */
637 		if ((c = (uschar)*lexprog) != 0)
638 			lexprog++;
639 	} else				/* awk -f ... */
640 		c = pgetc();
641 	if (c == EOF)
642 		c = 0;
643 	if (ep >= ebuf + sizeof ebuf)
644 		ep = ebuf;
645 	*ep = c;
646 	if (c != 0) {
647 		ep++;
648 	}
649 	return (c);
650 }
651 
unput(int c)652 void unput(int c)	/* put lexical character back on input */
653 {
654 	if (c == '\n')
655 		lineno--;
656 	if (yysptr >= yysbuf + sizeof(yysbuf))
657 		FATAL("pushed back too much: %.20s...", yysbuf);
658 	*yysptr++ = c;
659 	if (--ep < ebuf)
660 		ep = ebuf + sizeof(ebuf) - 1;
661 }
662 
/*
 * unputstr - push a whole string back onto the input.
 *
 * The characters are handed to unput() last-to-first, so subsequent
 * input() calls return them in their original order.  An empty string
 * pushes nothing.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
670