xref: /openbsd/usr.bin/awk/lex.c (revision 55cc5ba3)
1 /*	$OpenBSD: lex.c,v 1.27 2020/12/09 20:00:11 millert Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <ctype.h>
30 #include "awk.h"
31 #include "awkgram.tab.h"
32 
33 extern YYSTYPE	yylval;
34 extern bool	infunc;
35 
/* Current source line; yylex() bumps it for every newline token it returns. */
int	lineno = 1;

/*
 * Nesting depth of each bracket kind.  yylex() increments the counter on
 * the opening character and decrements it on the closing one, reporting a
 * syntax error when a counter would go negative.
 */
int	parencnt = 0;	/* ( ) */
int	brackcnt = 0;	/* [ ] */
int	bracecnt = 0;	/* { } */
40 
/*
 * One entry of the reserved-word table.  The member order matters: the
 * table below is initialised positionally.
 */
struct Keyword {
	const char *word;	/* spelling; the key compared by binsearch() */
	int	sub;		/* value word() stores in yylval.i */
	int	type;		/* token type word() returns to the parser */
};
typedef struct Keyword Keyword;
46 
47 const Keyword keywords[] = {	/* keep sorted: binary searched */
48 	{ "BEGIN",	XBEGIN,		XBEGIN },
49 	{ "END",	XEND,		XEND },
50 	{ "NF",		VARNF,		VARNF },
51 	{ "and",	FAND,		BLTIN },
52 	{ "atan2",	FATAN,		BLTIN },
53 	{ "break",	BREAK,		BREAK },
54 	{ "close",	CLOSE,		CLOSE },
55 	{ "compl",	FCOMPL,		BLTIN },
56 	{ "continue",	CONTINUE,	CONTINUE },
57 	{ "cos",	FCOS,		BLTIN },
58 	{ "delete",	DELETE,		DELETE },
59 	{ "do",		DO,		DO },
60 	{ "else",	ELSE,		ELSE },
61 	{ "exit",	EXIT,		EXIT },
62 	{ "exp",	FEXP,		BLTIN },
63 	{ "fflush",	FFLUSH,		BLTIN },
64 	{ "for",	FOR,		FOR },
65 	{ "func",	FUNC,		FUNC },
66 	{ "function",	FUNC,		FUNC },
67 	{ "gensub",	GENSUB,		GENSUB },
68 	{ "getline",	GETLINE,	GETLINE },
69 	{ "gsub",	GSUB,		GSUB },
70 	{ "if",		IF,		IF },
71 	{ "in",		IN,		IN },
72 	{ "index",	INDEX,		INDEX },
73 	{ "int",	FINT,		BLTIN },
74 	{ "length",	FLENGTH,	BLTIN },
75 	{ "log",	FLOG,		BLTIN },
76 	{ "lshift",	FLSHIFT,	BLTIN },
77 	{ "match",	MATCHFCN,	MATCHFCN },
78 	{ "mktime",	FMKTIME,	BLTIN },
79 	{ "next",	NEXT,		NEXT },
80 	{ "nextfile",	NEXTFILE,	NEXTFILE },
81 	{ "or",		FFOR,		BLTIN },
82 	{ "print",	PRINT,		PRINT },
83 	{ "printf",	PRINTF,		PRINTF },
84 	{ "rand",	FRAND,		BLTIN },
85 	{ "return",	RETURN,		RETURN },
86 	{ "rshift",	FRSHIFT,	BLTIN },
87 	{ "sin",	FSIN,		BLTIN },
88 	{ "split",	SPLIT,		SPLIT },
89 	{ "sprintf",	SPRINTF,	SPRINTF },
90 	{ "sqrt",	FSQRT,		BLTIN },
91 	{ "srand",	FSRAND,		BLTIN },
92 	{ "strftime",	FSTRFTIME,	BLTIN },
93 	{ "sub",	SUB,		SUB },
94 	{ "substr",	SUBSTR,		SUBSTR },
95 	{ "system",	FSYSTEM,	BLTIN },
96 	{ "systime",	FSYSTIME,	BLTIN },
97 	{ "tolower",	FTOLOWER,	BLTIN },
98 	{ "toupper",	FTOUPPER,	BLTIN },
99 	{ "while",	WHILE,		WHILE },
100 	{ "xor",	FXOR,		BLTIN },
101 };
102 
/*
 * RET(x): return token x from the enclosing function.  When the `dbg` flag
 * is set the token's name (from tokname()) is printed first.
 *
 * The expansion is wrapped in do { } while (0) so that "RET(x);" is a
 * single statement and stays correct as the body of an if that has an else.
 * Note that x is evaluated twice when dbg is set; pass side-effect-free
 * expressions only.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
104 
105 static int peek(void)
106 {
107 	int c = input();
108 	unput(c);
109 	return c;
110 }
111 
112 static int gettok(char **pbuf, int *psz)	/* get next input token */
113 {
114 	int c, retc;
115 	char *buf = *pbuf;
116 	int sz = *psz;
117 	char *bp = buf;
118 
119 	c = input();
120 	if (c == 0)
121 		return 0;
122 	buf[0] = c;
123 	buf[1] = 0;
124 	if (!isalnum(c) && c != '.' && c != '_')
125 		return c;
126 
127 	*bp++ = c;
128 	if (isalpha(c) || c == '_') {	/* it's a varname */
129 		for ( ; (c = input()) != 0; ) {
130 			if (bp-buf >= sz)
131 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
132 					FATAL( "out of space for name %.10s...", buf );
133 			if (isalnum(c) || c == '_')
134 				*bp++ = c;
135 			else {
136 				*bp = 0;
137 				unput(c);
138 				break;
139 			}
140 		}
141 		*bp = 0;
142 		retc = 'a';	/* alphanumeric */
143 	} else {	/* maybe it's a number, but could be . */
144 		char *rem;
145 		/* read input until can't be a number */
146 		for ( ; (c = input()) != 0; ) {
147 			if (bp-buf >= sz)
148 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
149 					FATAL( "out of space for number %.10s...", buf );
150 			if (isdigit(c) || c == 'e' || c == 'E'
151 			  || c == '.' || c == '+' || c == '-')
152 				*bp++ = c;
153 			else {
154 				unput(c);
155 				break;
156 			}
157 		}
158 		*bp = 0;
159 		strtod(buf, &rem);	/* parse the number */
160 		if (rem == buf) {	/* it wasn't a valid number at all */
161 			buf[1] = 0;	/* return one character as token */
162 			retc = (uschar)buf[0];	/* character is its own type */
163 			unputstr(rem+1); /* put rest back for later */
164 		} else {	/* some prefix was a number */
165 			unputstr(rem);	/* put rest back for later */
166 			rem[0] = 0;	/* truncate buf after number part */
167 			retc = '0';	/* type is number */
168 		}
169 	}
170 	*pbuf = buf;
171 	*psz = sz;
172 	return retc;
173 }
174 
/* Lexer routines defined further down in this file. */
int	regexpr(void);
int	string(void);
int	word(char *);

/* One-shot flags examined (and cleared) at the top of yylex(). */
bool	reg	= false;	/* true => return a REGEXPR now */
bool	sc	= false;	/* true => return a } right now */
180 
181 int yylex(void)
182 {
183 	int c;
184 	static char *buf = NULL;
185 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
186 
187 	if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
188 		FATAL( "out of space in yylex" );
189 	if (sc) {
190 		sc = false;
191 		RET('}');
192 	}
193 	if (reg) {
194 		reg = false;
195 		return regexpr();
196 	}
197 	for (;;) {
198 		c = gettok(&buf, &bufsize);
199 		if (c == 0)
200 			return 0;
201 		if (isalpha(c) || c == '_')
202 			return word(buf);
203 		if (isdigit(c)) {
204 			char *cp = tostring(buf);
205 			double result;
206 
207 			if (is_number(cp, & result))
208 				yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
209 			else
210 				yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
211 			free(cp);
212 			/* should this also have STR set? */
213 			RET(NUMBER);
214 		}
215 
216 		yylval.i = c;
217 		switch (c) {
218 		case '\n':	/* {EOL} */
219 			lineno++;
220 			RET(NL);
221 		case '\r':	/* assume \n is coming */
222 		case ' ':	/* {WS}+ */
223 		case '\t':
224 			break;
225 		case '#':	/* #.* strip comments */
226 			while ((c = input()) != '\n' && c != 0)
227 				;
228 			unput(c);
229 			/*
230 			 * Next line is a hack, itcompensates for
231 			 * unput's treatment of \n.
232 			 */
233 			lineno++;
234 			break;
235 		case ';':
236 			RET(';');
237 		case '\\':
238 			if (peek() == '\n') {
239 				input();
240 				lineno++;
241 			} else if (peek() == '\r') {
242 				input(); input();	/* \n */
243 				lineno++;
244 			} else {
245 				RET(c);
246 			}
247 			break;
248 		case '&':
249 			if (peek() == '&') {
250 				input(); RET(AND);
251 			} else
252 				RET('&');
253 		case '|':
254 			if (peek() == '|') {
255 				input(); RET(BOR);
256 			} else
257 				RET('|');
258 		case '!':
259 			if (peek() == '=') {
260 				input(); yylval.i = NE; RET(NE);
261 			} else if (peek() == '~') {
262 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
263 			} else
264 				RET(NOT);
265 		case '~':
266 			yylval.i = MATCH;
267 			RET(MATCHOP);
268 		case '<':
269 			if (peek() == '=') {
270 				input(); yylval.i = LE; RET(LE);
271 			} else {
272 				yylval.i = LT; RET(LT);
273 			}
274 		case '=':
275 			if (peek() == '=') {
276 				input(); yylval.i = EQ; RET(EQ);
277 			} else {
278 				yylval.i = ASSIGN; RET(ASGNOP);
279 			}
280 		case '>':
281 			if (peek() == '=') {
282 				input(); yylval.i = GE; RET(GE);
283 			} else if (peek() == '>') {
284 				input(); yylval.i = APPEND; RET(APPEND);
285 			} else {
286 				yylval.i = GT; RET(GT);
287 			}
288 		case '+':
289 			if (peek() == '+') {
290 				input(); yylval.i = INCR; RET(INCR);
291 			} else if (peek() == '=') {
292 				input(); yylval.i = ADDEQ; RET(ASGNOP);
293 			} else
294 				RET('+');
295 		case '-':
296 			if (peek() == '-') {
297 				input(); yylval.i = DECR; RET(DECR);
298 			} else if (peek() == '=') {
299 				input(); yylval.i = SUBEQ; RET(ASGNOP);
300 			} else
301 				RET('-');
302 		case '*':
303 			if (peek() == '=') {	/* *= */
304 				input(); yylval.i = MULTEQ; RET(ASGNOP);
305 			} else if (peek() == '*') {	/* ** or **= */
306 				input();	/* eat 2nd * */
307 				if (peek() == '=') {
308 					input(); yylval.i = POWEQ; RET(ASGNOP);
309 				} else {
310 					RET(POWER);
311 				}
312 			} else
313 				RET('*');
314 		case '/':
315 			RET('/');
316 		case '%':
317 			if (peek() == '=') {
318 				input(); yylval.i = MODEQ; RET(ASGNOP);
319 			} else
320 				RET('%');
321 		case '^':
322 			if (peek() == '=') {
323 				input(); yylval.i = POWEQ; RET(ASGNOP);
324 			} else
325 				RET(POWER);
326 
327 		case '$':
328 			/* BUG: awkward, if not wrong */
329 			c = gettok(&buf, &bufsize);
330 			if (isalpha(c)) {
331 				if (strcmp(buf, "NF") == 0) {	/* very special */
332 					unputstr("(NF)");
333 					RET(INDIRECT);
334 				}
335 				c = peek();
336 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
337 					unputstr(buf);
338 					RET(INDIRECT);
339 				}
340 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
341 				RET(IVAR);
342 			} else if (c == 0) {	/*  */
343 				SYNTAX( "unexpected end of input after $" );
344 				RET(';');
345 			} else {
346 				unputstr(buf);
347 				RET(INDIRECT);
348 			}
349 
350 		case '}':
351 			if (--bracecnt < 0)
352 				SYNTAX( "extra }" );
353 			sc = true;
354 			RET(';');
355 		case ']':
356 			if (--brackcnt < 0)
357 				SYNTAX( "extra ]" );
358 			RET(']');
359 		case ')':
360 			if (--parencnt < 0)
361 				SYNTAX( "extra )" );
362 			RET(')');
363 		case '{':
364 			bracecnt++;
365 			RET('{');
366 		case '[':
367 			brackcnt++;
368 			RET('[');
369 		case '(':
370 			parencnt++;
371 			RET('(');
372 
373 		case '"':
374 			return string();	/* BUG: should be like tran.c ? */
375 
376 		default:
377 			RET(c);
378 		}
379 	}
380 }
381 
382 int string(void)
383 {
384 	int c, n;
385 	char *s, *bp;
386 	static char *buf = NULL;
387 	static int bufsz = 500;
388 
389 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
390 		FATAL("out of space for strings");
391 	for (bp = buf; (c = input()) != '"'; ) {
392 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
393 			FATAL("out of space for string %.10s...", buf);
394 		switch (c) {
395 		case '\n':
396 		case '\r':
397 		case 0:
398 			*bp = '\0';
399 			SYNTAX( "non-terminated string %.10s...", buf );
400 			if (c == 0)	/* hopeless */
401 				FATAL( "giving up" );
402 			lineno++;
403 			break;
404 		case '\\':
405 			c = input();
406 			switch (c) {
407 			case '\n': break;
408 			case '"': *bp++ = '"'; break;
409 			case 'n': *bp++ = '\n'; break;
410 			case 't': *bp++ = '\t'; break;
411 			case 'f': *bp++ = '\f'; break;
412 			case 'r': *bp++ = '\r'; break;
413 			case 'b': *bp++ = '\b'; break;
414 			case 'v': *bp++ = '\v'; break;
415 			case 'a': *bp++ = '\a'; break;
416 			case '\\': *bp++ = '\\'; break;
417 
418 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
419 			case '3': case '4': case '5': case '6': case '7':
420 				n = c - '0';
421 				if ((c = peek()) >= '0' && c < '8') {
422 					n = 8 * n + input() - '0';
423 					if ((c = peek()) >= '0' && c < '8')
424 						n = 8 * n + input() - '0';
425 				}
426 				*bp++ = n;
427 				break;
428 
429 			case 'x':	/* hex  \x0-9a-fA-F + */
430 			    {	char xbuf[100], *px;
431 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
432 					if (isdigit(c)
433 					 || (c >= 'a' && c <= 'f')
434 					 || (c >= 'A' && c <= 'F'))
435 						*px++ = c;
436 					else
437 						break;
438 				}
439 				*px = 0;
440 				unput(c);
441 	  			sscanf(xbuf, "%x", (unsigned int *) &n);
442 				*bp++ = n;
443 				break;
444 			    }
445 
446 			default:
447 				*bp++ = c;
448 				break;
449 			}
450 			break;
451 		default:
452 			*bp++ = c;
453 			break;
454 		}
455 	}
456 	*bp = 0;
457 	s = tostring(buf);
458 	*bp++ = ' '; *bp++ = '\0';
459 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
460 	free(s);
461 	RET(STRING);
462 }
463 
464 
465 static int binsearch(char *w, const Keyword *kp, int n)
466 {
467 	int cond, low, mid, high;
468 
469 	low = 0;
470 	high = n - 1;
471 	while (low <= high) {
472 		mid = (low + high) / 2;
473 		if ((cond = strcmp(w, kp[mid].word)) < 0)
474 			high = mid - 1;
475 		else if (cond > 0)
476 			low = mid + 1;
477 		else
478 			return mid;
479 	}
480 	return -1;
481 }
482 
483 int word(char *w)
484 {
485 	const Keyword *kp;
486 	int c, n;
487 
488 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
489 	if (n != -1) {	/* found in table */
490 		kp = keywords + n;
491 		yylval.i = kp->sub;
492 		switch (kp->type) {	/* special handling */
493 		case BLTIN:
494 			if (kp->sub == FSYSTEM && safe)
495 				SYNTAX( "system is unsafe" );
496 			RET(kp->type);
497 		case FUNC:
498 			if (infunc)
499 				SYNTAX( "illegal nested function" );
500 			RET(kp->type);
501 		case RETURN:
502 			if (!infunc)
503 				SYNTAX( "return not in function" );
504 			RET(kp->type);
505 		case VARNF:
506 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
507 			RET(VARNF);
508 		default:
509 			RET(kp->type);
510 		}
511 	}
512 	c = peek();	/* look for '(' */
513 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
514 		yylval.i = n;
515 		RET(ARG);
516 	} else {
517 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
518 		if (c == '(') {
519 			RET(CALL);
520 		} else {
521 			RET(VAR);
522 		}
523 	}
524 }
525 
526 void startreg(void)	/* next call to yylex will return a regular expression */
527 {
528 	reg = true;
529 }
530 
531 int regexpr(void)
532 {
533 	int c, openclass = 0;
534 	static char *buf = NULL;
535 	static int bufsz = 500;
536 	char *bp, *cstart;
537 
538 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
539 		FATAL("out of space for rex expr");
540 	bp = buf;
541 	for ( ; ((c = input()) != '/' || openclass > 0) && c != 0; ) {
542 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
543 			FATAL("out of space for reg expr %.10s...", buf);
544 		if (c == '\n') {
545 			*bp = '\0';
546 			SYNTAX( "newline in regular expression %.10s...", buf );
547 			unput('\n');
548 			break;
549 		} else if (c == '\\') {
550 			*bp++ = '\\';
551 			*bp++ = input();
552 		} else {
553 			/*
554 			 * POSIX requires a slash in a regexp to be escaped,
555 			 * other awks don't require it to be escaped inside
556 			 * a character class.
557 			 */
558 			if (!do_posix) {
559 				if (c == '[') {
560 					int nextc = peek();
561 					if (openclass == 0 || nextc == ':' ||
562 					    nextc == '.' || nextc == '=') {
563 						if (++openclass == 1)
564 							cstart = bp;
565 					}
566 				} else if (c == ']' && openclass > 0) {
567 					/*
568 					 * A ']' as the first char in a
569 					 * class is treated literally.
570 					 */
571 					if (cstart != bp - 1 &&
572 					    (cstart != bp - 2 || bp[-1] != '^'))
573 						openclass--;
574 				}
575 			}
576 			*bp++ = c;
577 		}
578 	}
579 	*bp = 0;
580 	if (c == 0)
581 		SYNTAX("non-terminated regular expression %.10s...", buf);
582 	yylval.s = tostring(buf);
583 	unput('/');
584 	RET(REGEXPR);
585 }
586 
587 /* low-level lexical stuff, sort of inherited from lex */
588 
/*
 * Record of the characters most recently handed out by input().  input()
 * stores each character at ep and wraps to the start when the end of the
 * array is reached; unput() steps ep back again.
 */
char	ebuf[300];
char	*ep = ebuf;

/*
 * Pushback stack: unput() stores at yysptr and advances it, input() pops
 * from it before reading any fresh input.
 */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;

FILE	*yyin = NULL;
594 
595 int input(void)	/* get next lexical input character */
596 {
597 	int c;
598 	extern char *lexprog;
599 
600 	if (yysptr > yysbuf)
601 		c = (uschar)*--yysptr;
602 	else if (lexprog != NULL) {	/* awk '...' */
603 		if ((c = (uschar)*lexprog) != 0)
604 			lexprog++;
605 	} else				/* awk -f ... */
606 		c = pgetc();
607 	if (c == EOF)
608 		c = 0;
609 	if (ep >= ebuf + sizeof ebuf)
610 		ep = ebuf;
611 	*ep = c;
612 	if (c != 0) {
613 		ep++;
614 	}
615 	return (c);
616 }
617 
618 void unput(int c)	/* put lexical character back on input */
619 {
620 	if (c == '\n')
621 		lineno--;
622 	if (yysptr >= yysbuf + sizeof(yysbuf))
623 		FATAL("pushed back too much: %.20s...", yysbuf);
624 	*yysptr++ = c;
625 	if (--ep < ebuf)
626 		ep = ebuf + sizeof(ebuf) - 1;
627 }
628 
/*
 * unputstr - push the whole string s back onto the input.
 *
 * The characters are handed to unput() last-to-first, so subsequent
 * input() calls return them in their original order.  An empty string
 * pushes nothing.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
636