xref: /openbsd/usr.bin/awk/lex.c (revision e5dd7070)
1 /*	$OpenBSD: lex.c,v 1.25 2020/07/30 17:45:44 millert Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <ctype.h>
30 #include "awk.h"
31 #include "awkgram.tab.h"
32 
33 extern YYSTYPE	yylval;	/* semantic value of the token being returned; the lexer fills in .i, .cp or .s */
34 extern bool	infunc;	/* consulted by yylex() and word(); defined outside this part of the file */
35 
36 int	lineno	= 1;	/* current source line: bumped when the lexer consumes a newline, decremented by unput('\n') */
37 int	bracecnt = 0;	/* incremented on '{', decremented on '}' ("extra }" if it goes negative) */
38 int	brackcnt  = 0;	/* same bookkeeping for '[' and ']' */
39 int	parencnt = 0;	/* same bookkeeping for '(' and ')' */
40 
41 typedef struct Keyword {	/* one entry of the reserved-word table searched by word() */
42 	const char *word;	/* spelling, compared with strcmp() */
43 	int	sub;	/* value word() stores in yylval.i */
44 	int	type;	/* token type word() returns for this spelling */
45 } Keyword;
46 
47 const Keyword keywords[] = {	/* entries are { spelling, yylval.i value, token type }; keep sorted in strcmp() order: word() binary-searches this table */
48 	{ "BEGIN",	XBEGIN,		XBEGIN },
49 	{ "END",	XEND,		XEND },
50 	{ "NF",		VARNF,		VARNF },
51 	{ "and",	FAND,		BLTIN },
52 	{ "atan2",	FATAN,		BLTIN },
53 	{ "break",	BREAK,		BREAK },
54 	{ "close",	CLOSE,		CLOSE },
55 	{ "compl",	FCOMPL,		BLTIN },
56 	{ "continue",	CONTINUE,	CONTINUE },
57 	{ "cos",	FCOS,		BLTIN },
58 	{ "delete",	DELETE,		DELETE },
59 	{ "do",		DO,		DO },
60 	{ "else",	ELSE,		ELSE },
61 	{ "exit",	EXIT,		EXIT },
62 	{ "exp",	FEXP,		BLTIN },
63 	{ "fflush",	FFLUSH,		BLTIN },
64 	{ "for",	FOR,		FOR },
65 	{ "func",	FUNC,		FUNC },
66 	{ "function",	FUNC,		FUNC },
67 	{ "gensub",	GENSUB,		GENSUB },
68 	{ "getline",	GETLINE,	GETLINE },
69 	{ "gsub",	GSUB,		GSUB },
70 	{ "if",		IF,		IF },
71 	{ "in",		IN,		IN },
72 	{ "index",	INDEX,		INDEX },
73 	{ "int",	FINT,		BLTIN },
74 	{ "length",	FLENGTH,	BLTIN },
75 	{ "log",	FLOG,		BLTIN },
76 	{ "lshift",	FLSHIFT,	BLTIN },
77 	{ "match",	MATCHFCN,	MATCHFCN },
78 	{ "next",	NEXT,		NEXT },
79 	{ "nextfile",	NEXTFILE,	NEXTFILE },
80 	{ "or",		FFOR,		BLTIN },
81 	{ "print",	PRINT,		PRINT },
82 	{ "printf",	PRINTF,		PRINTF },
83 	{ "rand",	FRAND,		BLTIN },
84 	{ "return",	RETURN,		RETURN },
85 	{ "rshift",	FRSHIFT,	BLTIN },
86 	{ "sin",	FSIN,		BLTIN },
87 	{ "split",	SPLIT,		SPLIT },
88 	{ "sprintf",	SPRINTF,	SPRINTF },
89 	{ "sqrt",	FSQRT,		BLTIN },
90 	{ "srand",	FSRAND,		BLTIN },
91 	{ "strftime",	FSTRFTIME,	BLTIN },
92 	{ "sub",	SUB,		SUB },
93 	{ "substr",	SUBSTR,		SUBSTR },
94 	{ "system",	FSYSTEM,	BLTIN },
95 	{ "systime",	FSYSTIME,	BLTIN },
96 	{ "tolower",	FTOLOWER,	BLTIN },
97 	{ "toupper",	FTOUPPER,	BLTIN },
98 	{ "while",	WHILE,		WHILE },
99 	{ "xor",	FXOR,		BLTIN },
100 };
101 
102 #define	RET(x)	{ if(dbg)printf("lex %s\n", tokname(x)); return(x); }	/* return token x from the enclosing function, printing its name first when dbg is set; note x is expanded twice */
103 
/*
 * peek: report the next input character without consuming it.
 * The character is read with input() and immediately handed back
 * to unput(), so the following input() call sees it again.
 */
static int peek(void)
{
	int ahead;

	ahead = input();
	unput(ahead);
	return ahead;
}
110 
111 static int gettok(char **pbuf, int *psz)	/* get next input token */
112 {
113 	int c, retc;
114 	char *buf = *pbuf;
115 	int sz = *psz;
116 	char *bp = buf;
117 
118 	c = input();
119 	if (c == 0)
120 		return 0;
121 	buf[0] = c;
122 	buf[1] = 0;
123 	if (!isalnum(c) && c != '.' && c != '_')
124 		return c;
125 
126 	*bp++ = c;
127 	if (isalpha(c) || c == '_') {	/* it's a varname */
128 		for ( ; (c = input()) != 0; ) {
129 			if (bp-buf >= sz)
130 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
131 					FATAL( "out of space for name %.10s...", buf );
132 			if (isalnum(c) || c == '_')
133 				*bp++ = c;
134 			else {
135 				*bp = 0;
136 				unput(c);
137 				break;
138 			}
139 		}
140 		*bp = 0;
141 		retc = 'a';	/* alphanumeric */
142 	} else {	/* maybe it's a number, but could be . */
143 		char *rem;
144 		/* read input until can't be a number */
145 		for ( ; (c = input()) != 0; ) {
146 			if (bp-buf >= sz)
147 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
148 					FATAL( "out of space for number %.10s...", buf );
149 			if (isdigit(c) || c == 'e' || c == 'E'
150 			  || c == '.' || c == '+' || c == '-')
151 				*bp++ = c;
152 			else {
153 				unput(c);
154 				break;
155 			}
156 		}
157 		*bp = 0;
158 		strtod(buf, &rem);	/* parse the number */
159 		if (rem == buf) {	/* it wasn't a valid number at all */
160 			buf[1] = 0;	/* return one character as token */
161 			retc = (uschar)buf[0];	/* character is its own type */
162 			unputstr(rem+1); /* put rest back for later */
163 		} else {	/* some prefix was a number */
164 			unputstr(rem);	/* put rest back for later */
165 			rem[0] = 0;	/* truncate buf after number part */
166 			retc = '0';	/* type is number */
167 		}
168 	}
169 	*pbuf = buf;
170 	*psz = sz;
171 	return retc;
172 }
173 
174 int	word(char *);	/* classify a name: keyword/builtin, ARG, CALL or VAR */
175 int	string(void);	/* read a string constant; called after the opening quote was consumed */
176 int	regexpr(void);	/* read a regular expression up to its closing slash */
177 bool	sc	= false;	/* true => return a } right now (set when yylex() answers '}' with ';' first) */
178 bool	reg	= false;	/* true => return a REGEXPR now (set by startreg()) */
179 
180 int yylex(void)
181 {
182 	int c;
183 	static char *buf = NULL;
184 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
185 
186 	if (buf == NULL && (buf = malloc(bufsize)) == NULL)
187 		FATAL( "out of space in yylex" );
188 	if (sc) {
189 		sc = false;
190 		RET('}');
191 	}
192 	if (reg) {
193 		reg = false;
194 		return regexpr();
195 	}
196 	for (;;) {
197 		c = gettok(&buf, &bufsize);
198 		if (c == 0)
199 			return 0;
200 		if (isalpha(c) || c == '_')
201 			return word(buf);
202 		if (isdigit(c)) {
203 			char *cp = tostring(buf);
204 			yylval.cp = setsymtab(buf, cp, atof(buf), CON|NUM, symtab);
205 			free(cp);
206 			/* should this also have STR set? */
207 			RET(NUMBER);
208 		}
209 
210 		yylval.i = c;
211 		switch (c) {
212 		case '\n':	/* {EOL} */
213 			lineno++;
214 			RET(NL);
215 		case '\r':	/* assume \n is coming */
216 		case ' ':	/* {WS}+ */
217 		case '\t':
218 			break;
219 		case '#':	/* #.* strip comments */
220 			while ((c = input()) != '\n' && c != 0)
221 				;
222 			unput(c);
223 			/*
224 			 * Next line is a hack, itcompensates for
225 			 * unput's treatment of \n.
226 			 */
227 			lineno++;
228 			break;
229 		case ';':
230 			RET(';');
231 		case '\\':
232 			if (peek() == '\n') {
233 				input();
234 				lineno++;
235 			} else if (peek() == '\r') {
236 				input(); input();	/* \n */
237 				lineno++;
238 			} else {
239 				RET(c);
240 			}
241 			break;
242 		case '&':
243 			if (peek() == '&') {
244 				input(); RET(AND);
245 			} else
246 				RET('&');
247 		case '|':
248 			if (peek() == '|') {
249 				input(); RET(BOR);
250 			} else
251 				RET('|');
252 		case '!':
253 			if (peek() == '=') {
254 				input(); yylval.i = NE; RET(NE);
255 			} else if (peek() == '~') {
256 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
257 			} else
258 				RET(NOT);
259 		case '~':
260 			yylval.i = MATCH;
261 			RET(MATCHOP);
262 		case '<':
263 			if (peek() == '=') {
264 				input(); yylval.i = LE; RET(LE);
265 			} else {
266 				yylval.i = LT; RET(LT);
267 			}
268 		case '=':
269 			if (peek() == '=') {
270 				input(); yylval.i = EQ; RET(EQ);
271 			} else {
272 				yylval.i = ASSIGN; RET(ASGNOP);
273 			}
274 		case '>':
275 			if (peek() == '=') {
276 				input(); yylval.i = GE; RET(GE);
277 			} else if (peek() == '>') {
278 				input(); yylval.i = APPEND; RET(APPEND);
279 			} else {
280 				yylval.i = GT; RET(GT);
281 			}
282 		case '+':
283 			if (peek() == '+') {
284 				input(); yylval.i = INCR; RET(INCR);
285 			} else if (peek() == '=') {
286 				input(); yylval.i = ADDEQ; RET(ASGNOP);
287 			} else
288 				RET('+');
289 		case '-':
290 			if (peek() == '-') {
291 				input(); yylval.i = DECR; RET(DECR);
292 			} else if (peek() == '=') {
293 				input(); yylval.i = SUBEQ; RET(ASGNOP);
294 			} else
295 				RET('-');
296 		case '*':
297 			if (peek() == '=') {	/* *= */
298 				input(); yylval.i = MULTEQ; RET(ASGNOP);
299 			} else if (peek() == '*') {	/* ** or **= */
300 				input();	/* eat 2nd * */
301 				if (peek() == '=') {
302 					input(); yylval.i = POWEQ; RET(ASGNOP);
303 				} else {
304 					RET(POWER);
305 				}
306 			} else
307 				RET('*');
308 		case '/':
309 			RET('/');
310 		case '%':
311 			if (peek() == '=') {
312 				input(); yylval.i = MODEQ; RET(ASGNOP);
313 			} else
314 				RET('%');
315 		case '^':
316 			if (peek() == '=') {
317 				input(); yylval.i = POWEQ; RET(ASGNOP);
318 			} else
319 				RET(POWER);
320 
321 		case '$':
322 			/* BUG: awkward, if not wrong */
323 			c = gettok(&buf, &bufsize);
324 			if (isalpha(c)) {
325 				if (strcmp(buf, "NF") == 0) {	/* very special */
326 					unputstr("(NF)");
327 					RET(INDIRECT);
328 				}
329 				c = peek();
330 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
331 					unputstr(buf);
332 					RET(INDIRECT);
333 				}
334 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
335 				RET(IVAR);
336 			} else if (c == 0) {	/*  */
337 				SYNTAX( "unexpected end of input after $" );
338 				RET(';');
339 			} else {
340 				unputstr(buf);
341 				RET(INDIRECT);
342 			}
343 
344 		case '}':
345 			if (--bracecnt < 0)
346 				SYNTAX( "extra }" );
347 			sc = true;
348 			RET(';');
349 		case ']':
350 			if (--brackcnt < 0)
351 				SYNTAX( "extra ]" );
352 			RET(']');
353 		case ')':
354 			if (--parencnt < 0)
355 				SYNTAX( "extra )" );
356 			RET(')');
357 		case '{':
358 			bracecnt++;
359 			RET('{');
360 		case '[':
361 			brackcnt++;
362 			RET('[');
363 		case '(':
364 			parencnt++;
365 			RET('(');
366 
367 		case '"':
368 			return string();	/* BUG: should be like tran.c ? */
369 
370 		default:
371 			RET(c);
372 		}
373 	}
374 }
375 
376 int string(void)
377 {
378 	int c, n;
379 	char *s, *bp;
380 	static char *buf = NULL;
381 	static int bufsz = 500;
382 
383 	if (buf == NULL && (buf = malloc(bufsz)) == NULL)
384 		FATAL("out of space for strings");
385 	for (bp = buf; (c = input()) != '"'; ) {
386 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
387 			FATAL("out of space for string %.10s...", buf);
388 		switch (c) {
389 		case '\n':
390 		case '\r':
391 		case 0:
392 			*bp = '\0';
393 			SYNTAX( "non-terminated string %.10s...", buf );
394 			if (c == 0)	/* hopeless */
395 				FATAL( "giving up" );
396 			lineno++;
397 			break;
398 		case '\\':
399 			c = input();
400 			switch (c) {
401 			case '\n': break;
402 			case '"': *bp++ = '"'; break;
403 			case 'n': *bp++ = '\n'; break;
404 			case 't': *bp++ = '\t'; break;
405 			case 'f': *bp++ = '\f'; break;
406 			case 'r': *bp++ = '\r'; break;
407 			case 'b': *bp++ = '\b'; break;
408 			case 'v': *bp++ = '\v'; break;
409 			case 'a': *bp++ = '\a'; break;
410 			case '\\': *bp++ = '\\'; break;
411 
412 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
413 			case '3': case '4': case '5': case '6': case '7':
414 				n = c - '0';
415 				if ((c = peek()) >= '0' && c < '8') {
416 					n = 8 * n + input() - '0';
417 					if ((c = peek()) >= '0' && c < '8')
418 						n = 8 * n + input() - '0';
419 				}
420 				*bp++ = n;
421 				break;
422 
423 			case 'x':	/* hex  \x0-9a-fA-F + */
424 			    {	char xbuf[100], *px;
425 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
426 					if (isdigit(c)
427 					 || (c >= 'a' && c <= 'f')
428 					 || (c >= 'A' && c <= 'F'))
429 						*px++ = c;
430 					else
431 						break;
432 				}
433 				*px = 0;
434 				unput(c);
435 	  			sscanf(xbuf, "%x", (unsigned int *) &n);
436 				*bp++ = n;
437 				break;
438 			    }
439 
440 			default:
441 				*bp++ = c;
442 				break;
443 			}
444 			break;
445 		default:
446 			*bp++ = c;
447 			break;
448 		}
449 	}
450 	*bp = 0;
451 	s = tostring(buf);
452 	*bp++ = ' '; *bp++ = '\0';
453 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
454 	free(s);
455 	RET(STRING);
456 }
457 
458 
459 static int binsearch(char *w, const Keyword *kp, int n)
460 {
461 	int cond, low, mid, high;
462 
463 	low = 0;
464 	high = n - 1;
465 	while (low <= high) {
466 		mid = (low + high) / 2;
467 		if ((cond = strcmp(w, kp[mid].word)) < 0)
468 			high = mid - 1;
469 		else if (cond > 0)
470 			low = mid + 1;
471 		else
472 			return mid;
473 	}
474 	return -1;
475 }
476 
477 int word(char *w)
478 {
479 	const Keyword *kp;
480 	int c, n;
481 
482 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
483 	if (n != -1) {	/* found in table */
484 		kp = keywords + n;
485 		yylval.i = kp->sub;
486 		switch (kp->type) {	/* special handling */
487 		case BLTIN:
488 			if (kp->sub == FSYSTEM && safe)
489 				SYNTAX( "system is unsafe" );
490 			RET(kp->type);
491 		case FUNC:
492 			if (infunc)
493 				SYNTAX( "illegal nested function" );
494 			RET(kp->type);
495 		case RETURN:
496 			if (!infunc)
497 				SYNTAX( "return not in function" );
498 			RET(kp->type);
499 		case VARNF:
500 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
501 			RET(VARNF);
502 		default:
503 			RET(kp->type);
504 		}
505 	}
506 	c = peek();	/* look for '(' */
507 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
508 		yylval.i = n;
509 		RET(ARG);
510 	} else {
511 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
512 		if (c == '(') {
513 			RET(CALL);
514 		} else {
515 			RET(VAR);
516 		}
517 	}
518 }
519 
520 void startreg(void)	/* next call to yylex will return a regular expression */
521 {
522 	reg = true;
523 }
524 
525 int regexpr(void)
526 {
527 	int c, openclass = 0;
528 	static char *buf = NULL;
529 	static int bufsz = 500;
530 	char *bp, *cstart;
531 
532 	if (buf == NULL && (buf = malloc(bufsz)) == NULL)
533 		FATAL("out of space for rex expr");
534 	bp = buf;
535 	for ( ; ((c = input()) != '/' || openclass > 0) && c != 0; ) {
536 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
537 			FATAL("out of space for reg expr %.10s...", buf);
538 		if (c == '\n') {
539 			*bp = '\0';
540 			SYNTAX( "newline in regular expression %.10s...", buf );
541 			unput('\n');
542 			break;
543 		} else if (c == '\\') {
544 			*bp++ = '\\';
545 			*bp++ = input();
546 		} else {
547 			/*
548 			 * POSIX requires a slash in a regexp to be escaped,
549 			 * other awks don't require it to be escaped inside
550 			 * a character class.
551 			 */
552 			if (!do_posix) {
553 				if (c == '[') {
554 					int nextc = peek();
555 					if (openclass == 0 || nextc == ':' ||
556 					    nextc == '.' || nextc == '=') {
557 						if (++openclass == 1)
558 							cstart = bp;
559 					}
560 				} else if (c == ']' && openclass > 0) {
561 					/*
562 					 * A ']' as the first char in a
563 					 * class is treated literally.
564 					 */
565 					if (cstart != bp - 1 &&
566 					    (cstart != bp - 2 || bp[-1] != '^'))
567 						openclass--;
568 				}
569 			}
570 			*bp++ = c;
571 		}
572 	}
573 	*bp = 0;
574 	if (c == 0)
575 		SYNTAX("non-terminated regular expression %.10s...", buf);
576 	yylval.s = tostring(buf);
577 	unput('/');
578 	RET(REGEXPR);
579 }
580 
581 /* low-level lexical stuff, sort of inherited from lex */
582 
583 char	ebuf[300];	/* ring of characters most recently returned by input() */
584 char	*ep = ebuf;	/* where input() stores its next character; stepped back by unput() */
585 char	yysbuf[100];	/* pushback buffer */
586 char	*yysptr = yysbuf;	/* one past the most recently pushed-back character; input() pops from here first */
587 FILE	*yyin = NULL;	/* not referenced in this part of the file */
588 
589 int input(void)	/* get next lexical input character */
590 {
591 	int c;
592 	extern char *lexprog;
593 
594 	if (yysptr > yysbuf)
595 		c = (uschar)*--yysptr;
596 	else if (lexprog != NULL) {	/* awk '...' */
597 		if ((c = (uschar)*lexprog) != 0)
598 			lexprog++;
599 	} else				/* awk -f ... */
600 		c = pgetc();
601 	if (c == EOF)
602 		c = 0;
603 	if (ep >= ebuf + sizeof ebuf)
604 		ep = ebuf;
605 	*ep = c;
606 	if (c != 0) {
607 		ep++;
608 	}
609 	return (c);
610 }
611 
612 void unput(int c)	/* put lexical character back on input */
613 {
614 	if (c == '\n')
615 		lineno--;
616 	if (yysptr >= yysbuf + sizeof(yysbuf))
617 		FATAL("pushed back too much: %.20s...", yysbuf);
618 	*yysptr++ = c;
619 	if (--ep < ebuf)
620 		ep = ebuf + sizeof(ebuf) - 1;
621 }
622 
/*
 * unputstr: put the string s back on the input.
 * Characters are pushed last-to-first so that subsequent input() calls
 * return them in their original order.
 */
void unputstr(const char *s)
{
	int pos = strlen(s);

	while (--pos >= 0)
		unput(s[pos]);
}
630