xref: /dragonfly/contrib/awk/lex.c (revision 35e996c9)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "awkgram.tab.h"
31 
32 extern YYSTYPE	yylval;
33 extern bool	infunc;
34 
35 int	lineno	= 1;
36 int	bracecnt = 0;
37 int	brackcnt  = 0;
38 int	parencnt = 0;
39 
/* One row of the reserved-word table searched by word(). */
struct Keyword {
	const char *word;	/* spelling matched against the scanned name */
	int	sub;		/* stored into yylval.i when the word is found */
	int	type;		/* token type returned for the word */
};
typedef struct Keyword Keyword;
45 
46 const Keyword keywords[] = {	/* keep sorted: binary searched */
47 	{ "BEGIN",	XBEGIN,		XBEGIN },
48 	{ "END",	XEND,		XEND },
49 	{ "NF",		VARNF,		VARNF },
50 	{ "atan2",	FATAN,		BLTIN },
51 	{ "break",	BREAK,		BREAK },
52 	{ "close",	CLOSE,		CLOSE },
53 	{ "continue",	CONTINUE,	CONTINUE },
54 	{ "cos",	FCOS,		BLTIN },
55 	{ "delete",	DELETE,		DELETE },
56 	{ "do",		DO,		DO },
57 	{ "else",	ELSE,		ELSE },
58 	{ "exit",	EXIT,		EXIT },
59 	{ "exp",	FEXP,		BLTIN },
60 	{ "fflush",	FFLUSH,		BLTIN },
61 	{ "for",	FOR,		FOR },
62 	{ "func",	FUNC,		FUNC },
63 	{ "function",	FUNC,		FUNC },
64 	{ "getline",	GETLINE,	GETLINE },
65 	{ "gsub",	GSUB,		GSUB },
66 	{ "if",		IF,		IF },
67 	{ "in",		IN,		IN },
68 	{ "index",	INDEX,		INDEX },
69 	{ "int",	FINT,		BLTIN },
70 	{ "length",	FLENGTH,	BLTIN },
71 	{ "log",	FLOG,		BLTIN },
72 	{ "match",	MATCHFCN,	MATCHFCN },
73 	{ "next",	NEXT,		NEXT },
74 	{ "nextfile",	NEXTFILE,	NEXTFILE },
75 	{ "print",	PRINT,		PRINT },
76 	{ "printf",	PRINTF,		PRINTF },
77 	{ "rand",	FRAND,		BLTIN },
78 	{ "return",	RETURN,		RETURN },
79 	{ "sin",	FSIN,		BLTIN },
80 	{ "split",	SPLIT,		SPLIT },
81 	{ "sprintf",	SPRINTF,	SPRINTF },
82 	{ "sqrt",	FSQRT,		BLTIN },
83 	{ "srand",	FSRAND,		BLTIN },
84 	{ "sub",	SUB,		SUB },
85 	{ "substr",	SUBSTR,		SUBSTR },
86 	{ "system",	FSYSTEM,	BLTIN },
87 	{ "tolower",	FTOLOWER,	BLTIN },
88 	{ "toupper",	FTOUPPER,	BLTIN },
89 	{ "while",	WHILE,		WHILE },
90 };
91 
/*
 * Return token x from the enclosing function, printing its name first when
 * dbg is set.  The do/while(0) wrapper makes the expansion a single
 * statement, so "if (...) RET(a); else RET(b);" parses as written.
 * x is expanded twice and therefore must not have side effects.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
93 
/*
 * Look at the next input character without consuming it: the character is
 * read with input() and immediately handed back through unput().
 */
static int peek(void)
{
	const int next = input();

	unput(next);
	return next;
}
100 
101 static int gettok(char **pbuf, int *psz)	/* get next input token */
102 {
103 	int c, retc;
104 	char *buf = *pbuf;
105 	int sz = *psz;
106 	char *bp = buf;
107 
108 	c = input();
109 	if (c == 0)
110 		return 0;
111 	buf[0] = c;
112 	buf[1] = 0;
113 	if (!isalnum(c) && c != '.' && c != '_')
114 		return c;
115 
116 	*bp++ = c;
117 	if (isalpha(c) || c == '_') {	/* it's a varname */
118 		for ( ; (c = input()) != 0; ) {
119 			if (bp-buf >= sz)
120 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
121 					FATAL( "out of space for name %.10s...", buf );
122 			if (isalnum(c) || c == '_')
123 				*bp++ = c;
124 			else {
125 				*bp = 0;
126 				unput(c);
127 				break;
128 			}
129 		}
130 		*bp = 0;
131 		retc = 'a';	/* alphanumeric */
132 	} else {	/* maybe it's a number, but could be . */
133 		char *rem;
134 		/* read input until can't be a number */
135 		for ( ; (c = input()) != 0; ) {
136 			if (bp-buf >= sz)
137 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
138 					FATAL( "out of space for number %.10s...", buf );
139 			if (isdigit(c) || c == 'e' || c == 'E'
140 			  || c == '.' || c == '+' || c == '-')
141 				*bp++ = c;
142 			else {
143 				unput(c);
144 				break;
145 			}
146 		}
147 		*bp = 0;
148 		strtod(buf, &rem);	/* parse the number */
149 		if (rem == buf) {	/* it wasn't a valid number at all */
150 			buf[1] = 0;	/* return one character as token */
151 			retc = (uschar)buf[0];	/* character is its own type */
152 			unputstr(rem+1); /* put rest back for later */
153 		} else {	/* some prefix was a number */
154 			unputstr(rem);	/* put rest back for later */
155 			rem[0] = 0;	/* truncate buf after number part */
156 			retc = '0';	/* type is number */
157 		}
158 	}
159 	*pbuf = buf;
160 	*psz = sz;
161 	return retc;
162 }
163 
/* Scanners defined later in this file that yylex() hands work to. */
int	regexpr(void);
int	string(void);
int	word(char *w);

/* One-shot flags tested at the top of yylex(). */
bool	reg = false;	/* true => return a REGEXPR now */
bool	sc = false;	/* true => return a } right now */
169 
170 int yylex(void)
171 {
172 	int c;
173 	static char *buf = NULL;
174 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
175 
176 	if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
177 		FATAL( "out of space in yylex" );
178 	if (sc) {
179 		sc = false;
180 		RET('}');
181 	}
182 	if (reg) {
183 		reg = false;
184 		return regexpr();
185 	}
186 	for (;;) {
187 		c = gettok(&buf, &bufsize);
188 		if (c == 0)
189 			return 0;
190 		if (isalpha(c) || c == '_')
191 			return word(buf);
192 		if (isdigit(c)) {
193 			char *cp = tostring(buf);
194 			double result;
195 
196 			if (is_number(cp, & result))
197 				yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
198 			else
199 				yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
200 			free(cp);
201 			/* should this also have STR set? */
202 			RET(NUMBER);
203 		}
204 
205 		yylval.i = c;
206 		switch (c) {
207 		case '\n':	/* {EOL} */
208 			lineno++;
209 			RET(NL);
210 		case '\r':	/* assume \n is coming */
211 		case ' ':	/* {WS}+ */
212 		case '\t':
213 			break;
214 		case '#':	/* #.* strip comments */
215 			while ((c = input()) != '\n' && c != 0)
216 				;
217 			unput(c);
218 			/*
219 			 * Next line is a hack, itcompensates for
220 			 * unput's treatment of \n.
221 			 */
222 			lineno++;
223 			break;
224 		case ';':
225 			RET(';');
226 		case '\\':
227 			if (peek() == '\n') {
228 				input();
229 				lineno++;
230 			} else if (peek() == '\r') {
231 				input(); input();	/* \n */
232 				lineno++;
233 			} else {
234 				RET(c);
235 			}
236 			break;
237 		case '&':
238 			if (peek() == '&') {
239 				input(); RET(AND);
240 			} else
241 				RET('&');
242 		case '|':
243 			if (peek() == '|') {
244 				input(); RET(BOR);
245 			} else
246 				RET('|');
247 		case '!':
248 			if (peek() == '=') {
249 				input(); yylval.i = NE; RET(NE);
250 			} else if (peek() == '~') {
251 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
252 			} else
253 				RET(NOT);
254 		case '~':
255 			yylval.i = MATCH;
256 			RET(MATCHOP);
257 		case '<':
258 			if (peek() == '=') {
259 				input(); yylval.i = LE; RET(LE);
260 			} else {
261 				yylval.i = LT; RET(LT);
262 			}
263 		case '=':
264 			if (peek() == '=') {
265 				input(); yylval.i = EQ; RET(EQ);
266 			} else {
267 				yylval.i = ASSIGN; RET(ASGNOP);
268 			}
269 		case '>':
270 			if (peek() == '=') {
271 				input(); yylval.i = GE; RET(GE);
272 			} else if (peek() == '>') {
273 				input(); yylval.i = APPEND; RET(APPEND);
274 			} else {
275 				yylval.i = GT; RET(GT);
276 			}
277 		case '+':
278 			if (peek() == '+') {
279 				input(); yylval.i = INCR; RET(INCR);
280 			} else if (peek() == '=') {
281 				input(); yylval.i = ADDEQ; RET(ASGNOP);
282 			} else
283 				RET('+');
284 		case '-':
285 			if (peek() == '-') {
286 				input(); yylval.i = DECR; RET(DECR);
287 			} else if (peek() == '=') {
288 				input(); yylval.i = SUBEQ; RET(ASGNOP);
289 			} else
290 				RET('-');
291 		case '*':
292 			if (peek() == '=') {	/* *= */
293 				input(); yylval.i = MULTEQ; RET(ASGNOP);
294 			} else if (peek() == '*') {	/* ** or **= */
295 				input();	/* eat 2nd * */
296 				if (peek() == '=') {
297 					input(); yylval.i = POWEQ; RET(ASGNOP);
298 				} else {
299 					RET(POWER);
300 				}
301 			} else
302 				RET('*');
303 		case '/':
304 			RET('/');
305 		case '%':
306 			if (peek() == '=') {
307 				input(); yylval.i = MODEQ; RET(ASGNOP);
308 			} else
309 				RET('%');
310 		case '^':
311 			if (peek() == '=') {
312 				input(); yylval.i = POWEQ; RET(ASGNOP);
313 			} else
314 				RET(POWER);
315 
316 		case '$':
317 			/* BUG: awkward, if not wrong */
318 			c = gettok(&buf, &bufsize);
319 			if (isalpha(c)) {
320 				if (strcmp(buf, "NF") == 0) {	/* very special */
321 					unputstr("(NF)");
322 					RET(INDIRECT);
323 				}
324 				c = peek();
325 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
326 					unputstr(buf);
327 					RET(INDIRECT);
328 				}
329 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
330 				RET(IVAR);
331 			} else if (c == 0) {	/*  */
332 				SYNTAX( "unexpected end of input after $" );
333 				RET(';');
334 			} else {
335 				unputstr(buf);
336 				RET(INDIRECT);
337 			}
338 
339 		case '}':
340 			if (--bracecnt < 0)
341 				SYNTAX( "extra }" );
342 			sc = true;
343 			RET(';');
344 		case ']':
345 			if (--brackcnt < 0)
346 				SYNTAX( "extra ]" );
347 			RET(']');
348 		case ')':
349 			if (--parencnt < 0)
350 				SYNTAX( "extra )" );
351 			RET(')');
352 		case '{':
353 			bracecnt++;
354 			RET('{');
355 		case '[':
356 			brackcnt++;
357 			RET('[');
358 		case '(':
359 			parencnt++;
360 			RET('(');
361 
362 		case '"':
363 			return string();	/* BUG: should be like tran.c ? */
364 
365 		default:
366 			RET(c);
367 		}
368 	}
369 }
370 
371 extern int runetochar(char *str, int c);
372 
373 int string(void)
374 {
375 	int c, n;
376 	char *s, *bp;
377 	static char *buf = NULL;
378 	static int bufsz = 500;
379 
380 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
381 		FATAL("out of space for strings");
382 	for (bp = buf; (c = input()) != '"'; ) {
383 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
384 			FATAL("out of space for string %.10s...", buf);
385 		switch (c) {
386 		case '\n':
387 		case '\r':
388 		case 0:
389 			*bp = '\0';
390 			SYNTAX( "non-terminated string %.10s...", buf );
391 			if (c == 0)	/* hopeless */
392 				FATAL( "giving up" );
393 			lineno++;
394 			break;
395 		case '\\':
396 			c = input();
397 			switch (c) {
398 			case '\n': break;
399 			case '"': *bp++ = '"'; break;
400 			case 'n': *bp++ = '\n'; break;
401 			case 't': *bp++ = '\t'; break;
402 			case 'f': *bp++ = '\f'; break;
403 			case 'r': *bp++ = '\r'; break;
404 			case 'b': *bp++ = '\b'; break;
405 			case 'v': *bp++ = '\v'; break;
406 			case 'a': *bp++ = '\a'; break;
407 			case '\\': *bp++ = '\\'; break;
408 
409 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
410 			case '3': case '4': case '5': case '6': case '7':
411 				n = c - '0';
412 				if ((c = peek()) >= '0' && c < '8') {
413 					n = 8 * n + input() - '0';
414 					if ((c = peek()) >= '0' && c < '8')
415 						n = 8 * n + input() - '0';
416 				}
417 				*bp++ = n;
418 				break;
419 
420 			case 'x':	/* hex  \x0-9a-fA-F (exactly two) */
421 			    {
422 				int i;
423 
424 				if (!isxdigit(peek())) {
425 					unput(c);
426 					break;
427 				}
428 				n = 0;
429 				for (i = 0; i < 2; i++) {
430 					c = input();
431 					if (c == 0)
432 						break;
433 					if (isxdigit(c)) {
434 						c = tolower(c);
435 						n *= 16;
436 						if (isdigit(c))
437 							n += (c - '0');
438 						else
439 							n += 10 + (c - 'a');
440 					} else {
441 						unput(c);
442 						break;
443 					}
444 				}
445 				if (i)
446 					*bp++ = n;
447 				break;
448 			    }
449 
450 			case 'u':	/* utf  \u0-9a-fA-F (1..8) */
451 			    {
452 				int i;
453 
454 				n = 0;
455 				for (i = 0; i < 8; i++) {
456 					c = input();
457 					if (!isxdigit(c) || c == 0)
458 						break;
459 					c = tolower(c);
460 					n *= 16;
461 					if (isdigit(c))
462 						n += (c - '0');
463 					else
464 						n += 10 + (c - 'a');
465 				}
466 				unput(c);
467 				bp += runetochar(bp, n);
468 				break;
469 			    }
470 
471 			default:
472 				*bp++ = c;
473 				break;
474 			}
475 			break;
476 		default:
477 			*bp++ = c;
478 			break;
479 		}
480 	}
481 	*bp = 0;
482 	s = tostring(buf);
483 	*bp++ = ' '; *bp++ = '\0';
484 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
485 	free(s);
486 	RET(STRING);
487 }
488 
489 
490 static int binsearch(char *w, const Keyword *kp, int n)
491 {
492 	int cond, low, mid, high;
493 
494 	low = 0;
495 	high = n - 1;
496 	while (low <= high) {
497 		mid = (low + high) / 2;
498 		if ((cond = strcmp(w, kp[mid].word)) < 0)
499 			high = mid - 1;
500 		else if (cond > 0)
501 			low = mid + 1;
502 		else
503 			return mid;
504 	}
505 	return -1;
506 }
507 
508 int word(char *w)
509 {
510 	const Keyword *kp;
511 	int c, n;
512 
513 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
514 	if (n != -1) {	/* found in table */
515 		kp = keywords + n;
516 		yylval.i = kp->sub;
517 		switch (kp->type) {	/* special handling */
518 		case BLTIN:
519 			if (kp->sub == FSYSTEM && safe)
520 				SYNTAX( "system is unsafe" );
521 			RET(kp->type);
522 		case FUNC:
523 			if (infunc)
524 				SYNTAX( "illegal nested function" );
525 			RET(kp->type);
526 		case RETURN:
527 			if (!infunc)
528 				SYNTAX( "return not in function" );
529 			RET(kp->type);
530 		case VARNF:
531 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
532 			RET(VARNF);
533 		default:
534 			RET(kp->type);
535 		}
536 	}
537 	c = peek();	/* look for '(' */
538 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
539 		yylval.i = n;
540 		RET(ARG);
541 	} else {
542 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
543 		if (c == '(') {
544 			RET(CALL);
545 		} else {
546 			RET(VAR);
547 		}
548 	}
549 }
550 
551 void startreg(void)	/* next call to yylex will return a regular expression */
552 {
553 	reg = true;
554 }
555 
556 int regexpr(void)
557 {
558 	int c;
559 	static char *buf = NULL;
560 	static int bufsz = 500;
561 	char *bp;
562 
563 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
564 		FATAL("out of space for reg expr");
565 	bp = buf;
566 	for ( ; (c = input()) != '/' && c != 0; ) {
567 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
568 			FATAL("out of space for reg expr %.10s...", buf);
569 		if (c == '\n') {
570 			*bp = '\0';
571 			SYNTAX( "newline in regular expression %.10s...", buf );
572 			unput('\n');
573 			break;
574 		} else if (c == '\\') {
575 			*bp++ = '\\';
576 			*bp++ = input();
577 		} else {
578 			*bp++ = c;
579 		}
580 	}
581 	*bp = 0;
582 	if (c == 0)
583 		SYNTAX("non-terminated regular expression %.10s...", buf);
584 	yylval.s = tostring(buf);
585 	unput('/');
586 	RET(REGEXPR);
587 }
588 
589 /* low-level lexical stuff, sort of inherited from lex */
590 
/* Pushback stack: unput() pushes at yysptr, input() pops from it first. */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;

/* Circular record of characters handed out by input(); ep is the next
 * slot to write and is stepped back by unput(). */
char	ebuf[300];
char	*ep = ebuf;

/* Not referenced by the lexer code visible in this file. */
FILE	*yyin = NULL;
596 
597 int input(void)	/* get next lexical input character */
598 {
599 	int c;
600 	extern char *lexprog;
601 
602 	if (yysptr > yysbuf)
603 		c = (uschar)*--yysptr;
604 	else if (lexprog != NULL) {	/* awk '...' */
605 		if ((c = (uschar)*lexprog) != 0)
606 			lexprog++;
607 	} else				/* awk -f ... */
608 		c = pgetc();
609 	if (c == EOF)
610 		c = 0;
611 	if (ep >= ebuf + sizeof ebuf)
612 		ep = ebuf;
613 	*ep = c;
614 	if (c != 0) {
615 		ep++;
616 	}
617 	return (c);
618 }
619 
620 void unput(int c)	/* put lexical character back on input */
621 {
622 	if (c == '\n')
623 		lineno--;
624 	if (yysptr >= yysbuf + sizeof(yysbuf))
625 		FATAL("pushed back too much: %.20s...", yysbuf);
626 	*yysptr++ = c;
627 	if (--ep < ebuf)
628 		ep = ebuf + sizeof(ebuf) - 1;
629 }
630 
/*
 * unputstr - put a whole string back on the input.
 * Characters are pushed last-to-first so that input() returns them in
 * their original order.
 */
void unputstr(const char *s)
{
	int remaining = strlen(s);

	while (remaining-- > 0)
		unput(s[remaining]);
}
638