xref: /dragonfly/contrib/awk/lex.c (revision f9993810)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "awkgram.tab.h"
31 
32 extern YYSTYPE	yylval;
33 extern bool	infunc;
34 
35 int	lineno	= 1;
36 int	bracecnt = 0;
37 int	brackcnt  = 0;
38 int	parencnt = 0;
39 
/* One entry of the reserved-word table searched by word(). */
typedef struct Keyword {
	const char *word;	/* spelling; the key binsearch() compares against */
	int sub;		/* value word() stores in yylval.i */
	int type;		/* token code word() returns for this spelling */
} Keyword;
45 
46 const Keyword keywords[] = {	/* keep sorted: binary searched */
47 	{ "BEGIN",	XBEGIN,		XBEGIN },
48 	{ "END",	XEND,		XEND },
49 	{ "NF",		VARNF,		VARNF },
50 	{ "atan2",	FATAN,		BLTIN },
51 	{ "break",	BREAK,		BREAK },
52 	{ "close",	CLOSE,		CLOSE },
53 	{ "continue",	CONTINUE,	CONTINUE },
54 	{ "cos",	FCOS,		BLTIN },
55 	{ "delete",	DELETE,		DELETE },
56 	{ "do",		DO,		DO },
57 	{ "else",	ELSE,		ELSE },
58 	{ "exit",	EXIT,		EXIT },
59 	{ "exp",	FEXP,		BLTIN },
60 	{ "fflush",	FFLUSH,		BLTIN },
61 	{ "for",	FOR,		FOR },
62 	{ "func",	FUNC,		FUNC },
63 	{ "function",	FUNC,		FUNC },
64 	{ "getline",	GETLINE,	GETLINE },
65 	{ "gsub",	GSUB,		GSUB },
66 	{ "if",		IF,		IF },
67 	{ "in",		IN,		IN },
68 	{ "index",	INDEX,		INDEX },
69 	{ "int",	FINT,		BLTIN },
70 	{ "length",	FLENGTH,	BLTIN },
71 	{ "log",	FLOG,		BLTIN },
72 	{ "match",	MATCHFCN,	MATCHFCN },
73 	{ "next",	NEXT,		NEXT },
74 	{ "nextfile",	NEXTFILE,	NEXTFILE },
75 	{ "print",	PRINT,		PRINT },
76 	{ "printf",	PRINTF,		PRINTF },
77 	{ "rand",	FRAND,		BLTIN },
78 	{ "return",	RETURN,		RETURN },
79 	{ "sin",	FSIN,		BLTIN },
80 	{ "split",	SPLIT,		SPLIT },
81 	{ "sprintf",	SPRINTF,	SPRINTF },
82 	{ "sqrt",	FSQRT,		BLTIN },
83 	{ "srand",	FSRAND,		BLTIN },
84 	{ "sub",	SUB,		SUB },
85 	{ "substr",	SUBSTR,		SUBSTR },
86 	{ "system",	FSYSTEM,	BLTIN },
87 	{ "tolower",	FTOLOWER,	BLTIN },
88 	{ "toupper",	FTOUPPER,	BLTIN },
89 	{ "while",	WHILE,		WHILE },
90 };
91 
/*
 * RET(x): return token x from the enclosing function; when dbg is set,
 * first print the token's name as reported by tokname().
 * x may be evaluated twice, so it must have no side effects.
 * The do { } while (0) wrapper makes "RET(x);" a single statement, so it is
 * safe as the body of an if that is followed by an else.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
93 
/*
 * Look at the next input character without consuming it.
 * The character is read with input() and handed straight back to unput(),
 * so any bookkeeping unput() performs (such as its lineno adjustment for a
 * newline) happens here as well.
 */
static int peek(void)
{
	const int next = input();

	unput(next);
	return next;
}
100 
101 static int gettok(char **pbuf, int *psz)	/* get next input token */
102 {
103 	int c, retc;
104 	char *buf = *pbuf;
105 	int sz = *psz;
106 	char *bp = buf;
107 
108 	c = input();
109 	if (c == 0)
110 		return 0;
111 	buf[0] = c;
112 	buf[1] = 0;
113 	if (!isalnum(c) && c != '.' && c != '_')
114 		return c;
115 
116 	*bp++ = c;
117 	if (isalpha(c) || c == '_') {	/* it's a varname */
118 		for ( ; (c = input()) != 0; ) {
119 			if (bp-buf >= sz)
120 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
121 					FATAL( "out of space for name %.10s...", buf );
122 			if (isalnum(c) || c == '_')
123 				*bp++ = c;
124 			else {
125 				*bp = 0;
126 				unput(c);
127 				break;
128 			}
129 		}
130 		*bp = 0;
131 		retc = 'a';	/* alphanumeric */
132 	} else {	/* maybe it's a number, but could be . */
133 		char *rem;
134 		/* read input until can't be a number */
135 		for ( ; (c = input()) != 0; ) {
136 			if (bp-buf >= sz)
137 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
138 					FATAL( "out of space for number %.10s...", buf );
139 			if (isdigit(c) || c == 'e' || c == 'E'
140 			  || c == '.' || c == '+' || c == '-')
141 				*bp++ = c;
142 			else {
143 				unput(c);
144 				break;
145 			}
146 		}
147 		*bp = 0;
148 		strtod(buf, &rem);	/* parse the number */
149 		if (rem == buf) {	/* it wasn't a valid number at all */
150 			buf[1] = 0;	/* return one character as token */
151 			retc = (uschar)buf[0];	/* character is its own type */
152 			unputstr(rem+1); /* put rest back for later */
153 		} else {	/* some prefix was a number */
154 			unputstr(rem);	/* put rest back for later */
155 			rem[0] = 0;	/* truncate buf after number part */
156 			retc = '0';	/* type is number */
157 		}
158 	}
159 	*pbuf = buf;
160 	*psz = sz;
161 	return retc;
162 }
163 
/* Token helpers defined further down in this file. */
int word(char *w);
int string(void);
int regexpr(void);

/* One-shot flags examined, and cleared, at the top of yylex(). */
bool sc = false;	/* true => return a } right now */
bool reg = false;	/* true => return a REGEXPR now */
169 
170 int yylex(void)
171 {
172 	int c;
173 	static char *buf = NULL;
174 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
175 
176 	if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
177 		FATAL( "out of space in yylex" );
178 	if (sc) {
179 		sc = false;
180 		RET('}');
181 	}
182 	if (reg) {
183 		reg = false;
184 		return regexpr();
185 	}
186 	for (;;) {
187 		c = gettok(&buf, &bufsize);
188 		if (c == 0)
189 			return 0;
190 		if (isalpha(c) || c == '_')
191 			return word(buf);
192 		if (isdigit(c)) {
193 			char *cp = tostring(buf);
194 			double result;
195 
196 			if (is_number(cp, & result))
197 				yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
198 			else
199 				yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
200 			free(cp);
201 			/* should this also have STR set? */
202 			RET(NUMBER);
203 		}
204 
205 		yylval.i = c;
206 		switch (c) {
207 		case '\n':	/* {EOL} */
208 			lineno++;
209 			RET(NL);
210 		case '\r':	/* assume \n is coming */
211 		case ' ':	/* {WS}+ */
212 		case '\t':
213 			break;
214 		case '#':	/* #.* strip comments */
215 			while ((c = input()) != '\n' && c != 0)
216 				;
217 			unput(c);
218 			/*
219 			 * Next line is a hack, itcompensates for
220 			 * unput's treatment of \n.
221 			 */
222 			lineno++;
223 			break;
224 		case ';':
225 			RET(';');
226 		case '\\':
227 			if (peek() == '\n') {
228 				input();
229 				lineno++;
230 			} else if (peek() == '\r') {
231 				input(); input();	/* \n */
232 				lineno++;
233 			} else {
234 				RET(c);
235 			}
236 			break;
237 		case '&':
238 			if (peek() == '&') {
239 				input(); RET(AND);
240 			} else
241 				RET('&');
242 		case '|':
243 			if (peek() == '|') {
244 				input(); RET(BOR);
245 			} else
246 				RET('|');
247 		case '!':
248 			if (peek() == '=') {
249 				input(); yylval.i = NE; RET(NE);
250 			} else if (peek() == '~') {
251 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
252 			} else
253 				RET(NOT);
254 		case '~':
255 			yylval.i = MATCH;
256 			RET(MATCHOP);
257 		case '<':
258 			if (peek() == '=') {
259 				input(); yylval.i = LE; RET(LE);
260 			} else {
261 				yylval.i = LT; RET(LT);
262 			}
263 		case '=':
264 			if (peek() == '=') {
265 				input(); yylval.i = EQ; RET(EQ);
266 			} else {
267 				yylval.i = ASSIGN; RET(ASGNOP);
268 			}
269 		case '>':
270 			if (peek() == '=') {
271 				input(); yylval.i = GE; RET(GE);
272 			} else if (peek() == '>') {
273 				input(); yylval.i = APPEND; RET(APPEND);
274 			} else {
275 				yylval.i = GT; RET(GT);
276 			}
277 		case '+':
278 			if (peek() == '+') {
279 				input(); yylval.i = INCR; RET(INCR);
280 			} else if (peek() == '=') {
281 				input(); yylval.i = ADDEQ; RET(ASGNOP);
282 			} else
283 				RET('+');
284 		case '-':
285 			if (peek() == '-') {
286 				input(); yylval.i = DECR; RET(DECR);
287 			} else if (peek() == '=') {
288 				input(); yylval.i = SUBEQ; RET(ASGNOP);
289 			} else
290 				RET('-');
291 		case '*':
292 			if (peek() == '=') {	/* *= */
293 				input(); yylval.i = MULTEQ; RET(ASGNOP);
294 			} else if (peek() == '*') {	/* ** or **= */
295 				input();	/* eat 2nd * */
296 				if (peek() == '=') {
297 					input(); yylval.i = POWEQ; RET(ASGNOP);
298 				} else {
299 					RET(POWER);
300 				}
301 			} else
302 				RET('*');
303 		case '/':
304 			RET('/');
305 		case '%':
306 			if (peek() == '=') {
307 				input(); yylval.i = MODEQ; RET(ASGNOP);
308 			} else
309 				RET('%');
310 		case '^':
311 			if (peek() == '=') {
312 				input(); yylval.i = POWEQ; RET(ASGNOP);
313 			} else
314 				RET(POWER);
315 
316 		case '$':
317 			/* BUG: awkward, if not wrong */
318 			c = gettok(&buf, &bufsize);
319 			if (isalpha(c)) {
320 				if (strcmp(buf, "NF") == 0) {	/* very special */
321 					unputstr("(NF)");
322 					RET(INDIRECT);
323 				}
324 				c = peek();
325 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
326 					unputstr(buf);
327 					RET(INDIRECT);
328 				}
329 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
330 				RET(IVAR);
331 			} else if (c == 0) {	/*  */
332 				SYNTAX( "unexpected end of input after $" );
333 				RET(';');
334 			} else {
335 				unputstr(buf);
336 				RET(INDIRECT);
337 			}
338 
339 		case '}':
340 			if (--bracecnt < 0)
341 				SYNTAX( "extra }" );
342 			sc = true;
343 			RET(';');
344 		case ']':
345 			if (--brackcnt < 0)
346 				SYNTAX( "extra ]" );
347 			RET(']');
348 		case ')':
349 			if (--parencnt < 0)
350 				SYNTAX( "extra )" );
351 			RET(')');
352 		case '{':
353 			bracecnt++;
354 			RET('{');
355 		case '[':
356 			brackcnt++;
357 			RET('[');
358 		case '(':
359 			parencnt++;
360 			RET('(');
361 
362 		case '"':
363 			return string();	/* BUG: should be like tran.c ? */
364 
365 		default:
366 			RET(c);
367 		}
368 	}
369 }
370 
371 int string(void)
372 {
373 	int c, n;
374 	char *s, *bp;
375 	static char *buf = NULL;
376 	static int bufsz = 500;
377 
378 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
379 		FATAL("out of space for strings");
380 	for (bp = buf; (c = input()) != '"'; ) {
381 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
382 			FATAL("out of space for string %.10s...", buf);
383 		switch (c) {
384 		case '\n':
385 		case '\r':
386 		case 0:
387 			*bp = '\0';
388 			SYNTAX( "non-terminated string %.10s...", buf );
389 			if (c == 0)	/* hopeless */
390 				FATAL( "giving up" );
391 			lineno++;
392 			break;
393 		case '\\':
394 			c = input();
395 			switch (c) {
396 			case '\n': break;
397 			case '"': *bp++ = '"'; break;
398 			case 'n': *bp++ = '\n'; break;
399 			case 't': *bp++ = '\t'; break;
400 			case 'f': *bp++ = '\f'; break;
401 			case 'r': *bp++ = '\r'; break;
402 			case 'b': *bp++ = '\b'; break;
403 			case 'v': *bp++ = '\v'; break;
404 			case 'a': *bp++ = '\a'; break;
405 			case '\\': *bp++ = '\\'; break;
406 
407 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
408 			case '3': case '4': case '5': case '6': case '7':
409 				n = c - '0';
410 				if ((c = peek()) >= '0' && c < '8') {
411 					n = 8 * n + input() - '0';
412 					if ((c = peek()) >= '0' && c < '8')
413 						n = 8 * n + input() - '0';
414 				}
415 				*bp++ = n;
416 				break;
417 
418 			case 'x':	/* hex  \x0-9a-fA-F + */
419 			    {	char xbuf[100], *px;
420 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
421 					if (isdigit(c)
422 					 || (c >= 'a' && c <= 'f')
423 					 || (c >= 'A' && c <= 'F'))
424 						*px++ = c;
425 					else
426 						break;
427 				}
428 				*px = 0;
429 				unput(c);
430 	  			sscanf(xbuf, "%x", (unsigned int *) &n);
431 				*bp++ = n;
432 				break;
433 			    }
434 
435 			default:
436 				*bp++ = c;
437 				break;
438 			}
439 			break;
440 		default:
441 			*bp++ = c;
442 			break;
443 		}
444 	}
445 	*bp = 0;
446 	s = tostring(buf);
447 	*bp++ = ' '; *bp++ = '\0';
448 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
449 	free(s);
450 	RET(STRING);
451 }
452 
453 
454 static int binsearch(char *w, const Keyword *kp, int n)
455 {
456 	int cond, low, mid, high;
457 
458 	low = 0;
459 	high = n - 1;
460 	while (low <= high) {
461 		mid = (low + high) / 2;
462 		if ((cond = strcmp(w, kp[mid].word)) < 0)
463 			high = mid - 1;
464 		else if (cond > 0)
465 			low = mid + 1;
466 		else
467 			return mid;
468 	}
469 	return -1;
470 }
471 
472 int word(char *w)
473 {
474 	const Keyword *kp;
475 	int c, n;
476 
477 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
478 	if (n != -1) {	/* found in table */
479 		kp = keywords + n;
480 		yylval.i = kp->sub;
481 		switch (kp->type) {	/* special handling */
482 		case BLTIN:
483 			if (kp->sub == FSYSTEM && safe)
484 				SYNTAX( "system is unsafe" );
485 			RET(kp->type);
486 		case FUNC:
487 			if (infunc)
488 				SYNTAX( "illegal nested function" );
489 			RET(kp->type);
490 		case RETURN:
491 			if (!infunc)
492 				SYNTAX( "return not in function" );
493 			RET(kp->type);
494 		case VARNF:
495 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
496 			RET(VARNF);
497 		default:
498 			RET(kp->type);
499 		}
500 	}
501 	c = peek();	/* look for '(' */
502 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
503 		yylval.i = n;
504 		RET(ARG);
505 	} else {
506 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
507 		if (c == '(') {
508 			RET(CALL);
509 		} else {
510 			RET(VAR);
511 		}
512 	}
513 }
514 
515 void startreg(void)	/* next call to yylex will return a regular expression */
516 {
517 	reg = true;
518 }
519 
520 int regexpr(void)
521 {
522 	int c;
523 	static char *buf = NULL;
524 	static int bufsz = 500;
525 	char *bp;
526 
527 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
528 		FATAL("out of space for reg expr");
529 	bp = buf;
530 	for ( ; (c = input()) != '/' && c != 0; ) {
531 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
532 			FATAL("out of space for reg expr %.10s...", buf);
533 		if (c == '\n') {
534 			*bp = '\0';
535 			SYNTAX( "newline in regular expression %.10s...", buf );
536 			unput('\n');
537 			break;
538 		} else if (c == '\\') {
539 			*bp++ = '\\';
540 			*bp++ = input();
541 		} else {
542 			*bp++ = c;
543 		}
544 	}
545 	*bp = 0;
546 	if (c == 0)
547 		SYNTAX("non-terminated regular expression %.10s...", buf);
548 	yylval.s = buf;
549 	unput('/');
550 	RET(REGEXPR);
551 }
552 
553 /* low-level lexical stuff, sort of inherited from lex */
554 
/* Record of characters handed out by input(); ep wraps around at the end. */
char ebuf[300];
char *ep = ebuf;	/* where input() stores the next character */

/* Pushback stack: unput() pushes at yysptr, input() pops from it first. */
char yysbuf[100];
char *yysptr = yysbuf;	/* one past the most recently pushed character */

FILE *yyin = NULL;	/* not referenced by the routines in this file */
560 
561 int input(void)	/* get next lexical input character */
562 {
563 	int c;
564 	extern char *lexprog;
565 
566 	if (yysptr > yysbuf)
567 		c = (uschar)*--yysptr;
568 	else if (lexprog != NULL) {	/* awk '...' */
569 		if ((c = (uschar)*lexprog) != 0)
570 			lexprog++;
571 	} else				/* awk -f ... */
572 		c = pgetc();
573 	if (c == EOF)
574 		c = 0;
575 	if (ep >= ebuf + sizeof ebuf)
576 		ep = ebuf;
577 	*ep = c;
578 	if (c != 0) {
579 		ep++;
580 	}
581 	return (c);
582 }
583 
584 void unput(int c)	/* put lexical character back on input */
585 {
586 	if (c == '\n')
587 		lineno--;
588 	if (yysptr >= yysbuf + sizeof(yysbuf))
589 		FATAL("pushed back too much: %.20s...", yysbuf);
590 	*yysptr++ = c;
591 	if (--ep < ebuf)
592 		ep = ebuf + sizeof(ebuf) - 1;
593 }
594 
/*
 * unputstr: push the whole string s back onto the input.
 * Characters are pushed last-to-first so that subsequent input() calls
 * return them in their original order.
 */
void unputstr(const char *s)
{
	size_t left = strlen(s);

	while (left > 0)
		unput(s[--left]);
}
602