xref: /freebsd/contrib/one-true-awk/lex.c (revision 4f52dfbb)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "ytab.h"
31 
extern YYSTYPE	yylval;	/* semantic value of the token being returned; filled in by yylex(), string(), word(), regexpr() */
extern int	infunc;	/* defined elsewhere; NOTE(review): presumably nonzero while a function body is being parsed -- confirm in the grammar */

int	lineno	= 1;	/* current input line: input() adds one per '\n' read, unput() takes it back */
int	bracecnt = 0;	/* '{' seen minus '}' seen; yylex() reports "extra }" when it goes negative */
int	brackcnt  = 0;	/* '[' seen minus ']' seen; yylex() reports "extra ]" when it goes negative */
int	parencnt = 0;	/* '(' seen minus ')' seen; yylex() reports "extra )" when it goes negative */
39 
/* One row of the reserved-word table searched by binsearch(). */
struct Keyword {
	const char *word;	/* spelling as it appears in program text */
	int	sub;		/* value word() stores in yylval.i */
	int	type;		/* token type word() returns */
};
typedef struct Keyword Keyword;
45 
/*
 * Reserved words and built-in function names.
 * Each entry is { spelling, sub, type }: word() stores `sub` in yylval.i
 * and returns `type` as the token.  Built-in functions share the token
 * type BLTIN and are told apart by `sub`.
 * The table is looked up with binsearch(), which compares with strcmp(),
 * so entries must stay in strcmp() order (upper case sorts before lower).
 */
Keyword keywords[] ={	/* keep sorted: binary searched */
	{ "BEGIN",	XBEGIN,		XBEGIN },
	{ "END",	XEND,		XEND },
	{ "NF",		VARNF,		VARNF },
	{ "and",	FAND,		BLTIN },
	{ "atan2",	FATAN,		BLTIN },
	{ "break",	BREAK,		BREAK },
	{ "close",	CLOSE,		CLOSE },
	{ "compl",	FCOMPL,		BLTIN },
	{ "continue",	CONTINUE,	CONTINUE },
	{ "cos",	FCOS,		BLTIN },
	{ "delete",	DELETE,		DELETE },
	{ "do",		DO,		DO },
	{ "else",	ELSE,		ELSE },
	{ "exit",	EXIT,		EXIT },
	{ "exp",	FEXP,		BLTIN },
	{ "fflush",	FFLUSH,		BLTIN },
	{ "for",	FOR,		FOR },
	{ "func",	FUNC,		FUNC },
	{ "function",	FUNC,		FUNC },
	{ "getline",	GETLINE,	GETLINE },
	{ "gsub",	GSUB,		GSUB },
	{ "if",		IF,		IF },
	{ "in",		IN,		IN },
	{ "index",	INDEX,		INDEX },
	{ "int",	FINT,		BLTIN },
	{ "length",	FLENGTH,	BLTIN },
	{ "log",	FLOG,		BLTIN },
	{ "lshift",	FLSHIFT,	BLTIN },
	{ "match",	MATCHFCN,	MATCHFCN },
	{ "next",	NEXT,		NEXT },
	{ "nextfile",	NEXTFILE,	NEXTFILE },
	{ "or",		FFOR,		BLTIN },
	{ "print",	PRINT,		PRINT },
	{ "printf",	PRINTF,		PRINTF },
	{ "rand",	FRAND,		BLTIN },
	{ "return",	RETURN,		RETURN },
	{ "rshift",	FRSHIFT,	BLTIN },
	{ "sin",	FSIN,		BLTIN },
	{ "split",	SPLIT,		SPLIT },
	{ "sprintf",	SPRINTF,	SPRINTF },
	{ "sqrt",	FSQRT,		BLTIN },
	{ "srand",	FSRAND,		BLTIN },
	{ "sub",	SUB,		SUB },
	{ "substr",	SUBSTR,		SUBSTR },
	{ "system",	FSYSTEM,	BLTIN },
	{ "tolower",	FTOLOWER,	BLTIN },
	{ "toupper",	FTOUPPER,	BLTIN },
	{ "while",	WHILE,		WHILE },
	{ "xor",	FXOR,		BLTIN },
};
97 
/*
 * RET(x): return token x from the enclosing function, first printing its
 * name (via tokname) when the debug flag dbg is set.
 * Wrapped in do { } while (0) so that "RET(x);" is exactly one statement
 * and stays correct as the body of an unbraced if/else.
 * Note that x is evaluated twice when dbg is set; pass side-effect-free
 * expressions only.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
99 
/* Look at the next input character without consuming it. */
int peek(void)
{
	int next;

	next = input();
	unput(next);	/* hand it straight back so the next input() sees it again */
	return next;
}
106 
/*
 * gettok: read the next raw token from the input into *pbuf.
 *
 * pbuf, psz: address of the caller's buffer pointer and of its size.
 *	The buffer may be enlarged through adjbuf(); on the name and number
 *	paths the (possibly new) pointer and size are written back.  The
 *	buffer must already hold at least 2 bytes.
 *
 * Returns:
 *	0	at end of input;
 *	'a'	for a name ([A-Za-z_][A-Za-z0-9_]*), text in *pbuf;
 *	'0'	for a number, text in *pbuf;
 *	otherwise the single character read, which is also left in *pbuf
 *	as a one-character string.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Grow while there is still one spare byte: the loop
			 * can end at EOF without passing through this check
			 * again, and the terminator below must still fit.
			 */
			if (bp-buf >= sz-1)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same reasoning as above: always keep room for the NUL */
			if (bp-buf >= sz-1)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			buf[1] = 0;	/* return one character as token */
			retc = buf[0];	/* character is its own type */
			unputstr(rem+1); /* put rest back for later */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
169 
/* forward declarations for helpers defined later in this file */
int	word(char *);
int	string(void);
int	regexpr(void);
/* one-shot flags examined (and cleared) at the top of yylex() */
int	sc	= 0;	/* 1 => return a } right now */
int	reg	= 0;	/* 1 => return a REGEXPR now */
175 
176 int yylex(void)
177 {
178 	int c;
179 	static char *buf = NULL;
180 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
181 
182 	if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
183 		FATAL( "out of space in yylex" );
184 	if (sc) {
185 		sc = 0;
186 		RET('}');
187 	}
188 	if (reg) {
189 		reg = 0;
190 		return regexpr();
191 	}
192 	for (;;) {
193 		c = gettok(&buf, &bufsize);
194 		if (c == 0)
195 			return 0;
196 		if (isalpha(c) || c == '_')
197 			return word(buf);
198 		if (isdigit(c)) {
199 			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
200 			/* should this also have STR set? */
201 			RET(NUMBER);
202 		}
203 
204 		yylval.i = c;
205 		switch (c) {
206 		case '\n':	/* {EOL} */
207 			RET(NL);
208 		case '\r':	/* assume \n is coming */
209 		case ' ':	/* {WS}+ */
210 		case '\t':
211 			break;
212 		case '#':	/* #.* strip comments */
213 			while ((c = input()) != '\n' && c != 0)
214 				;
215 			unput(c);
216 			break;
217 		case ';':
218 			RET(';');
219 		case '\\':
220 			if (peek() == '\n') {
221 				input();
222 			} else if (peek() == '\r') {
223 				input(); input();	/* \n */
224 				lineno++;
225 			} else {
226 				RET(c);
227 			}
228 			break;
229 		case '&':
230 			if (peek() == '&') {
231 				input(); RET(AND);
232 			} else
233 				RET('&');
234 		case '|':
235 			if (peek() == '|') {
236 				input(); RET(BOR);
237 			} else
238 				RET('|');
239 		case '!':
240 			if (peek() == '=') {
241 				input(); yylval.i = NE; RET(NE);
242 			} else if (peek() == '~') {
243 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
244 			} else
245 				RET(NOT);
246 		case '~':
247 			yylval.i = MATCH;
248 			RET(MATCHOP);
249 		case '<':
250 			if (peek() == '=') {
251 				input(); yylval.i = LE; RET(LE);
252 			} else {
253 				yylval.i = LT; RET(LT);
254 			}
255 		case '=':
256 			if (peek() == '=') {
257 				input(); yylval.i = EQ; RET(EQ);
258 			} else {
259 				yylval.i = ASSIGN; RET(ASGNOP);
260 			}
261 		case '>':
262 			if (peek() == '=') {
263 				input(); yylval.i = GE; RET(GE);
264 			} else if (peek() == '>') {
265 				input(); yylval.i = APPEND; RET(APPEND);
266 			} else {
267 				yylval.i = GT; RET(GT);
268 			}
269 		case '+':
270 			if (peek() == '+') {
271 				input(); yylval.i = INCR; RET(INCR);
272 			} else if (peek() == '=') {
273 				input(); yylval.i = ADDEQ; RET(ASGNOP);
274 			} else
275 				RET('+');
276 		case '-':
277 			if (peek() == '-') {
278 				input(); yylval.i = DECR; RET(DECR);
279 			} else if (peek() == '=') {
280 				input(); yylval.i = SUBEQ; RET(ASGNOP);
281 			} else
282 				RET('-');
283 		case '*':
284 			if (peek() == '=') {	/* *= */
285 				input(); yylval.i = MULTEQ; RET(ASGNOP);
286 			} else if (peek() == '*') {	/* ** or **= */
287 				input();	/* eat 2nd * */
288 				if (peek() == '=') {
289 					input(); yylval.i = POWEQ; RET(ASGNOP);
290 				} else {
291 					RET(POWER);
292 				}
293 			} else
294 				RET('*');
295 		case '/':
296 			RET('/');
297 		case '%':
298 			if (peek() == '=') {
299 				input(); yylval.i = MODEQ; RET(ASGNOP);
300 			} else
301 				RET('%');
302 		case '^':
303 			if (peek() == '=') {
304 				input(); yylval.i = POWEQ; RET(ASGNOP);
305 			} else
306 				RET(POWER);
307 
308 		case '$':
309 			/* BUG: awkward, if not wrong */
310 			c = gettok(&buf, &bufsize);
311 			if (isalpha(c)) {
312 				if (strcmp(buf, "NF") == 0) {	/* very special */
313 					unputstr("(NF)");
314 					RET(INDIRECT);
315 				}
316 				c = peek();
317 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
318 					unputstr(buf);
319 					RET(INDIRECT);
320 				}
321 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
322 				RET(IVAR);
323 			} else if (c == 0) {	/*  */
324 				SYNTAX( "unexpected end of input after $" );
325 				RET(';');
326 			} else {
327 				unputstr(buf);
328 				RET(INDIRECT);
329 			}
330 
331 		case '}':
332 			if (--bracecnt < 0)
333 				SYNTAX( "extra }" );
334 			sc = 1;
335 			RET(';');
336 		case ']':
337 			if (--brackcnt < 0)
338 				SYNTAX( "extra ]" );
339 			RET(']');
340 		case ')':
341 			if (--parencnt < 0)
342 				SYNTAX( "extra )" );
343 			RET(')');
344 		case '{':
345 			bracecnt++;
346 			RET('{');
347 		case '[':
348 			brackcnt++;
349 			RET('[');
350 		case '(':
351 			parencnt++;
352 			RET('(');
353 
354 		case '"':
355 			return string();	/* BUG: should be like tran.c ? */
356 
357 		default:
358 			RET(c);
359 		}
360 	}
361 }
362 
363 int string(void)
364 {
365 	int c, n;
366 	char *s, *bp;
367 	static char *buf = NULL;
368 	static int bufsz = 500;
369 
370 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
371 		FATAL("out of space for strings");
372 	for (bp = buf; (c = input()) != '"'; ) {
373 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
374 			FATAL("out of space for string %.10s...", buf);
375 		switch (c) {
376 		case '\n':
377 		case '\r':
378 		case 0:
379 			SYNTAX( "non-terminated string %.10s...", buf );
380 			lineno++;
381 			if (c == 0)	/* hopeless */
382 				FATAL( "giving up" );
383 			break;
384 		case '\\':
385 			c = input();
386 			switch (c) {
387 			case '"': *bp++ = '"'; break;
388 			case 'n': *bp++ = '\n'; break;
389 			case 't': *bp++ = '\t'; break;
390 			case 'f': *bp++ = '\f'; break;
391 			case 'r': *bp++ = '\r'; break;
392 			case 'b': *bp++ = '\b'; break;
393 			case 'v': *bp++ = '\v'; break;
394 			case 'a': *bp++ = '\007'; break;
395 			case '\\': *bp++ = '\\'; break;
396 
397 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
398 			case '3': case '4': case '5': case '6': case '7':
399 				n = c - '0';
400 				if ((c = peek()) >= '0' && c < '8') {
401 					n = 8 * n + input() - '0';
402 					if ((c = peek()) >= '0' && c < '8')
403 						n = 8 * n + input() - '0';
404 				}
405 				*bp++ = n;
406 				break;
407 
408 			case 'x':	/* hex  \x0-9a-fA-F + */
409 			    {	char xbuf[100], *px;
410 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
411 					if (isdigit(c)
412 					 || (c >= 'a' && c <= 'f')
413 					 || (c >= 'A' && c <= 'F'))
414 						*px++ = c;
415 					else
416 						break;
417 				}
418 				*px = 0;
419 				unput(c);
420 	  			sscanf(xbuf, "%x", (unsigned int *) &n);
421 				*bp++ = n;
422 				break;
423 			    }
424 
425 			default:
426 				*bp++ = c;
427 				break;
428 			}
429 			break;
430 		default:
431 			*bp++ = c;
432 			break;
433 		}
434 	}
435 	*bp = 0;
436 	s = tostring(buf);
437 	*bp++ = ' '; *bp++ = 0;
438 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
439 	RET(STRING);
440 }
441 
442 
443 int binsearch(char *w, Keyword *kp, int n)
444 {
445 	int cond, low, mid, high;
446 
447 	low = 0;
448 	high = n - 1;
449 	while (low <= high) {
450 		mid = (low + high) / 2;
451 		if ((cond = strcmp(w, kp[mid].word)) < 0)
452 			high = mid - 1;
453 		else if (cond > 0)
454 			low = mid + 1;
455 		else
456 			return mid;
457 	}
458 	return -1;
459 }
460 
461 int word(char *w)
462 {
463 	Keyword *kp;
464 	int c, n;
465 
466 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
467 /* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */
468 	kp = keywords + n;
469 	if (n != -1) {	/* found in table */
470 		yylval.i = kp->sub;
471 		switch (kp->type) {	/* special handling */
472 		case BLTIN:
473 			if (kp->sub == FSYSTEM && safe)
474 				SYNTAX( "system is unsafe" );
475 			RET(kp->type);
476 		case FUNC:
477 			if (infunc)
478 				SYNTAX( "illegal nested function" );
479 			RET(kp->type);
480 		case RETURN:
481 			if (!infunc)
482 				SYNTAX( "return not in function" );
483 			RET(kp->type);
484 		case VARNF:
485 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
486 			RET(VARNF);
487 		default:
488 			RET(kp->type);
489 		}
490 	}
491 	c = peek();	/* look for '(' */
492 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
493 		yylval.i = n;
494 		RET(ARG);
495 	} else {
496 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
497 		if (c == '(') {
498 			RET(CALL);
499 		} else {
500 			RET(VAR);
501 		}
502 	}
503 }
504 
505 void startreg(void)	/* next call to yylex will return a regular expression */
506 {
507 	reg = 1;
508 }
509 
510 int regexpr(void)
511 {
512 	int c;
513 	static char *buf = NULL;
514 	static int bufsz = 500;
515 	char *bp;
516 
517 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
518 		FATAL("out of space for rex expr");
519 	bp = buf;
520 	for ( ; (c = input()) != '/' && c != 0; ) {
521 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
522 			FATAL("out of space for reg expr %.10s...", buf);
523 		if (c == '\n') {
524 			SYNTAX( "newline in regular expression %.10s...", buf );
525 			unput('\n');
526 			break;
527 		} else if (c == '\\') {
528 			*bp++ = '\\';
529 			*bp++ = input();
530 		} else {
531 			*bp++ = c;
532 		}
533 	}
534 	*bp = 0;
535 	if (c == 0)
536 		SYNTAX("non-terminated regular expression %.10s...", buf);
537 	yylval.s = tostring(buf);
538 	unput('/');
539 	RET(REGEXPR);
540 }
541 
542 /* low-level lexical stuff, sort of inherited from lex */
543 
char	ebuf[300];	/* circular record of characters handed out by input() */
char	*ep = ebuf;	/* next slot in ebuf; advanced by input(), stepped back by unput() */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the last pushed-back character; equals yysbuf when empty */
FILE	*yyin = NULL;	/* not referenced in this file; NOTE(review): presumably the program file read by pgetc() -- confirm */
549 
550 int input(void)	/* get next lexical input character */
551 {
552 	int c;
553 	extern char *lexprog;
554 
555 	if (yysptr > yysbuf)
556 		c = (uschar)*--yysptr;
557 	else if (lexprog != NULL) {	/* awk '...' */
558 		if ((c = (uschar)*lexprog) != 0)
559 			lexprog++;
560 	} else				/* awk -f ... */
561 		c = pgetc();
562 	if (c == '\n')
563 		lineno++;
564 	else if (c == EOF)
565 		c = 0;
566 	if (ep >= ebuf + sizeof ebuf)
567 		ep = ebuf;
568 	return *ep++ = c;
569 }
570 
571 void unput(int c)	/* put lexical character back on input */
572 {
573 	if (c == '\n')
574 		lineno--;
575 	if (yysptr >= yysbuf + sizeof(yysbuf))
576 		FATAL("pushed back too much: %.20s...", yysbuf);
577 	*yysptr++ = c;
578 	if (--ep < ebuf)
579 		ep = ebuf + sizeof(ebuf) - 1;
580 }
581 
/*
 * unputstr: push the whole string s back onto the input.
 * Characters are pushed last-to-first so that input() later returns
 * them in their original order.
 */
void unputstr(const char *s)
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
589