xref: /dragonfly/contrib/awk/lex.c (revision 65030a6a)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "ytab.h"
31 
32 extern YYSTYPE	yylval;
33 extern bool	infunc;
34 
35 int	lineno	= 1;
36 int	bracecnt = 0;
37 int	brackcnt  = 0;
38 int	parencnt = 0;
39 
/* One row of the reserved-word table that word() searches. */
struct Keyword {
	const char *word;	/* spelling of the reserved word (search key) */
	int	sub;		/* value word() stores in yylval.i on a match */
	int	type;		/* token code word() returns on a match */
};
typedef struct Keyword Keyword;
45 
46 const Keyword keywords[] = {	/* keep sorted: binary searched */
47 	{ "BEGIN",	XBEGIN,		XBEGIN },
48 	{ "END",	XEND,		XEND },
49 	{ "NF",		VARNF,		VARNF },
50 	{ "atan2",	FATAN,		BLTIN },
51 	{ "break",	BREAK,		BREAK },
52 	{ "close",	CLOSE,		CLOSE },
53 	{ "continue",	CONTINUE,	CONTINUE },
54 	{ "cos",	FCOS,		BLTIN },
55 	{ "delete",	DELETE,		DELETE },
56 	{ "do",		DO,		DO },
57 	{ "else",	ELSE,		ELSE },
58 	{ "exit",	EXIT,		EXIT },
59 	{ "exp",	FEXP,		BLTIN },
60 	{ "fflush",	FFLUSH,		BLTIN },
61 	{ "for",	FOR,		FOR },
62 	{ "func",	FUNC,		FUNC },
63 	{ "function",	FUNC,		FUNC },
64 	{ "getline",	GETLINE,	GETLINE },
65 	{ "gsub",	GSUB,		GSUB },
66 	{ "if",		IF,		IF },
67 	{ "in",		IN,		IN },
68 	{ "index",	INDEX,		INDEX },
69 	{ "int",	FINT,		BLTIN },
70 	{ "length",	FLENGTH,	BLTIN },
71 	{ "log",	FLOG,		BLTIN },
72 	{ "match",	MATCHFCN,	MATCHFCN },
73 	{ "next",	NEXT,		NEXT },
74 	{ "nextfile",	NEXTFILE,	NEXTFILE },
75 	{ "print",	PRINT,		PRINT },
76 	{ "printf",	PRINTF,		PRINTF },
77 	{ "rand",	FRAND,		BLTIN },
78 	{ "return",	RETURN,		RETURN },
79 	{ "sin",	FSIN,		BLTIN },
80 	{ "split",	SPLIT,		SPLIT },
81 	{ "sprintf",	SPRINTF,	SPRINTF },
82 	{ "sqrt",	FSQRT,		BLTIN },
83 	{ "srand",	FSRAND,		BLTIN },
84 	{ "sub",	SUB,		SUB },
85 	{ "substr",	SUBSTR,		SUBSTR },
86 	{ "system",	FSYSTEM,	BLTIN },
87 	{ "tolower",	FTOLOWER,	BLTIN },
88 	{ "toupper",	FTOUPPER,	BLTIN },
89 	{ "while",	WHILE,		WHILE },
90 };
91 
/*
 * RET(x): return token x from the enclosing function; when dbg is set,
 * first print the token's name (as given by tokname) on stdout.
 * Wrapped in do { } while (0) so that "RET(x);" is exactly one statement
 * and stays correct in an unbraced if/else.  x is evaluated more than
 * once when dbg is set, so pass only side-effect-free expressions.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
93 
/*
 * peek: return the next input character without consuming it.
 * Implemented as input() followed by unput(), so it has whatever side
 * effects those two calls have.
 */
static int peek(void)
{
	int next;

	next = input();
	unput(next);	/* hand it straight back */
	return next;
}
100 
/*
 * gettok: read the next raw token into the buffer *pbuf (capacity *psz).
 *
 * The buffer is grown with adjbuf() as needed; on return from the name
 * or number paths *pbuf and *psz are updated to the (possibly moved)
 * buffer.  The caller must supply a buffer of at least 2 bytes, because
 * the first character and a terminator are stored before any size check.
 *
 * Returns:
 *   0     at end of input;
 *   c     the character itself, for any single character that is not
 *         alphanumeric, '.' or '_' (buf then holds just that character);
 *   'a'   for a name ([A-Za-z_][A-Za-z0-9_]*), left in buf;
 *   '0'   for a number, left in buf;
 *   buf[0] when a run starting with a digit or '.' has no numeric
 *         prefix; only that first character is kept and the rest is
 *         pushed back.
 */
static int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Keep room for this character AND the terminator:
			 * the loop can end (input exhausted) right after a
			 * store, and *bp = 0 below must still be in bounds.
			 */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same reservation as above: character plus terminator */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			buf[1] = 0;	/* return one character as token */
			retc = buf[0];	/* character is its own type */
			unputstr(rem+1); /* put rest back for later */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
163 
/* Token builders defined later in this file. */
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot flags tested and cleared at the top of yylex(). */
bool	reg	= false;	/* true => return a REGEXPR now */
bool	sc	= false;	/* true => return a } right now */
169 
170 int yylex(void)
171 {
172 	int c;
173 	static char *buf = NULL;
174 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
175 
176 	if (buf == NULL && (buf = malloc(bufsize)) == NULL)
177 		FATAL( "out of space in yylex" );
178 	if (sc) {
179 		sc = false;
180 		RET('}');
181 	}
182 	if (reg) {
183 		reg = false;
184 		return regexpr();
185 	}
186 	for (;;) {
187 		c = gettok(&buf, &bufsize);
188 		if (c == 0)
189 			return 0;
190 		if (isalpha(c) || c == '_')
191 			return word(buf);
192 		if (isdigit(c)) {
193 			char *cp = tostring(buf);
194 			yylval.cp = setsymtab(buf, cp, atof(buf), CON|NUM, symtab);
195 			free(cp);
196 			/* should this also have STR set? */
197 			RET(NUMBER);
198 		}
199 
200 		yylval.i = c;
201 		switch (c) {
202 		case '\n':	/* {EOL} */
203 			lineno++;
204 			RET(NL);
205 		case '\r':	/* assume \n is coming */
206 		case ' ':	/* {WS}+ */
207 		case '\t':
208 			break;
209 		case '#':	/* #.* strip comments */
210 			while ((c = input()) != '\n' && c != 0)
211 				;
212 			unput(c);
213 			/*
214 			 * Next line is a hack, itcompensates for
215 			 * unput's treatment of \n.
216 			 */
217 			lineno++;
218 			break;
219 		case ';':
220 			RET(';');
221 		case '\\':
222 			if (peek() == '\n') {
223 				input();
224 				lineno++;
225 			} else if (peek() == '\r') {
226 				input(); input();	/* \n */
227 				lineno++;
228 			} else {
229 				RET(c);
230 			}
231 			break;
232 		case '&':
233 			if (peek() == '&') {
234 				input(); RET(AND);
235 			} else
236 				RET('&');
237 		case '|':
238 			if (peek() == '|') {
239 				input(); RET(BOR);
240 			} else
241 				RET('|');
242 		case '!':
243 			if (peek() == '=') {
244 				input(); yylval.i = NE; RET(NE);
245 			} else if (peek() == '~') {
246 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
247 			} else
248 				RET(NOT);
249 		case '~':
250 			yylval.i = MATCH;
251 			RET(MATCHOP);
252 		case '<':
253 			if (peek() == '=') {
254 				input(); yylval.i = LE; RET(LE);
255 			} else {
256 				yylval.i = LT; RET(LT);
257 			}
258 		case '=':
259 			if (peek() == '=') {
260 				input(); yylval.i = EQ; RET(EQ);
261 			} else {
262 				yylval.i = ASSIGN; RET(ASGNOP);
263 			}
264 		case '>':
265 			if (peek() == '=') {
266 				input(); yylval.i = GE; RET(GE);
267 			} else if (peek() == '>') {
268 				input(); yylval.i = APPEND; RET(APPEND);
269 			} else {
270 				yylval.i = GT; RET(GT);
271 			}
272 		case '+':
273 			if (peek() == '+') {
274 				input(); yylval.i = INCR; RET(INCR);
275 			} else if (peek() == '=') {
276 				input(); yylval.i = ADDEQ; RET(ASGNOP);
277 			} else
278 				RET('+');
279 		case '-':
280 			if (peek() == '-') {
281 				input(); yylval.i = DECR; RET(DECR);
282 			} else if (peek() == '=') {
283 				input(); yylval.i = SUBEQ; RET(ASGNOP);
284 			} else
285 				RET('-');
286 		case '*':
287 			if (peek() == '=') {	/* *= */
288 				input(); yylval.i = MULTEQ; RET(ASGNOP);
289 			} else if (peek() == '*') {	/* ** or **= */
290 				input();	/* eat 2nd * */
291 				if (peek() == '=') {
292 					input(); yylval.i = POWEQ; RET(ASGNOP);
293 				} else {
294 					RET(POWER);
295 				}
296 			} else
297 				RET('*');
298 		case '/':
299 			RET('/');
300 		case '%':
301 			if (peek() == '=') {
302 				input(); yylval.i = MODEQ; RET(ASGNOP);
303 			} else
304 				RET('%');
305 		case '^':
306 			if (peek() == '=') {
307 				input(); yylval.i = POWEQ; RET(ASGNOP);
308 			} else
309 				RET(POWER);
310 
311 		case '$':
312 			/* BUG: awkward, if not wrong */
313 			c = gettok(&buf, &bufsize);
314 			if (isalpha(c)) {
315 				if (strcmp(buf, "NF") == 0) {	/* very special */
316 					unputstr("(NF)");
317 					RET(INDIRECT);
318 				}
319 				c = peek();
320 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
321 					unputstr(buf);
322 					RET(INDIRECT);
323 				}
324 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
325 				RET(IVAR);
326 			} else if (c == 0) {	/*  */
327 				SYNTAX( "unexpected end of input after $" );
328 				RET(';');
329 			} else {
330 				unputstr(buf);
331 				RET(INDIRECT);
332 			}
333 
334 		case '}':
335 			if (--bracecnt < 0)
336 				SYNTAX( "extra }" );
337 			sc = true;
338 			RET(';');
339 		case ']':
340 			if (--brackcnt < 0)
341 				SYNTAX( "extra ]" );
342 			RET(']');
343 		case ')':
344 			if (--parencnt < 0)
345 				SYNTAX( "extra )" );
346 			RET(')');
347 		case '{':
348 			bracecnt++;
349 			RET('{');
350 		case '[':
351 			brackcnt++;
352 			RET('[');
353 		case '(':
354 			parencnt++;
355 			RET('(');
356 
357 		case '"':
358 			return string();	/* BUG: should be like tran.c ? */
359 
360 		default:
361 			RET(c);
362 		}
363 	}
364 }
365 
366 int string(void)
367 {
368 	int c, n;
369 	char *s, *bp;
370 	static char *buf = NULL;
371 	static int bufsz = 500;
372 
373 	if (buf == NULL && (buf = malloc(bufsz)) == NULL)
374 		FATAL("out of space for strings");
375 	for (bp = buf; (c = input()) != '"'; ) {
376 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
377 			FATAL("out of space for string %.10s...", buf);
378 		switch (c) {
379 		case '\n':
380 		case '\r':
381 		case 0:
382 			*bp = '\0';
383 			SYNTAX( "non-terminated string %.10s...", buf );
384 			if (c == 0)	/* hopeless */
385 				FATAL( "giving up" );
386 			lineno++;
387 			break;
388 		case '\\':
389 			c = input();
390 			switch (c) {
391 			case '\n': break;
392 			case '"': *bp++ = '"'; break;
393 			case 'n': *bp++ = '\n'; break;
394 			case 't': *bp++ = '\t'; break;
395 			case 'f': *bp++ = '\f'; break;
396 			case 'r': *bp++ = '\r'; break;
397 			case 'b': *bp++ = '\b'; break;
398 			case 'v': *bp++ = '\v'; break;
399 			case 'a': *bp++ = '\a'; break;
400 			case '\\': *bp++ = '\\'; break;
401 
402 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
403 			case '3': case '4': case '5': case '6': case '7':
404 				n = c - '0';
405 				if ((c = peek()) >= '0' && c < '8') {
406 					n = 8 * n + input() - '0';
407 					if ((c = peek()) >= '0' && c < '8')
408 						n = 8 * n + input() - '0';
409 				}
410 				*bp++ = n;
411 				break;
412 
413 			case 'x':	/* hex  \x0-9a-fA-F + */
414 			    {	char xbuf[100], *px;
415 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
416 					if (isdigit(c)
417 					 || (c >= 'a' && c <= 'f')
418 					 || (c >= 'A' && c <= 'F'))
419 						*px++ = c;
420 					else
421 						break;
422 				}
423 				*px = 0;
424 				unput(c);
425 	  			sscanf(xbuf, "%x", (unsigned int *) &n);
426 				*bp++ = n;
427 				break;
428 			    }
429 
430 			default:
431 				*bp++ = c;
432 				break;
433 			}
434 			break;
435 		default:
436 			*bp++ = c;
437 			break;
438 		}
439 	}
440 	*bp = 0;
441 	s = tostring(buf);
442 	*bp++ = ' '; *bp++ = '\0';
443 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
444 	free(s);
445 	RET(STRING);
446 }
447 
448 
449 static int binsearch(char *w, const Keyword *kp, int n)
450 {
451 	int cond, low, mid, high;
452 
453 	low = 0;
454 	high = n - 1;
455 	while (low <= high) {
456 		mid = (low + high) / 2;
457 		if ((cond = strcmp(w, kp[mid].word)) < 0)
458 			high = mid - 1;
459 		else if (cond > 0)
460 			low = mid + 1;
461 		else
462 			return mid;
463 	}
464 	return -1;
465 }
466 
467 int word(char *w)
468 {
469 	const Keyword *kp;
470 	int c, n;
471 
472 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
473 	if (n != -1) {	/* found in table */
474 		kp = keywords + n;
475 		yylval.i = kp->sub;
476 		switch (kp->type) {	/* special handling */
477 		case BLTIN:
478 			if (kp->sub == FSYSTEM && safe)
479 				SYNTAX( "system is unsafe" );
480 			RET(kp->type);
481 		case FUNC:
482 			if (infunc)
483 				SYNTAX( "illegal nested function" );
484 			RET(kp->type);
485 		case RETURN:
486 			if (!infunc)
487 				SYNTAX( "return not in function" );
488 			RET(kp->type);
489 		case VARNF:
490 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
491 			RET(VARNF);
492 		default:
493 			RET(kp->type);
494 		}
495 	}
496 	c = peek();	/* look for '(' */
497 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
498 		yylval.i = n;
499 		RET(ARG);
500 	} else {
501 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
502 		if (c == '(') {
503 			RET(CALL);
504 		} else {
505 			RET(VAR);
506 		}
507 	}
508 }
509 
510 void startreg(void)	/* next call to yylex will return a regular expression */
511 {
512 	reg = true;
513 }
514 
515 int regexpr(void)
516 {
517 	int c;
518 	static char *buf = NULL;
519 	static int bufsz = 500;
520 	char *bp;
521 
522 	if (buf == NULL && (buf = malloc(bufsz)) == NULL)
523 		FATAL("out of space for rex expr");
524 	bp = buf;
525 	for ( ; (c = input()) != '/' && c != 0; ) {
526 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
527 			FATAL("out of space for reg expr %.10s...", buf);
528 		if (c == '\n') {
529 			*bp = '\0';
530 			SYNTAX( "newline in regular expression %.10s...", buf );
531 			unput('\n');
532 			break;
533 		} else if (c == '\\') {
534 			*bp++ = '\\';
535 			*bp++ = input();
536 		} else {
537 			*bp++ = c;
538 		}
539 	}
540 	*bp = 0;
541 	if (c == 0)
542 		SYNTAX("non-terminated regular expression %.10s...", buf);
543 	yylval.s = tostring(buf);
544 	unput('/');
545 	RET(REGEXPR);
546 }
547 
548 /* low-level lexical stuff, sort of inherited from lex */
549 
/*
 * Record of recently read characters: input() stores each character at
 * ep and wraps to the start when it reaches the end of ebuf; unput()
 * steps ep back, wrapping the other way.
 */
char	ebuf[300];
char	*ep = ebuf;

/*
 * Pushback stack: unput() stores at yysptr and advances it; input()
 * pops from it before reading any new input.
 */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;

FILE	*yyin = NULL;	/* not referenced elsewhere in this file */
555 
556 int input(void)	/* get next lexical input character */
557 {
558 	int c;
559 	extern char *lexprog;
560 
561 	if (yysptr > yysbuf)
562 		c = (uschar)*--yysptr;
563 	else if (lexprog != NULL) {	/* awk '...' */
564 		if ((c = (uschar)*lexprog) != 0)
565 			lexprog++;
566 	} else				/* awk -f ... */
567 		c = pgetc();
568 	if (c == EOF)
569 		c = 0;
570 	if (ep >= ebuf + sizeof ebuf)
571 		ep = ebuf;
572 	*ep = c;
573 	if (c != 0) {
574 		ep++;
575 	}
576 	return (c);
577 }
578 
579 void unput(int c)	/* put lexical character back on input */
580 {
581 	if (c == '\n')
582 		lineno--;
583 	if (yysptr >= yysbuf + sizeof(yysbuf))
584 		FATAL("pushed back too much: %.20s...", yysbuf);
585 	*yysptr++ = c;
586 	if (--ep < ebuf)
587 		ep = ebuf + sizeof(ebuf) - 1;
588 }
589 
/*
 * unputstr: push the whole string s back on the input.  The characters
 * are pushed last-to-first so that they are read back in their
 * original order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	int pos = strlen(s);

	while (--pos >= 0)
		unput(s[pos]);
}
597