1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "ytab.h"
31 
extern YYSTYPE	yylval;	/* semantic value handed to the parser; filled in by yylex(), word(), string(), regexpr() */
extern int	infunc;	/* defined elsewhere; word() tests it for "return not in function" / "illegal nested function" */

int	lineno	= 1;	/* current line: input() adds 1 per '\n' read, unput() subtracts 1 per '\n' pushed back */
int	bracecnt = 0;	/* { seen minus } seen so far (maintained by yylex) */
int	brackcnt  = 0;	/* [ seen minus ] seen so far (maintained by yylex) */
int	parencnt = 0;	/* ( seen minus ) seen so far (maintained by yylex) */
39 
/* One entry of the reserved-word / built-in table that word() searches. */
typedef struct Keyword {
	const char *word;	/* spelling of the name; the key compared by binsearch() */
	int	sub;	/* value word() stores in yylval.i when the name matches */
	int	type;	/* token type word() switches on and returns for this name */
} Keyword;
45 
/*
 * Reserved words and built-in function names recognized by word().
 * Each row is { spelling, sub, type }: built-in functions share the
 * token type BLTIN and are told apart by sub; for most other entries
 * sub and type are the same token.
 * The rows must stay in strcmp() order (upper-case names sort before
 * lower-case ones) because binsearch() does a binary search on them.
 */
Keyword keywords[] ={	/* keep sorted: binary searched */
	{ "BEGIN",	XBEGIN,		XBEGIN },
	{ "END",	XEND,		XEND },
	{ "NF",		VARNF,		VARNF },
	{ "atan2",	FATAN,		BLTIN },
	{ "break",	BREAK,		BREAK },
	{ "close",	CLOSE,		CLOSE },
	{ "continue",	CONTINUE,	CONTINUE },
	{ "cos",	FCOS,		BLTIN },
	{ "delete",	DELETE,		DELETE },
	{ "do",		DO,		DO },
	{ "else",	ELSE,		ELSE },
	{ "exit",	EXIT,		EXIT },
	{ "exp",	FEXP,		BLTIN },
	{ "fflush",	FFLUSH,		BLTIN },
	{ "for",	FOR,		FOR },
	{ "func",	FUNC,		FUNC },
	{ "function",	FUNC,		FUNC },
	{ "getline",	GETLINE,	GETLINE },
	{ "gsub",	GSUB,		GSUB },
	{ "if",		IF,		IF },
	{ "in",		IN,		IN },
	{ "index",	INDEX,		INDEX },
	{ "int",	FINT,		BLTIN },
	{ "length",	FLENGTH,	BLTIN },
	{ "log",	FLOG,		BLTIN },
	{ "match",	MATCHFCN,	MATCHFCN },
	{ "next",	NEXT,		NEXT },
	{ "nextfile",	NEXTFILE,	NEXTFILE },
	{ "print",	PRINT,		PRINT },
	{ "printf",	PRINTF,		PRINTF },
	{ "rand",	FRAND,		BLTIN },
	{ "return",	RETURN,		RETURN },
	{ "sin",	FSIN,		BLTIN },
	{ "split",	SPLIT,		SPLIT },
	{ "sprintf",	SPRINTF,	SPRINTF },
	{ "sqrt",	FSQRT,		BLTIN },
	{ "srand",	FSRAND,		BLTIN },
	{ "sub",	SUB,		SUB },
	{ "substr",	SUBSTR,		SUBSTR },
	{ "system",	FSYSTEM,	BLTIN },
	{ "tolower",	FTOLOWER,	BLTIN },
	{ "toupper",	FTOUPPER,	BLTIN },
	{ "while",	WHILE,		WHILE },
};
91 
#define DEBUG
#ifdef	DEBUG
/*
 * RET(x): return token x from a lexer routine; when dbg is set, print
 * the token's name first.
 * The body is wrapped in do { } while (0) so that "RET(x);" is exactly
 * one statement and stays correct inside an unbraced if/else.
 * x is expanded twice, so it must not have side effects.
 */
#define	RET(x)	do { if(dbg)printf("lex %s\n", tokname(x)); return(x); } while (0)
#else
#define	RET(x)	return(x)
#endif
98 
/* peek: report the next input character without consuming it */
int peek(void)
{
	int next;

	next = input();
	unput(next);	/* push it straight back so the following input() sees it again */
	return next;
}
105 
/*
 * gettok: read the next raw token from the input into the buffer *pbuf.
 *
 * pbuf, psz: address of the caller's buffer pointer and of its size.
 *	The buffer may be enlarged through adjbuf(); the (possibly new)
 *	pointer and size are stored back before a name or number is
 *	returned.
 *
 * Returns:
 *	0	at end of input;
 *	'a'	for a name (letters, digits, '_'); its text is in the buffer;
 *	'0'	for a number; its text is in the buffer;
 *	otherwise the character itself, which is also left in the buffer
 *	as a one-character string.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	/* <ctype.h> functions require a value representable as unsigned char */
	if (!isalnum((unsigned char) c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha((unsigned char) c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/* keep room for this character AND the terminating NUL */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum((unsigned char) c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* it's a number */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* keep room for this character AND the terminating NUL */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit((unsigned char) c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * The first character is returned as the token, so only
			 * what follows it goes back on the input; pushing back
			 * all of buf would make that character appear twice.
			 */
			unputstr(rem+1);
			buf[1] = 0;	/* so return one character as token */
			retc = buf[0];	/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* so truncate where failure started */
			retc = '0';	/* number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
167 
/* helpers that yylex() hands off to; defined further down */
int	word(char *);
int	string(void);
int	regexpr(void);
/* one-shot flags tested and cleared at the top of yylex() */
int	sc	= 0;	/* 1 => return a } right now (set when yylex sees '}' and returns ';' first) */
int	reg	= 0;	/* 1 => return a REGEXPR now (set by startreg()) */
173 
yylex(void)174 int yylex(void)
175 {
176 	int c;
177 	static char *buf = 0;
178 	static int bufsize = 500;
179 
180 	if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
181 		FATAL( "out of space in yylex" );
182 	if (sc) {
183 		sc = 0;
184 		RET('}');
185 	}
186 	if (reg) {
187 		reg = 0;
188 		return regexpr();
189 	}
190 	for (;;) {
191 		c = gettok(&buf, &bufsize);
192 		if (c == 0)
193 			return 0;
194 		if (isalpha(c) || c == '_')
195 			return word(buf);
196 		if (isdigit(c)) {
197 			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
198 			/* should this also have STR set? */
199 			RET(NUMBER);
200 		}
201 
202 		yylval.i = c;
203 		switch (c) {
204 		case '\n':	/* {EOL} */
205 			RET(NL);
206 		case '\r':	/* assume \n is coming */
207 		case ' ':	/* {WS}+ */
208 		case '\t':
209 			break;
210 		case '#':	/* #.* strip comments */
211 			while ((c = input()) != '\n' && c != 0)
212 				;
213 			unput(c);
214 			break;
215 		case ';':
216 			RET(';');
217 		case '\\':
218 			if (peek() == '\n') {
219 				input();
220 			} else if (peek() == '\r') {
221 				input(); input();	/* \n */
222 				lineno++;
223 			} else {
224 				RET(c);
225 			}
226 			break;
227 		case '&':
228 			if (peek() == '&') {
229 				input(); RET(AND);
230 			} else
231 				RET('&');
232 		case '|':
233 			if (peek() == '|') {
234 				input(); RET(BOR);
235 			} else
236 				RET('|');
237 		case '!':
238 			if (peek() == '=') {
239 				input(); yylval.i = NE; RET(NE);
240 			} else if (peek() == '~') {
241 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
242 			} else
243 				RET(NOT);
244 		case '~':
245 			yylval.i = MATCH;
246 			RET(MATCHOP);
247 		case '<':
248 			if (peek() == '=') {
249 				input(); yylval.i = LE; RET(LE);
250 			} else {
251 				yylval.i = LT; RET(LT);
252 			}
253 		case '=':
254 			if (peek() == '=') {
255 				input(); yylval.i = EQ; RET(EQ);
256 			} else {
257 				yylval.i = ASSIGN; RET(ASGNOP);
258 			}
259 		case '>':
260 			if (peek() == '=') {
261 				input(); yylval.i = GE; RET(GE);
262 			} else if (peek() == '>') {
263 				input(); yylval.i = APPEND; RET(APPEND);
264 			} else {
265 				yylval.i = GT; RET(GT);
266 			}
267 		case '+':
268 			if (peek() == '+') {
269 				input(); yylval.i = INCR; RET(INCR);
270 			} else if (peek() == '=') {
271 				input(); yylval.i = ADDEQ; RET(ASGNOP);
272 			} else
273 				RET('+');
274 		case '-':
275 			if (peek() == '-') {
276 				input(); yylval.i = DECR; RET(DECR);
277 			} else if (peek() == '=') {
278 				input(); yylval.i = SUBEQ; RET(ASGNOP);
279 			} else
280 				RET('-');
281 		case '*':
282 			if (peek() == '=') {	/* *= */
283 				input(); yylval.i = MULTEQ; RET(ASGNOP);
284 			} else if (peek() == '*') {	/* ** or **= */
285 				input();	/* eat 2nd * */
286 				if (peek() == '=') {
287 					input(); yylval.i = POWEQ; RET(ASGNOP);
288 				} else {
289 					RET(POWER);
290 				}
291 			} else
292 				RET('*');
293 		case '/':
294 			RET('/');
295 		case '%':
296 			if (peek() == '=') {
297 				input(); yylval.i = MODEQ; RET(ASGNOP);
298 			} else
299 				RET('%');
300 		case '^':
301 			if (peek() == '=') {
302 				input(); yylval.i = POWEQ; RET(ASGNOP);
303 			} else
304 				RET(POWER);
305 
306 		case '$':
307 			/* BUG: awkward, if not wrong */
308 			c = gettok(&buf, &bufsize);
309 			if (isalpha(c)) {
310 				if (strcmp(buf, "NF") == 0) {	/* very special */
311 					unputstr("(NF)");
312 					RET(INDIRECT);
313 				}
314 				c = peek();
315 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
316 					unputstr(buf);
317 					RET(INDIRECT);
318 				}
319 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
320 				RET(IVAR);
321 			} else if (c == 0) {	/*  */
322 				SYNTAX( "unexpected end of input after $" );
323 				RET(';');
324 			} else {
325 				unputstr(buf);
326 				RET(INDIRECT);
327 			}
328 
329 		case '}':
330 			if (--bracecnt < 0)
331 				SYNTAX( "extra }" );
332 			sc = 1;
333 			RET(';');
334 		case ']':
335 			if (--brackcnt < 0)
336 				SYNTAX( "extra ]" );
337 			RET(']');
338 		case ')':
339 			if (--parencnt < 0)
340 				SYNTAX( "extra )" );
341 			RET(')');
342 		case '{':
343 			bracecnt++;
344 			RET('{');
345 		case '[':
346 			brackcnt++;
347 			RET('[');
348 		case '(':
349 			parencnt++;
350 			RET('(');
351 
352 		case '"':
353 			return string();	/* BUG: should be like tran.c ? */
354 
355 		default:
356 			RET(c);
357 		}
358 	}
359 }
360 
string(void)361 int string(void)
362 {
363 	int c, n;
364 	char *s, *bp;
365 	static char *buf = 0;
366 	static int bufsz = 500;
367 
368 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
369 		FATAL("out of space for strings");
370 	for (bp = buf; (c = input()) != '"'; ) {
371 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
372 			FATAL("out of space for string %.10s...", buf);
373 		switch (c) {
374 		case '\n':
375 		case '\r':
376 		case 0:
377 			SYNTAX( "non-terminated string %.10s...", buf );
378 			lineno++;
379 			if (c == 0)	/* hopeless */
380 				FATAL( "giving up" );
381 			break;
382 		case '\\':
383 			c = input();
384 			switch (c) {
385 			case '"': *bp++ = '"'; break;
386 			case 'n': *bp++ = '\n'; break;
387 			case 't': *bp++ = '\t'; break;
388 			case 'f': *bp++ = '\f'; break;
389 			case 'r': *bp++ = '\r'; break;
390 			case 'b': *bp++ = '\b'; break;
391 			case 'v': *bp++ = '\v'; break;
392 			case 'a': *bp++ = '\007'; break;
393 			case '\\': *bp++ = '\\'; break;
394 
395 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
396 			case '3': case '4': case '5': case '6': case '7':
397 				n = c - '0';
398 				if ((c = peek()) >= '0' && c < '8') {
399 					n = 8 * n + input() - '0';
400 					if ((c = peek()) >= '0' && c < '8')
401 						n = 8 * n + input() - '0';
402 				}
403 				*bp++ = n;
404 				break;
405 
406 			case 'x':	/* hex  \x0-9a-fA-F + */
407 			    {	char xbuf[100], *px;
408 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
409 					if (isdigit(c)
410 					 || (c >= 'a' && c <= 'f')
411 					 || (c >= 'A' && c <= 'F'))
412 						*px++ = c;
413 					else
414 						break;
415 				}
416 				*px = 0;
417 				unput(c);
418 	  			sscanf(xbuf, "%x", &n);
419 				*bp++ = n;
420 				break;
421 			    }
422 
423 			default:
424 				*bp++ = c;
425 				break;
426 			}
427 			break;
428 		default:
429 			*bp++ = c;
430 			break;
431 		}
432 	}
433 	*bp = 0;
434 	s = tostring(buf);
435 	*bp++ = ' '; *bp++ = 0;
436 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
437 	RET(STRING);
438 }
439 
440 
/*
 * binsearch: look for the name w in the table kp[0..n-1], which must be
 * sorted by strcmp() order of its word fields.
 * Returns the index of the matching entry, or -1 when there is none.
 */
int binsearch(char *w, Keyword *kp, int n)
{
	int lo = 0;
	int hi = n - 1;

	while (lo <= hi) {
		int probe = (lo + hi) / 2;
		int cmp = strcmp(w, kp[probe].word);

		if (cmp == 0)
			return probe;
		if (cmp < 0)
			hi = probe - 1;	/* w sorts before the probe: keep the lower half */
		else
			lo = probe + 1;	/* w sorts after the probe: keep the upper half */
	}
	return -1;
}
458 
word(char * w)459 int word(char *w)
460 {
461 	Keyword *kp;
462 	int c, n;
463 
464 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
465 	kp = keywords + n;
466 	if (n != -1) {	/* found in table */
467 		yylval.i = kp->sub;
468 		switch (kp->type) {	/* special handling */
469 		case FSYSTEM:
470 			if (safe)
471 				SYNTAX( "system is unsafe" );
472 			RET(kp->type);
473 		case FUNC:
474 			if (infunc)
475 				SYNTAX( "illegal nested function" );
476 			RET(kp->type);
477 		case RETURN:
478 			if (!infunc)
479 				SYNTAX( "return not in function" );
480 			RET(kp->type);
481 		case VARNF:
482 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
483 			RET(VARNF);
484 		default:
485 			RET(kp->type);
486 		}
487 	}
488 	c = peek();	/* look for '(' */
489 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
490 		yylval.i = n;
491 		RET(ARG);
492 	} else {
493 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
494 		if (c == '(') {
495 			RET(CALL);
496 		} else {
497 			RET(VAR);
498 		}
499 	}
500 }
501 
startreg(void)502 void startreg(void)	/* next call to yylex will return a regular expression */
503 {
504 	reg = 1;
505 }
506 
regexpr(void)507 int regexpr(void)
508 {
509 	int c;
510 	static char *buf = 0;
511 	static int bufsz = 500;
512 	char *bp;
513 
514 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
515 		FATAL("out of space for rex expr");
516 	bp = buf;
517 	for ( ; (c = input()) != '/' && c != 0; ) {
518 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
519 			FATAL("out of space for reg expr %.10s...", buf);
520 		if (c == '\n') {
521 			SYNTAX( "newline in regular expression %.10s...", buf );
522 			unput('\n');
523 			break;
524 		} else if (c == '\\') {
525 			*bp++ = '\\';
526 			*bp++ = input();
527 		} else {
528 			*bp++ = c;
529 		}
530 	}
531 	*bp = 0;
532 	yylval.s = tostring(buf);
533 	unput('/');
534 	RET(REGEXPR);
535 }
536 
537 /* low-level lexical stuff, sort of inherited from lex */
538 
char	ebuf[300];	/* circular record of the characters most recently returned by input() */
char	*ep = ebuf;	/* next slot in ebuf: input() stores and advances (wrapping), unput() steps back */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the last pushed-back character; equal to yysbuf when empty */
FILE	*yyin = 0;	/* not referenced by input()/unput() below */
544 
input(void)545 int input(void)	/* get next lexical input character */
546 {
547 	int c;
548 	extern char *lexprog;
549 
550 	if (yysptr > yysbuf)
551 		c = *--yysptr;
552 	else if (lexprog != NULL) {	/* awk '...' */
553 		if ((c = *lexprog) != 0)
554 			lexprog++;
555 	} else				/* awk -f ... */
556 		c = pgetc();
557 	if (c == '\n')
558 		lineno++;
559 	else if (c == EOF)
560 		c = 0;
561 	if (ep >= ebuf + sizeof ebuf)
562 		ep = ebuf;
563 	return *ep++ = c;
564 }
565 
unput(int c)566 void unput(int c)	/* put lexical character back on input */
567 {
568 	if (c == '\n')
569 		lineno--;
570 	if (yysptr >= yysbuf + sizeof(yysbuf))
571 		FATAL("pushed back too much: %.20s...", yysbuf);
572 	*yysptr++ = c;
573 	if (--ep < ebuf)
574 		ep = ebuf + sizeof(ebuf) - 1;
575 }
576 
/*
 * unputstr: push the whole string s back on the input.
 * Characters are pushed last-to-first, so input() then returns them in
 * their original order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	int i = strlen(s);

	while (--i >= 0)
		unput(s[i]);
}
584