xref: /openbsd/usr.bin/awk/lex.c (revision d485f761)
1 /*	$OpenBSD: lex.c,v 1.5 2001/09/08 00:12:40 millert Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <ctype.h>
30 #include "awk.h"
31 #include "ytab.h"
32 
/* Objects defined elsewhere that the lexer reads or writes. */
extern YYSTYPE	yylval;	/* value attached to the token being returned; set by yylex() and helpers */
extern int	infunc;	/* tested (non-zero) by yylex()/word() before consulting isarg() */

/* Lexer position and nesting bookkeeping. */
int	lineno	= 1;	/* incremented by input() on '\n', decremented by unput() */
int	bracecnt = 0;	/* '{' seen minus '}' seen; yylex() reports "extra }" if it goes negative */
int	brackcnt  = 0;	/* '[' seen minus ']' seen; yylex() reports "extra ]" if it goes negative */
int	parencnt = 0;	/* '(' seen minus ')' seen; yylex() reports "extra )" if it goes negative */
40 
/*
 * One entry of the reserved-word table searched by word().
 *   word - spelling of the keyword or built-in function name
 *   sub  - value word() stores in yylval.i when the entry matches
 *   type - token type word() returns to the parser for this entry
 */
typedef struct Keyword {
	char	*word;
	int	sub;
	int	type;
} Keyword;
46 
/*
 * Reserved words and built-in function names.
 * binsearch() compares with strcmp(), so entries must stay in strcmp()
 * order (upper-case names sort before lower-case ones).
 * Built-in functions share the token type BLTIN and are told apart by
 * their `sub' value (FATAN, FCOS, ...).
 */
Keyword keywords[] ={	/* keep sorted: binary searched */
	{ "BEGIN",	XBEGIN,		XBEGIN },
	{ "END",	XEND,		XEND },
	{ "NF",		VARNF,		VARNF },
	{ "atan2",	FATAN,		BLTIN },
	{ "break",	BREAK,		BREAK },
	{ "close",	CLOSE,		CLOSE },
	{ "continue",	CONTINUE,	CONTINUE },
	{ "cos",	FCOS,		BLTIN },
	{ "delete",	DELETE,		DELETE },
	{ "do",		DO,		DO },
	{ "else",	ELSE,		ELSE },
	{ "exit",	EXIT,		EXIT },
	{ "exp",	FEXP,		BLTIN },
	{ "fflush",	FFLUSH,		BLTIN },
	{ "for",	FOR,		FOR },
	{ "func",	FUNC,		FUNC },
	{ "function",	FUNC,		FUNC },
	{ "getline",	GETLINE,	GETLINE },
	{ "gsub",	GSUB,		GSUB },
	{ "if",		IF,		IF },
	{ "in",		IN,		IN },
	{ "index",	INDEX,		INDEX },
	{ "int",	FINT,		BLTIN },
	{ "length",	FLENGTH,	BLTIN },
	{ "log",	FLOG,		BLTIN },
	{ "match",	MATCHFCN,	MATCHFCN },
	{ "next",	NEXT,		NEXT },
	{ "nextfile",	NEXTFILE,	NEXTFILE },
	{ "print",	PRINT,		PRINT },
	{ "printf",	PRINTF,		PRINTF },
	{ "rand",	FRAND,		BLTIN },
	{ "return",	RETURN,		RETURN },
	{ "sin",	FSIN,		BLTIN },
	{ "split",	SPLIT,		SPLIT },
	{ "sprintf",	SPRINTF,	SPRINTF },
	{ "sqrt",	FSQRT,		BLTIN },
	{ "srand",	FSRAND,		BLTIN },
	{ "sub",	SUB,		SUB },
	{ "substr",	SUBSTR,		SUBSTR },
	{ "system",	FSYSTEM,	BLTIN },
	{ "tolower",	FTOLOWER,	BLTIN },
	{ "toupper",	FTOUPPER,	BLTIN },
	{ "while",	WHILE,		WHILE },
};
92 
/*
 * RET(x): return token x from the lexer.
 * With DEBUG defined (it always is here) the token name is printed first
 * when the run-time flag `dbg' is set.
 * The debug form is wrapped in do { } while (0) so that `RET(x);' is a
 * single statement and stays valid in an unbraced if/else arm.
 * Note that x is evaluated twice in the debug form; pass only
 * side-effect-free expressions.
 */
#define DEBUG
#ifdef	DEBUG
#define	RET(x)	do { if(dbg)printf("lex %s\n", tokname(x)); return(x); } while (0)
#else
#define	RET(x)	return(x)
#endif
99 
/*
 * peek: return the next input character without consuming it.
 * The character is read with input() and pushed straight back with unput().
 */
int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
106 
/*
 * gettok: read the next raw token into the caller's growable buffer.
 *
 *   pbuf, psz - address of the buffer pointer and of its size; both are
 *               written back because adjbuf() may move and enlarge it.
 *
 * Returns 0 at end of input.  For a character that cannot start a name
 * or a number, that character is returned and buf holds it as a
 * one-character string.  Otherwise buf holds the name or number text and
 * its first character is returned.
 *
 * For numbers the scan is greedy (digits, 'e', 'E', '.', '+', '-');
 * strtod() then decides how much is really a number and the unused tail
 * is pushed back on the input.
 *
 * NOTE(review): if strtod() consumes nothing (e.g. a lone "."), the whole
 * text is pushed back, buf becomes empty and 0 is returned, which the
 * caller cannot tell from end of input.  Fixing that needs a matching
 * change in yylex(), so it is only flagged here.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	/* input() can hand back a negative value for bytes >= 0x80; <ctype.h> needs unsigned char */
	if (!isalnum((unsigned char)c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha((unsigned char)c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/* keep room for this character AND the terminating NUL */
			if (bp-buf >= sz-1)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum((unsigned char)c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
	} else {	/* it's a number */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* keep room for this character AND the terminating NUL */
			if (bp-buf >= sz-1)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit((unsigned char)c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		unputstr(rem);		/* put rest back for later */
		rem[0] = 0;		/* truncate buf after the numeric part */
	}
	*pbuf = buf;
	*psz = sz;
	return buf[0];
}
161 
/* Helpers defined later in this file and the one-shot flags yylex() consults. */
int	word(char *);	/* classify an identifier: keyword, ARG, CALL or VAR */
int	string(void);	/* scan the rest of a "..." literal */
int	regexpr(void);	/* scan the body of a /.../ regular expression */
int	sc	= 0;	/* 1 => return a } right now */
int	reg	= 0;	/* 1 => return a REGEXPR now */
167 
168 int yylex(void)
169 {
170 	int c;
171 	static char *buf = 0;
172 	static int bufsize = 500;
173 
174 	if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
175 		FATAL( "out of space in yylex" );
176 	if (sc) {
177 		sc = 0;
178 		RET('}');
179 	}
180 	if (reg) {
181 		reg = 0;
182 		return regexpr();
183 	}
184 	for (;;) {
185 		c = gettok(&buf, &bufsize);
186 		if (c == 0)
187 			return 0;
188 		if (isalpha(c) || c == '_')
189 			return word(buf);
190 		if (isdigit(c) || c == '.') {
191 			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
192 			/* should this also have STR set? */
193 			RET(NUMBER);
194 		}
195 
196 		yylval.i = c;
197 		switch (c) {
198 		case '\n':	/* {EOL} */
199 			RET(NL);
200 		case '\r':	/* assume \n is coming */
201 		case ' ':	/* {WS}+ */
202 		case '\t':
203 			break;
204 		case '#':	/* #.* strip comments */
205 			while ((c = input()) != '\n' && c != 0)
206 				;
207 			unput(c);
208 			break;
209 		case ';':
210 			RET(';');
211 		case '\\':
212 			if (peek() == '\n') {
213 				input();
214 			} else if (peek() == '\r') {
215 				input(); input();	/* \n */
216 				lineno++;
217 			} else {
218 				RET(c);
219 			}
220 			break;
221 		case '&':
222 			if (peek() == '&') {
223 				input(); RET(AND);
224 			} else
225 				RET('&');
226 		case '|':
227 			if (peek() == '|') {
228 				input(); RET(BOR);
229 			} else
230 				RET('|');
231 		case '!':
232 			if (peek() == '=') {
233 				input(); yylval.i = NE; RET(NE);
234 			} else if (peek() == '~') {
235 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
236 			} else
237 				RET(NOT);
238 		case '~':
239 			yylval.i = MATCH;
240 			RET(MATCHOP);
241 		case '<':
242 			if (peek() == '=') {
243 				input(); yylval.i = LE; RET(LE);
244 			} else {
245 				yylval.i = LT; RET(LT);
246 			}
247 		case '=':
248 			if (peek() == '=') {
249 				input(); yylval.i = EQ; RET(EQ);
250 			} else {
251 				yylval.i = ASSIGN; RET(ASGNOP);
252 			}
253 		case '>':
254 			if (peek() == '=') {
255 				input(); yylval.i = GE; RET(GE);
256 			} else if (peek() == '>') {
257 				input(); yylval.i = APPEND; RET(APPEND);
258 			} else {
259 				yylval.i = GT; RET(GT);
260 			}
261 		case '+':
262 			if (peek() == '+') {
263 				input(); yylval.i = INCR; RET(INCR);
264 			} else if (peek() == '=') {
265 				input(); yylval.i = ADDEQ; RET(ASGNOP);
266 			} else
267 				RET('+');
268 		case '-':
269 			if (peek() == '-') {
270 				input(); yylval.i = DECR; RET(DECR);
271 			} else if (peek() == '=') {
272 				input(); yylval.i = SUBEQ; RET(ASGNOP);
273 			} else
274 				RET('-');
275 		case '*':
276 			if (peek() == '=') {	/* *= */
277 				input(); yylval.i = MULTEQ; RET(ASGNOP);
278 			} else if (peek() == '*') {	/* ** or **= */
279 				input();	/* eat 2nd * */
280 				if (peek() == '=') {
281 					input(); yylval.i = POWEQ; RET(ASGNOP);
282 				} else {
283 					RET(POWER);
284 				}
285 			} else
286 				RET('*');
287 		case '/':
288 			RET('/');
289 		case '%':
290 			if (peek() == '=') {
291 				input(); yylval.i = MODEQ; RET(ASGNOP);
292 			} else
293 				RET('%');
294 		case '^':
295 			if (peek() == '=') {
296 				input(); yylval.i = POWEQ; RET(ASGNOP);
297 			} else
298 				RET(POWER);
299 
300 		case '$':
301 			/* BUG: awkward, if not wrong */
302 			c = gettok(&buf, &bufsize);
303 			if (isalpha(c)) {
304 				if (strcmp(buf, "NF") == 0) {	/* very special */
305 					unputstr("(NF)");
306 					RET(INDIRECT);
307 				}
308 				c = peek();
309 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
310 					unputstr(buf);
311 					RET(INDIRECT);
312 				}
313 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
314 				RET(IVAR);
315 			} else {
316 				unputstr(buf);
317 				RET(INDIRECT);
318 			}
319 
320 		case '}':
321 			if (--bracecnt < 0)
322 				SYNTAX( "extra }" );
323 			sc = 1;
324 			RET(';');
325 		case ']':
326 			if (--brackcnt < 0)
327 				SYNTAX( "extra ]" );
328 			RET(']');
329 		case ')':
330 			if (--parencnt < 0)
331 				SYNTAX( "extra )" );
332 			RET(')');
333 		case '{':
334 			bracecnt++;
335 			RET('{');
336 		case '[':
337 			brackcnt++;
338 			RET('[');
339 		case '(':
340 			parencnt++;
341 			RET('(');
342 
343 		case '"':
344 			return string();	/* BUG: should be like tran.c ? */
345 
346 		default:
347 			RET(c);
348 		}
349 	}
350 }
351 
352 int string(void)
353 {
354 	int c, n;
355 	char *s, *bp;
356 	static char *buf = 0;
357 	static int bufsz = 500;
358 
359 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
360 		FATAL("out of space for strings");
361 	for (bp = buf; (c = input()) != '"'; ) {
362 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
363 			FATAL("out of space for string %.10s...", buf);
364 		switch (c) {
365 		case '\n':
366 		case '\r':
367 		case 0:
368 			SYNTAX( "non-terminated string %.10s...", buf );
369 			lineno++;
370 			break;
371 		case '\\':
372 			c = input();
373 			switch (c) {
374 			case '"': *bp++ = '"'; break;
375 			case 'n': *bp++ = '\n'; break;
376 			case 't': *bp++ = '\t'; break;
377 			case 'f': *bp++ = '\f'; break;
378 			case 'r': *bp++ = '\r'; break;
379 			case 'b': *bp++ = '\b'; break;
380 			case 'v': *bp++ = '\v'; break;
381 			case 'a': *bp++ = '\007'; break;
382 			case '\\': *bp++ = '\\'; break;
383 
384 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
385 			case '3': case '4': case '5': case '6': case '7':
386 				n = c - '0';
387 				if ((c = peek()) >= '0' && c < '8') {
388 					n = 8 * n + input() - '0';
389 					if ((c = peek()) >= '0' && c < '8')
390 						n = 8 * n + input() - '0';
391 				}
392 				*bp++ = n;
393 				break;
394 
395 			case 'x':	/* hex  \x0-9a-fA-F + */
396 			    {	char xbuf[100], *px;
397 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
398 					if (isdigit(c)
399 					 || (c >= 'a' && c <= 'f')
400 					 || (c >= 'A' && c <= 'F'))
401 						*px++ = c;
402 					else
403 						break;
404 				}
405 				*px = 0;
406 				unput(c);
407 	  			sscanf(xbuf, "%x", &n);
408 				*bp++ = n;
409 				break;
410 			    }
411 
412 			default:
413 				*bp++ = c;
414 				break;
415 			}
416 			break;
417 		default:
418 			*bp++ = c;
419 			break;
420 		}
421 	}
422 	*bp = 0;
423 	s = tostring(buf);
424 	*bp++ = ' '; *bp++ = 0;
425 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
426 	RET(STRING);
427 }
428 
429 
430 int binsearch(char *w, Keyword *kp, int n)
431 {
432 	int cond, low, mid, high;
433 
434 	low = 0;
435 	high = n - 1;
436 	while (low <= high) {
437 		mid = (low + high) / 2;
438 		if ((cond = strcmp(w, kp[mid].word)) < 0)
439 			high = mid - 1;
440 		else if (cond > 0)
441 			low = mid + 1;
442 		else
443 			return mid;
444 	}
445 	return -1;
446 }
447 
448 int word(char *w)
449 {
450 	Keyword *kp;
451 	int c, n;
452 
453 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
454 	kp = keywords + n;
455 	if (n != -1) {	/* found in table */
456 		yylval.i = kp->sub;
457 		switch (kp->type) {	/* special handling */
458 		case FSYSTEM:
459 			if (safe)
460 				SYNTAX( "system is unsafe" );
461 			RET(kp->type);
462 		case FUNC:
463 			if (infunc)
464 				SYNTAX( "illegal nested function" );
465 			RET(kp->type);
466 		case RETURN:
467 			if (!infunc)
468 				SYNTAX( "return not in function" );
469 			RET(kp->type);
470 		case VARNF:
471 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
472 			RET(VARNF);
473 		default:
474 			RET(kp->type);
475 		}
476 	}
477 	c = peek();	/* look for '(' */
478 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
479 		yylval.i = n;
480 		RET(ARG);
481 	} else {
482 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
483 		if (c == '(') {
484 			RET(CALL);
485 		} else {
486 			RET(VAR);
487 		}
488 	}
489 }
490 
491 void startreg(void)	/* next call to yyles will return a regular expression */
492 {
493 	reg = 1;
494 }
495 
496 int regexpr(void)
497 {
498 	int c;
499 	static char *buf = 0;
500 	static int bufsz = 500;
501 	char *bp;
502 
503 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
504 		FATAL("out of space for rex expr");
505 	bp = buf;
506 	for ( ; (c = input()) != '/' && c != 0; ) {
507 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
508 			FATAL("out of space for reg expr %.10s...", buf);
509 		if (c == '\n') {
510 			SYNTAX( "newline in regular expression %.10s...", buf );
511 			unput('\n');
512 			break;
513 		} else if (c == '\\') {
514 			*bp++ = '\\';
515 			*bp++ = input();
516 		} else {
517 			*bp++ = c;
518 		}
519 	}
520 	*bp = 0;
521 	yylval.s = tostring(buf);
522 	unput('/');
523 	RET(REGEXPR);
524 }
525 
526 /* low-level lexical stuff, sort of inherited from lex */
527 
/*
 * ebuf/ep:      circular record of the characters input() has returned;
 *               input() stores at ep and wraps, unput() steps ep back.
 *               (Presumably consumed by error reporting elsewhere - not
 *               visible in this file.)
 * yysbuf/yysptr: stack of characters pushed back by unput(); input()
 *               pops from it before reading anything new.
 * yyin:         not referenced by the code in this file.
 */
char	ebuf[300];
char	*ep = ebuf;
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;
FILE	*yyin = 0;
533 
534 int input(void)	/* get next lexical input character */
535 {
536 	int c;
537 	extern char *lexprog;
538 
539 	if (yysptr > yysbuf)
540 		c = *--yysptr;
541 	else if (lexprog != NULL) {	/* awk '...' */
542 		if ((c = *lexprog) != 0)
543 			lexprog++;
544 	} else				/* awk -f ... */
545 		c = pgetc();
546 	if (c == '\n')
547 		lineno++;
548 	else if (c == EOF)
549 		c = 0;
550 	if (ep >= ebuf + sizeof ebuf)
551 		ep = ebuf;
552 	return *ep++ = c;
553 }
554 
555 void unput(int c)	/* put lexical character back on input */
556 {
557 	if (c == '\n')
558 		lineno--;
559 	if (yysptr >= yysbuf + sizeof(yysbuf))
560 		FATAL("pushed back too much: %.20s...", yysbuf);
561 	*yysptr++ = c;
562 	if (--ep < ebuf)
563 		ep = ebuf + sizeof(ebuf) - 1;
564 }
565 
/*
 * unputstr: push the whole string s back on the input.
 * Characters are pushed last-to-first so that input() will deliver them
 * again in their original order.
 */
void unputstr(char *s)	/* put a string back on input */
{
	char *p;

	for (p = s + strlen(s); p > s; )
		unput(*--p);
}
573