xref: /netbsd/external/historical/nawk/dist/lex.c (revision e66db18e)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #if HAVE_NBTOOL_CONFIG_H
26 #include "nbtool_config.h"
27 #endif
28 
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <ctype.h>
33 #include "awk.h"
34 #include "awkgram.h"
35 
36 extern YYSTYPE	yylval;
37 extern int	infunc;
38 
39 int	lineno	= 1;
40 int	bracecnt = 0;
41 int	brackcnt  = 0;
42 int	parencnt = 0;
43 
/* One row of the reserved-word table searched by word(). */
typedef struct Keyword {
	const char *word;	/* spelling compared with strcmp() */
	int	sub;		/* stored in yylval.i when the word matches */
	int	type;		/* token code returned to the caller of word() */
} Keyword;
49 
50 const Keyword keywords[] = {	/* keep sorted: binary searched */
51 	{ "BEGIN",	XBEGIN,		XBEGIN },
52 	{ "END",	XEND,		XEND },
53 	{ "NF",		VARNF,		VARNF },
54 	{ "and",	FAND,		BLTIN },
55 	{ "atan2",	FATAN,		BLTIN },
56 	{ "break",	BREAK,		BREAK },
57 	{ "close",	CLOSE,		CLOSE },
58 	{ "compl",	FCOMPL,		BLTIN },
59 	{ "continue",	CONTINUE,	CONTINUE },
60 	{ "cos",	FCOS,		BLTIN },
61 	{ "delete",	DELETE,		DELETE },
62 	{ "do",		DO,		DO },
63 	{ "else",	ELSE,		ELSE },
64 	{ "exit",	EXIT,		EXIT },
65 	{ "exp",	FEXP,		BLTIN },
66 	{ "fflush",	FFLUSH,		BLTIN },
67 	{ "for",	FOR,		FOR },
68 	{ "func",	FUNC,		FUNC },
69 	{ "function",	FUNC,		FUNC },
70 	{ "gensub",	GENSUB,		GENSUB },
71 	{ "getline",	GETLINE,	GETLINE },
72 	{ "gsub",	GSUB,		GSUB },
73 	{ "if",		IF,		IF },
74 	{ "in",		IN,		IN },
75 	{ "index",	INDEX,		INDEX },
76 	{ "int",	FINT,		BLTIN },
77 	{ "length",	FLENGTH,	BLTIN },
78 	{ "log",	FLOG,		BLTIN },
79 	{ "lshift",	FLSHIFT,	BLTIN },
80 	{ "match",	MATCHFCN,	MATCHFCN },
81 	{ "next",	NEXT,		NEXT },
82 	{ "nextfile",	NEXTFILE,	NEXTFILE },
83 	{ "or",		FFOR,		BLTIN },
84 	{ "print",	PRINT,		PRINT },
85 	{ "printf",	PRINTF,		PRINTF },
86 	{ "rand",	FRAND,		BLTIN },
87 	{ "return",	RETURN,		RETURN },
88 	{ "rshift",	FRSHIFT,	BLTIN },
89 	{ "sin",	FSIN,		BLTIN },
90 	{ "split",	SPLIT,		SPLIT },
91 	{ "sprintf",	SPRINTF,	SPRINTF },
92 	{ "sqrt",	FSQRT,		BLTIN },
93 	{ "srand",	FSRAND,		BLTIN },
94 	{ "strftime",	FSTRFTIME,	BLTIN },
95 	{ "sub",	SUB,		SUB },
96 	{ "substr",	SUBSTR,		SUBSTR },
97 	{ "system",	FSYSTEM,	BLTIN },
98 	{ "systime",	FSYSTIME,	BLTIN },
99 	{ "tolower",	FTOLOWER,	BLTIN },
100 	{ "toupper",	FTOUPPER,	BLTIN },
101 	{ "while",	WHILE,		WHILE },
102 	{ "xor",	FXOR,		BLTIN },
103 };
104 
/*
 * RET(x): return token x from the enclosing function, first printing its
 * name via tokname() when dbg is set.  The do/while (0) wrapper makes the
 * expansion a single statement, so `if (...) RET(a); else RET(b);' parses
 * correctly.  Note that x is evaluated twice when dbg is set.
 */
#define	RET(x)	do { if(dbg)printf("lex %s\n", tokname(x)); return(x); } while (0)
106 
/*
 * peek - report the next input character without consuming it.
 *
 * The character is read with input() and pushed straight back with
 * unput(), so the following input() call returns it again.
 */
static int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
113 
/*
 * gettok - read the next raw token into the caller's growable buffer.
 *
 * pbuf/psz: address of the buffer pointer and of its size; both are written
 *	back when a name or number is scanned, since adjbuf() may have
 *	reallocated the buffer.
 *
 * Returns:
 *	0	at end of input;
 *	c	for any character that cannot start a name or number
 *		(the buffer then holds just that character);
 *	'a'	for a name ([A-Za-z_][A-Za-z0-9_]*), text left in the buffer;
 *	'0'	for a number, text left in the buffer;
 *	or the first character itself when something that looked like a
 *	number (e.g. a lone '.') turned out not to be one.
 */
static int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Keep room for this character AND the terminating
			 * NUL: the loop may exit on end of input right after
			 * the store, and *bp = 0 below must stay in bounds.
			 */
			if (bp-buf+1 >= sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same headroom rule as above: character plus NUL */
			if (bp-buf+1 >= sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			buf[1] = 0;	/* return one character as token */
			retc = buf[0];	/* character is its own type */
			unputstr(rem+1); /* put rest back for later */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
176 
/* Helpers defined further down in this file. */
int	word(char *);
int	string(void);
int	regexpr(void);

/* One-shot flags consumed at the top of yylex(). */
bool	sc	= false;	/* true => return a } right now */
bool	reg	= false;	/* true => return a REGEXPR now */
182 
yylex(void)183 int yylex(void)
184 {
185 	int c;
186 	static char *buf = NULL;
187 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
188 
189 	if (buf == NULL && (buf = malloc(bufsize)) == NULL)
190 		FATAL( "out of space in yylex" );
191 	if (sc) {
192 		sc = false;
193 		RET('}');
194 	}
195 	if (reg) {
196 		reg = false;
197 		return regexpr();
198 	}
199 	for (;;) {
200 		c = gettok(&buf, &bufsize);
201 		if (c == 0)
202 			return 0;
203 		if (isalpha(c) || c == '_')
204 			return word(buf);
205 		if (isdigit(c)) {
206 			char *cp = tostring(buf);
207 			yylval.cp = setsymtab(buf, cp, atof(buf), CON|NUM, symtab);
208 			free(cp);
209 			/* should this also have STR set? */
210 			RET(NUMBER);
211 		}
212 
213 		yylval.i = c;
214 		switch (c) {
215 		case '\n':	/* {EOL} */
216 			lineno++;
217 			RET(NL);
218 		case '\r':	/* assume \n is coming */
219 		case ' ':	/* {WS}+ */
220 		case '\t':
221 			break;
222 		case '#':	/* #.* strip comments */
223 			while ((c = input()) != '\n' && c != 0)
224 				;
225 			unput(c);
226 			/*
227 			 * Next line is a hack, itcompensates for
228 			 * unput's treatment of \n.
229 			 */
230 			lineno++;
231 			break;
232 		case ';':
233 			RET(';');
234 		case '\\':
235 			if (peek() == '\n') {
236 				input();
237 				lineno++;
238 			} else if (peek() == '\r') {
239 				input(); input();	/* \n */
240 				lineno++;
241 			} else {
242 				RET(c);
243 			}
244 			break;
245 		case '&':
246 			if (peek() == '&') {
247 				input(); RET(AND);
248 			} else
249 				RET('&');
250 		case '|':
251 			if (peek() == '|') {
252 				input(); RET(BOR);
253 			} else
254 				RET('|');
255 		case '!':
256 			if (peek() == '=') {
257 				input(); yylval.i = NE; RET(NE);
258 			} else if (peek() == '~') {
259 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
260 			} else
261 				RET(NOT);
262 		case '~':
263 			yylval.i = MATCH;
264 			RET(MATCHOP);
265 		case '<':
266 			if (peek() == '=') {
267 				input(); yylval.i = LE; RET(LE);
268 			} else {
269 				yylval.i = LT; RET(LT);
270 			}
271 		case '=':
272 			if (peek() == '=') {
273 				input(); yylval.i = EQ; RET(EQ);
274 			} else {
275 				yylval.i = ASSIGN; RET(ASGNOP);
276 			}
277 		case '>':
278 			if (peek() == '=') {
279 				input(); yylval.i = GE; RET(GE);
280 			} else if (peek() == '>') {
281 				input(); yylval.i = APPEND; RET(APPEND);
282 			} else {
283 				yylval.i = GT; RET(GT);
284 			}
285 		case '+':
286 			if (peek() == '+') {
287 				input(); yylval.i = INCR; RET(INCR);
288 			} else if (peek() == '=') {
289 				input(); yylval.i = ADDEQ; RET(ASGNOP);
290 			} else
291 				RET('+');
292 		case '-':
293 			if (peek() == '-') {
294 				input(); yylval.i = DECR; RET(DECR);
295 			} else if (peek() == '=') {
296 				input(); yylval.i = SUBEQ; RET(ASGNOP);
297 			} else
298 				RET('-');
299 		case '*':
300 			if (peek() == '=') {	/* *= */
301 				input(); yylval.i = MULTEQ; RET(ASGNOP);
302 			} else if (peek() == '*') {	/* ** or **= */
303 				input();	/* eat 2nd * */
304 				if (peek() == '=') {
305 					input(); yylval.i = POWEQ; RET(ASGNOP);
306 				} else {
307 					RET(POWER);
308 				}
309 			} else
310 				RET('*');
311 		case '/':
312 			RET('/');
313 		case '%':
314 			if (peek() == '=') {
315 				input(); yylval.i = MODEQ; RET(ASGNOP);
316 			} else
317 				RET('%');
318 		case '^':
319 			if (peek() == '=') {
320 				input(); yylval.i = POWEQ; RET(ASGNOP);
321 			} else
322 				RET(POWER);
323 
324 		case '$':
325 			/* BUG: awkward, if not wrong */
326 			c = gettok(&buf, &bufsize);
327 			if (isalpha(c)) {
328 				if (strcmp(buf, "NF") == 0) {	/* very special */
329 					unputstr("(NF)");
330 					RET(INDIRECT);
331 				}
332 				c = peek();
333 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
334 					unputstr(buf);
335 					RET(INDIRECT);
336 				}
337 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
338 				RET(IVAR);
339 			} else if (c == 0) {	/*  */
340 				SYNTAX( "unexpected end of input after $" );
341 				RET(';');
342 			} else {
343 				unputstr(buf);
344 				RET(INDIRECT);
345 			}
346 
347 		case '}':
348 			if (--bracecnt < 0)
349 				SYNTAX( "extra }" );
350 			sc = true;
351 			RET(';');
352 		case ']':
353 			if (--brackcnt < 0)
354 				SYNTAX( "extra ]" );
355 			RET(']');
356 		case ')':
357 			if (--parencnt < 0)
358 				SYNTAX( "extra )" );
359 			RET(')');
360 		case '{':
361 			bracecnt++;
362 			RET('{');
363 		case '[':
364 			brackcnt++;
365 			RET('[');
366 		case '(':
367 			parencnt++;
368 			RET('(');
369 
370 		case '"':
371 			return string();	/* BUG: should be like tran.c ? */
372 
373 		default:
374 			RET(c);
375 		}
376 	}
377 }
378 
string(void)379 int string(void)
380 {
381 	int c, n;
382 	char *s, *bp;
383 	static char *buf = NULL;
384 	static int bufsz = 500;
385 
386 	if (buf == NULL && (buf = malloc(bufsz)) == NULL)
387 		FATAL("out of space for strings");
388 	for (bp = buf; (c = input()) != '"'; ) {
389 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
390 			FATAL("out of space for string %.10s...", buf);
391 		switch (c) {
392 		case '\n':
393 		case '\r':
394 		case 0:
395 			*bp = '\0';
396 			SYNTAX( "non-terminated string %.10s...", buf );
397 			if (c == 0)	/* hopeless */
398 				FATAL( "giving up" );
399 			lineno++;
400 			break;
401 		case '\\':
402 			c = input();
403 			switch (c) {
404 			case '\n': break;
405 			case '"': *bp++ = '"'; break;
406 			case 'n': *bp++ = '\n'; break;
407 			case 't': *bp++ = '\t'; break;
408 			case 'f': *bp++ = '\f'; break;
409 			case 'r': *bp++ = '\r'; break;
410 			case 'b': *bp++ = '\b'; break;
411 			case 'v': *bp++ = '\v'; break;
412 			case 'a': *bp++ = '\a'; break;
413 			case '\\': *bp++ = '\\'; break;
414 
415 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
416 			case '3': case '4': case '5': case '6': case '7':
417 				n = c - '0';
418 				if ((c = peek()) >= '0' && c < '8') {
419 					n = 8 * n + input() - '0';
420 					if ((c = peek()) >= '0' && c < '8')
421 						n = 8 * n + input() - '0';
422 				}
423 				*bp++ = n;
424 				break;
425 
426 			case 'x':	/* hex  \x0-9a-fA-F + */
427 			    {	char xbuf[100], *px;
428 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
429 					if (isdigit(c)
430 					 || (c >= 'a' && c <= 'f')
431 					 || (c >= 'A' && c <= 'F'))
432 						*px++ = c;
433 					else
434 						break;
435 				}
436 				*px = 0;
437 				unput(c);
438 	  			sscanf(xbuf, "%x", (unsigned int *) &n);
439 				*bp++ = n;
440 				break;
441 			    }
442 
443 			default:
444 				*bp++ = c;
445 				break;
446 			}
447 			break;
448 		default:
449 			*bp++ = c;
450 			break;
451 		}
452 	}
453 	*bp = 0;
454 	s = tostring(buf);
455 	*bp++ = ' '; *bp++ = '\0';
456 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
457 	free(s);
458 	RET(STRING);
459 }
460 
461 
binsearch(char * w,const Keyword * kp,int n)462 static int binsearch(char *w, const Keyword *kp, int n)
463 {
464 	int cond, low, mid, high;
465 
466 	low = 0;
467 	high = n - 1;
468 	while (low <= high) {
469 		mid = (low + high) / 2;
470 		if ((cond = strcmp(w, kp[mid].word)) < 0)
471 			high = mid - 1;
472 		else if (cond > 0)
473 			low = mid + 1;
474 		else
475 			return mid;
476 	}
477 	return -1;
478 }
479 
word(char * w)480 int word(char *w)
481 {
482 	const Keyword *kp;
483 	int c, n;
484 
485 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
486 	if (n != -1) {	/* found in table */
487 		kp = keywords + n;
488 		yylval.i = kp->sub;
489 		switch (kp->type) {	/* special handling */
490 		case BLTIN:
491 			if (kp->sub == FSYSTEM && safe)
492 				SYNTAX( "system is unsafe" );
493 			RET(kp->type);
494 		case FUNC:
495 			if (infunc)
496 				SYNTAX( "illegal nested function" );
497 			RET(kp->type);
498 		case RETURN:
499 			if (!infunc)
500 				SYNTAX( "return not in function" );
501 			RET(kp->type);
502 		case VARNF:
503 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
504 			RET(VARNF);
505 		default:
506 			RET(kp->type);
507 		}
508 	}
509 	c = peek();	/* look for '(' */
510 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
511 		yylval.i = n;
512 		RET(ARG);
513 	} else {
514 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
515 		if (c == '(') {
516 			RET(CALL);
517 		} else {
518 			RET(VAR);
519 		}
520 	}
521 }
522 
startreg(void)523 void startreg(void)	/* next call to yylex will return a regular expression */
524 {
525 	reg = true;
526 }
527 
regexpr(void)528 int regexpr(void)
529 {
530 	int c;
531 	static char *buf = NULL;
532 	static int bufsz = 500;
533 	char *bp;
534 
535 	if (buf == NULL && (buf = malloc(bufsz)) == NULL)
536 		FATAL("out of space for rex expr");
537 	bp = buf;
538 	for ( ; (c = input()) != '/' && c != 0; ) {
539 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
540 			FATAL("out of space for reg expr %.10s...", buf);
541 		if (c == '\n') {
542 			*bp = '\0';
543 			SYNTAX( "newline in regular expression %.10s...", buf );
544 			unput('\n');
545 			break;
546 		} else if (c == '\\') {
547 			*bp++ = '\\';
548 			*bp++ = input();
549 		} else {
550 			*bp++ = c;
551 		}
552 	}
553 	*bp = 0;
554 	if (c == 0)
555 		SYNTAX("non-terminated regular expression %.10s...", buf);
556 	yylval.s = tostring(buf);
557 	unput('/');
558 	RET(REGEXPR);
559 }
560 
/* low-level lexical stuff, sort of inherited from lex */

/* Circular record of the characters handed out by input(). */
char	ebuf[300];
char	*ep = ebuf;		/* next slot to fill in ebuf */

/* Stack of characters returned by unput(), popped first by input(). */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed character */

FILE	*yyin = NULL;
568 
input(void)569 int input(void)	/* get next lexical input character */
570 {
571 	int c;
572 	extern char *lexprog;
573 
574 	if (yysptr > yysbuf)
575 		c = (uschar)*--yysptr;
576 	else if (lexprog != NULL) {	/* awk '...' */
577 		if ((c = (uschar)*lexprog) != 0)
578 			lexprog++;
579 	} else				/* awk -f ... */
580 		c = pgetc();
581 	if (c == EOF)
582 		c = 0;
583 	if (ep >= ebuf + sizeof ebuf)
584 		ep = ebuf;
585 	*ep = c;
586 	if (c != 0) {
587 		ep++;
588 	}
589 	return (c);
590 }
591 
unput(int c)592 void unput(int c)	/* put lexical character back on input */
593 {
594 	if (c == '\n')
595 		lineno--;
596 	if (yysptr >= yysbuf + sizeof(yysbuf))
597 		FATAL("pushed back too much: %.20s...", yysbuf);
598 	*yysptr++ = c;
599 	if (--ep < ebuf)
600 		ep = ebuf + sizeof(ebuf) - 1;
601 }
602 
/*
 * unputstr - push the whole string s back onto the input.
 *
 * Characters are handed to unput() last-to-first, so later input() calls
 * return them in their original order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p;

	for (p = s + strlen(s); p > s; )
		unput(*--p);
}
610