xref: /illumos-gate/usr/src/cmd/awk_xpg4/awk1.c (revision f808c858)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright 1986, 1994 by Mortice Kern Systems Inc.  All rights reserved.
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 /*
34  * awk -- mainline, yylex, etc.
35  *
36  * Based on MKS awk(1) ported to be /usr/xpg4/bin/awk with POSIX/XCU4 changes
37  */
38 
39 #include "awk.h"
40 #include "y.tab.h"
41 #include <stdarg.h>
42 #include <unistd.h>
43 #include <locale.h>
44 
/* State shared by the lexer routines in this file. */
45 static char	*progfiles[NPFILE];	/* Program files (-f operands) for yylex */
46 static char	**progfilep = &progfiles[0]; /* Next free slot in progfiles[] */
47 static wchar_t	*progptr;		/* In-memory program text, consumed by lexgetc */
48 static int	proglen;		/* Characters remaining at progptr */
49 static wchar_t	context[NCONTEXT];	/* Circular buffer of recently read input */
50 static wchar_t	*conptr = &context[0];	/* Next write position in context[] */
51 static FILE	*progfp;		/* Stdio stream for the current program file */
52 static char	*filename;		/* Program source name used in error messages */
53 #ifdef	DEBUG
54 static int	dflag;			/* Set by -d: yylex prints each token */
55 #endif
56 
57 #define	AWK_EXEC_MAGIC	"<MKS AWKC>"
58 #define	LEN_EXEC_MAGIC	10
59 
/* Message passed to awkerr() for unmatched (), {} and [] in yylex. */
60 static char	unbal[] = "unbalanced E char";
61 
62 static void	awkarginit(int c, char **av);
63 static int	lexid(wint_t c);
64 static int	lexnumber(wint_t c);
65 static int	lexstring(wint_t endc);
66 static int	lexregexp(wint_t endc);
67 
68 static void	awkvarinit(void);
69 static wint_t	lexgetc(void);
70 static void	lexungetc(wint_t c);
71 static size_t	lexescape(wint_t endc, int regx, int cmd_line_operand);
72 static void	awkierr(int perr, char *fmt, va_list ap);
73 static int	usage(void);
74 void		strescape(wchar_t *str);
75 static const char	*toprint(wint_t);
76 char *_cmdname;			/* argv[0]; prefix of every error message */
77 static wchar_t *mbconvert(char *str);
78 
79 extern int	isclvar(wchar_t *arg);
80 
81 /*
82  * mainline for awk
83  */
84 int
85 main(int argc, char *argv[])
86 {
87 	wchar_t *ap;
88 	char *cmd;
89 
90 	cmd = argv[0];
91 	_cmdname = cmd;
92 
93 	linebuf = emalloc(NLINE * sizeof (wchar_t));
94 
95 	/*
96 	 * At this point only messaging should be internationalized.
97 	 * numbers are still scanned as in the Posix locale.
98 	 */
99 	(void) setlocale(LC_ALL, "");
100 	(void) setlocale(LC_NUMERIC, "C");
101 #if !defined(TEXT_DOMAIN)
102 #define	TEXT_DOMAIN	"SYS_TEST"
103 #endif
104 	(void) textdomain(TEXT_DOMAIN);
105 
106 	awkvarinit();
107 	/* running = 1; */
108 	while (argc > 1 && *argv[1] == '-') {
109 		void *save_ptr = NULL;
110 		ap = mbstowcsdup(&argv[1][1]);
111 		if (ap == NULL)
112 			break;
113 		if (*ap == '\0') {
114 			free(ap);
115 			break;
116 		}
117 		save_ptr = (void *) ap;
118 		++argv;
119 		--argc;
120 		if (*ap == '-' && ap[1] == '\0')
121 			break;
122 		for (; *ap != '\0'; ++ap) {
123 			switch (*ap) {
124 #ifdef DEBUG
125 			case 'd':
126 				dflag = 1;
127 				continue;
128 
129 #endif
130 			case 'f':
131 				if (argc < 2) {
132 					(void) fprintf(stderr,
133 				gettext("Missing script file\n"));
134 					return (1);
135 				}
136 				*progfilep++ = argv[1];
137 				--argc;
138 				++argv;
139 				continue;
140 
141 			case 'F':
142 				if (ap[1] == '\0') {
143 					if (argc < 2) {
144 						(void) fprintf(stderr,
145 				gettext("Missing field separator\n"));
146 						return (1);
147 					}
148 					ap = mbstowcsdup(argv[1]);
149 					--argc;
150 					++argv;
151 				} else
152 					++ap;
153 				strescape(ap);
154 				strassign(varFS, linebuf, FALLOC,
155 				    wcslen(linebuf));
156 				break;
157 
158 			case 'v': {
159 				wchar_t *vp;
160 				wchar_t *arg;
161 
162 				if (argc < 2) {
163 					(void) fprintf(stderr,
164 		gettext("Missing variable assignment\n"));
165 					return (1);
166 				}
167 				arg = mbconvert(argv[1]);
168 				/*
169 				 * Ensure the variable expression
170 				 * is valid (correct form).
171 				 */
172 				if (((vp = wcschr(arg, '=')) != NULL) &&
173 				    isclvar(arg)) {
174 					*vp = '\0';
175 					strescape(vp+1);
176 					strassign(vlook(arg), linebuf,
177 					    FALLOC|FSENSE,
178 					    wcslen(linebuf));
179 					*vp = '=';
180 				} else {
181 					(void) fprintf(stderr, gettext(
182 					    "Invalid form for variable "
183 					    "assignment: %S\n"), arg);
184 					return (1);
185 				}
186 				--argc;
187 				++argv;
188 				continue;
189 			}
190 
191 			default:
192 				(void) fprintf(stderr,
193 				gettext("Unknown option \"-%S\"\n"), ap);
194 				return (usage());
195 			}
196 			break;
197 		}
198 		if (save_ptr)
199 			free(save_ptr);
200 	}
201 	if (progfilep == &progfiles[0]) {
202 		if (argc < 2)
203 			return (usage());
204 		filename = "[command line]";	/* BUG: NEEDS TRANSLATION */
205 		progptr = mbstowcsdup(argv[1]);
206 		proglen = wcslen(progptr);
207 		--argc;
208 		++argv;
209 	}
210 
211 	argv[0] = cmd;
212 
213 	awkarginit(argc, argv);
214 
215 	/* running = 0; */
216 	(void) yyparse();
217 
218 	lineno = 0;
219 	/*
220 	 * Ok, done parsing, so now activate the rest of the nls stuff, set
221 	 * the radix character.
222 	 */
223 	(void) setlocale(LC_ALL, "");
224 	radixpoint = *localeconv()->decimal_point;
225 	awk();
226 	/* NOTREACHED */
227 	return (0);
228 }
229 
230 /*
231  * Do initial setup of buffers, etc.
232  * This must be called before most processing
233  * and especially before lexical analysis.
234  * Variables initialised here will be overruled by command
235  * line parameter initialisation.
236  */
237 static void
238 awkvarinit()
239 {
240 	NODE *np;
241 
242 	(void) setvbuf(stderr, NULL, _IONBF, 0);
243 
244 	if ((NIOSTREAM = sysconf(_SC_OPEN_MAX) - 4) <= 0) {
245 		(void) fprintf(stderr,
246 	gettext("not enough available file descriptors"));
247 		exit(1);
248 	}
249 	ofiles = (OFILE *)emalloc(sizeof (OFILE)*NIOSTREAM);
250 #ifdef A_ZERO_POINTERS
251 	(void) memset((wchar_t *)ofiles, 0, sizeof (OFILE) * NIOSTREAM);
252 #else
253 	{
254 		/* initialize file descriptor table */
255 		OFILE *fp;
256 		for (fp = ofiles; fp < &ofiles[NIOSTREAM]; fp += 1) {
257 			fp->f_fp = FNULL;
258 					fp->f_mode = 0;
259 					fp->f_name = (char *)0;
260 		}
261 	}
262 #endif
263 	constant = intnode((INT)0);
264 
265 	const0 = intnode((INT)0);
266 	const1 = intnode((INT)1);
267 	constundef = emptynode(CONSTANT, 0);
268 	constundef->n_flags = FSTRING|FVINT;
269 	constundef->n_string = _null;
270 	constundef->n_strlen = 0;
271 	inc_oper = emptynode(ADD, 0);
272 	inc_oper->n_right = const1;
273 	asn_oper = emptynode(ADD, 0);
274 	field0 = node(FIELD, const0, NNULL);
275 
276 	{
277 		RESFUNC near*rp;
278 
279 		for (rp = &resfuncs[0]; rp->rf_name != (LOCCHARP)NULL; ++rp) {
280 			np = finstall(rp->rf_name, rp->rf_func, rp->rf_type);
281 		}
282 	}
283 	{
284 		RESERVED near*rp;
285 
286 		for (rp = &reserved[0]; rp->r_name != (LOCCHARP)NULL; ++rp) {
287 			switch (rp->r_type) {
288 			case SVAR:
289 			case VAR:
290 				running = 1;
291 				np = vlook(rp->r_name);
292 				if (rp->r_type == SVAR)
293 					np->n_flags |= FSPECIAL;
294 				if (rp->r_svalue != NULL)
295 					strassign(np, rp->r_svalue, FSTATIC,
296 					    (size_t)rp->r_ivalue);
297 				else {
298 					constant->n_int = rp->r_ivalue;
299 					(void) assign(np, constant);
300 				}
301 				running = 0;
302 				break;
303 
304 			case KEYWORD:
305 				kinstall(rp->r_name, (int)rp->r_ivalue);
306 				break;
307 			}
308 		}
309 	}
310 
311 	varNR = vlook(s_NR);
312 	varFNR = vlook(s_FNR);
313 	varNF = vlook(s_NF);
314 	varOFMT = vlook(s_OFMT);
315 	varCONVFMT = vlook(s_CONVFMT);
316 	varOFS = vlook(s_OFS);
317 	varORS = vlook(s_ORS);
318 	varRS = vlook(s_RS);
319 	varFS = vlook(s_FS);
320 	varARGC = vlook(s_ARGC);
321 	varSUBSEP = vlook(s_SUBSEP);
322 	varENVIRON = vlook(s_ENVIRON);
323 	varFILENAME = vlook(s_FILENAME);
324 	varSYMTAB = vlook(s_SYMTAB);
325 	incNR = node(ASG, varNR, node(ADD, varNR, const1));
326 	incFNR = node(ASG, varFNR, node(ADD, varFNR, const1));
327 	clrFNR = node(ASG, varFNR, const0);
328 }
329 
330 /*
331  * Initialise awk ARGC, ARGV variables.
332  */
333 static void
334 awkarginit(int ac, char **av)
335 {
336 	int i;
337 	wchar_t *cp;
338 
339 	ARGVsubi = node(INDEX, vlook(s_ARGV), constant);
340 	running = 1;
341 	constant->n_int = ac;
342 	(void) assign(varARGC, constant);
343 	for (i = 0; i < ac; ++i) {
344 		cp = mbstowcsdup(av[i]);
345 		constant->n_int = i;
346 		strassign(exprreduce(ARGVsubi), cp,
347 		    FSTATIC|FSENSE, wcslen(cp));
348 	}
349 	running = 0;
350 }
351 
352 /*
353  * Clean up when done parsing a function.
354  * All formal parameters, because of a deal (funparm) in
355  * yylex, get put into the symbol table in front of any
356  * global variable of the same name.  When the entire
357  * function is parsed, remove these formal dummy nodes
358  * from the symbol table but retain the nodes because
359  * the generated tree points at them.
360  */
361 void
362 uexit(NODE *np)
363 {
364 	NODE *formal;
365 
366 	while ((formal = getlist(&np)) != NNULL)
367 		delsymtab(formal, 0);
368 }
369 
370 /*
 * The lexical analyzer.
 *
 * Returns the next token to the parser.  A token parked in `savetoken'
 * by an earlier call is picked up first.  If `redelim' is set, the
 * input up to that delimiter is lexed as a regular expression and the
 * delimiter itself is parked for the next call.  Otherwise characters
 * are read with lexgetc() and classified by the first switch.
 *
 * The second switch maintains `catterm': when it is set and the new
 * token is one of those listed there (VAR, CONSTANT, '(' ...), CONCAT
 * is returned now and the token itself is parked in `savetoken'.
 * Finally, single-character tokens are mapped to their symbolic values
 * through ctosym[].
 *
 * When built with DEBUG, yylex() is a wrapper that prints each token
 * produced by the real lexer (then named yyhex) when dflag is set.
 */
373 int
374 yylex()
375 #ifdef	DEBUG
376 {
377 	int l;
378 
379 	l = yyhex();
380 	if (dflag)
381 		(void) printf("%d\n", l);
382 	return (l);
383 }
384 yyhex()
385 #endif
386 {
387 	wint_t c, c1;
388 	int i;
	/* token held back by a previous call, 0 if none */
389 	static int savetoken = 0;
	/* set to 2 when a CONSTANT/PARM follows '$'; counted down per token */
390 	static int wasfield;
	/* set after DEFFUNC until the following VAR has been seen */
391 	static int isfuncdef;
	/* current nesting depth of {}, () and [] */
392 	static int nbrace, nparen, nbracket;
393 	static struct ctosymstruct {
394 		wint_t c, sym;
395 	} ctosym[] = {
396 		{ '|', BAR },		{ '^', CARAT },
397 		{ '~', TILDE },		{ '<', LANGLE },
398 		{ '>', RANGLE },	{ '+', PLUSC },
399 		{ '-', HYPHEN },	{ '*', STAR },
400 		{ '/', SLASH },		{ '%', PERCENT },
401 		{ '!', EXCLAMATION },	{ '$', DOLLAR },
402 		{ '[', LSQUARE },	{ ']', RSQUARE },
403 		{ '(', LPAREN },	{ ')', RPAREN },
404 		{ ';', SEMI },		{ '{', LBRACE },
405 		{ '}', RBRACE },	{   0, 0 }
406 	};
407 
408 	if (savetoken) {
409 		c = savetoken;
410 		savetoken = 0;
411 	} else if (redelim != '\0') {
		/*
		 * Lex a regular expression up to the delimiter; the
		 * delimiter is parked so the next call returns it.
		 */
412 		c = redelim;
413 		redelim = 0;
414 		catterm = 0;
415 		savetoken = c;
416 		return (lexlast = lexregexp(c));
417 	} else while ((c = lexgetc()) != WEOF) {
		/*
		 * `continue' skips the character and reads another;
		 * the `break' after the switch leaves the loop with
		 * the token in c.
		 */
418 		if (iswalpha(c) || c == '_') {
419 			c = lexid(c);
420 		} else if (iswdigit(c) || c == '.') {
421 			c = lexnumber(c);
422 		} else if (isWblank(c)) {
423 			continue;
424 		} else switch (c) {
425 #if DOS || OS2
426 		case 032:		/* ^Z */
427 			continue;
428 #endif
429 
430 		case '"':
431 			c = lexstring(c);
432 			break;
433 
434 		case '#':
			/* comment: skip to end of line, keep the newline */
435 			while ((c = lexgetc()) != '\n' && c != WEOF)
436 				;
437 			lexungetc(c);
438 			continue;
439 
440 		case '+':
441 			if ((c1 = lexgetc()) == '+')
442 				c = INC;
443 			else if (c1 == '=')
444 				c = AADD;
445 			else
446 				lexungetc(c1);
447 			break;
448 
449 		case '-':
450 			if ((c1 = lexgetc()) == '-')
451 				c = DEC;
452 			else if (c1 == '=')
453 				c = ASUB;
454 			else
455 				lexungetc(c1);
456 			break;
457 
458 		case '*':
			/* "*=", "**=", "**" or plain '*' */
459 			if ((c1 = lexgetc()) == '=')
460 				c = AMUL;
461 			else if (c1 == '*') {
462 				if ((c1 = lexgetc()) == '=')
463 					c = AEXP;
464 				else {
465 					c = EXP;
466 					lexungetc(c1);
467 				}
468 			} else
469 				lexungetc(c1);
470 			break;
471 
472 		case '^':
473 			if ((c1 = lexgetc()) == '=') {
474 				c = AEXP;
475 			} else {
476 				c = EXP;
477 				lexungetc(c1);
478 			}
479 			break;
480 
481 		case '/':
			/*
			 * "/=" is ADIV unless the previous token was RE,
			 * NRE, ';', newline, ',' or '(' -- presumably
			 * because a regular expression starting with '='
			 * may follow those.
			 */
482 			if ((c1 = lexgetc()) == '=' &&
483 			    lexlast != RE && lexlast != NRE &&
484 			    lexlast != ';' && lexlast != '\n' &&
485 			    lexlast != ',' && lexlast != '(')
486 				c = ADIV;
487 			else
488 				lexungetc(c1);
489 			break;
490 
491 		case '%':
492 			if ((c1 = lexgetc()) == '=')
493 				c = AREM;
494 			else
495 				lexungetc(c1);
496 			break;
497 
498 		case '&':
499 			if ((c1 = lexgetc()) == '&')
500 				c = AND;
501 			else
502 				lexungetc(c1);
503 			break;
504 
505 		case '|':
			/* a single '|' inside a print statement is PIPE */
506 			if ((c1 = lexgetc()) == '|')
507 				c = OR;
508 			else {
509 				lexungetc(c1);
510 				if (inprint)
511 					c = PIPE;
512 			}
513 			break;
514 
515 		case '>':
			/*
			 * '>' in a print statement and outside parentheses
			 * is the output redirection WRITE.
			 */
516 			if ((c1 = lexgetc()) == '=')
517 				c = GE;
518 			else if (c1 == '>')
519 				c = APPEND;
520 			else {
521 				lexungetc(c1);
522 				if (nparen == 0 && inprint)
523 					c = WRITE;
524 			}
525 			break;
526 
527 		case '<':
528 			if ((c1 = lexgetc()) == '=')
529 				c = LE;
530 			else
531 				lexungetc(c1);
532 			break;
533 
534 		case '!':
535 			if ((c1 = lexgetc()) == '=')
536 				c = NE;
537 			else if (c1 == '~')
538 				c = NRE;
539 			else
540 				lexungetc(c1);
541 			break;
542 
543 		case '=':
544 			if ((c1 = lexgetc()) == '=')
545 				c = EQ;
546 			else {
547 				lexungetc(c1);
548 				c = ASG;
549 			}
550 			break;
551 
552 		case '\n':
			/*
			 * A newline is either skipped or turned into ';',
			 * depending on the previous token.
			 */
553 			switch (lexlast) {
554 			case ')':
555 				if (catterm || inprint) {
556 					c = ';';
557 					break;
558 				}
				/* FALLTHROUGH */
559 			case AND:
560 			case OR:
561 			case COMMA:
562 			case '{':
563 			case ELSE:
564 			case ';':
565 			case DO:
566 				continue;
567 
568 			case '}':
569 				if (nbrace != 0)
570 					continue;
				/* FALLTHROUGH */
571 
572 			default:
573 				c = ';';
574 				break;
575 			}
576 			break;
577 
578 		case ELSE:
			/* supply a ';' ahead of ELSE if there was none */
579 			if (lexlast != ';') {
580 				savetoken = ELSE;
581 				c = ';';
582 			}
583 			break;
584 
585 		case '(':
586 			++nparen;
587 			break;
588 
589 		case ')':
590 			if (--nparen < 0)
591 				awkerr(unbal, "()");
592 			break;
593 
594 		case '{':
595 			nbrace++;
596 			break;
597 
598 		case '}':
599 			if (--nbrace < 0) {
600 				char brk[3];
601 
602 				brk[0] = '{';
603 				brk[1] = '}';
604 				brk[2] = '\0';
605 				awkerr(unbal, brk);
606 			}
			/* supply a ';' ahead of '}' if there was none */
607 			if (lexlast != ';') {
608 				savetoken = c;
609 				c = ';';
610 			}
611 			break;
612 
613 		case '[':
614 			++nbracket;
615 			break;
616 
617 		case ']':
618 			if (--nbracket < 0) {
619 				char brk[3];
620 
621 				brk[0] = '[';
622 				brk[1] = ']';
623 				brk[2] = '\0';
624 				awkerr(unbal, brk);
625 			}
626 			break;
627 
628 		case '\\':
			/* backslash-newline is a line continuation */
629 			if ((c1 = lexgetc()) == '\n')
630 				continue;
631 			lexungetc(c1);
632 			break;
633 
634 		case ',':
635 			c = COMMA;
636 			break;
637 
638 		case '?':
639 			c = QUEST;
640 			break;
641 
642 		case ':':
643 			c = COLON;
644 			break;
645 
646 		default:
647 			if (!iswprint(c))
648 				awkerr(
649 				    gettext("invalid character \"%s\""),
650 				    toprint(c));
651 			break;
652 		}
653 		break;
654 	}
655 
	/*
	 * Concatenation bookkeeping: decide from the token and `catterm'
	 * whether CONCAT has to be returned ahead of it.
	 */
656 	switch (c) {
657 	case ']':
658 		++catterm;
659 		break;
660 
661 	case VAR:
662 		if (catterm) {
663 			savetoken = c;
664 			c = CONCAT;
665 			catterm = 0;
666 		} else if (!isfuncdef) {
			/* a name followed by '(' does not set catterm */
667 			if ((c1 = lexgetc()) != '(')
668 				++catterm;
669 			lexungetc(c1);
670 		}
671 		isfuncdef = 0;
672 		break;
673 
674 	case PARM:
675 	case CONSTANT:
676 		if (catterm) {
677 			savetoken = c;
678 			c = CONCAT;
679 			catterm = 0;
680 		} else {
681 			if (lexlast == '$')
682 				wasfield = 2;
683 			++catterm;
684 		}
685 		break;
686 
687 	case INC:
688 	case DEC:
689 		if (!catterm || lexlast != CONSTANT || wasfield)
690 			break;
		/* FALLTHROUGH */
691 
692 	case UFUNC:
693 	case FUNC:
694 	case GETLINE:
695 	case '!':
696 	case '$':
697 	case '(':
698 		if (catterm) {
699 			savetoken = c;
700 			c = CONCAT;
701 			catterm = 0;
702 		}
703 		break;
704 
705 	/* { */ case '}':
706 		if (nbrace == 0)
707 			savetoken = ';';
		/* FALLTHROUGH */
708 	case ';':
709 		inprint = 0;
		/* FALLTHROUGH */
710 	default:
711 		if (c == DEFFUNC)
712 			isfuncdef = 1;
713 		catterm = 0;
714 	}
715 	lexlast = c;
716 	if (wasfield)
717 		wasfield--;
718 	/*
719 	 * Map character constants to symbolic names.
720 	 */
721 	for (i = 0; ctosym[i].c != 0; i++)
722 		if (c == ctosym[i].c) {
723 			c = ctosym[i].sym;
724 			break;
725 		}
726 	return ((int)c);
727 }
728 
729 /*
 * Read a number for the lexical analyzer.
 * Input is the first character of the number.
 * Return value is the lexical type: CONSTANT, with yylval.node set to
 * an integer or real node, or DOT when the text is a lone '.'.
 *
 * The characters of the number are collected in the global linebuf[].
 */
734 static int
735 lexnumber(wint_t c)
736 {
737 	wchar_t *cp;
738 	int dotfound = 0;
739 	int efound = 0;
740 	INT number;
741 
742 	cp = linebuf;
743 	do {
744 		if (iswdigit(c))
745 			;
746 		else if (c == '.') {
			/* a second '.' ends the number */
747 			if (dotfound++)
748 				break;
749 		} else if (c == 'e' || c == 'E') {
			/*
			 * Exponent marker.  If a sign follows, 'e' is stored
			 * here and the sign by the store below; otherwise
			 * the following character is pushed back and the
			 * marker itself is stored as lower-case 'e'.
			 */
750 			if ((c = lexgetc()) != '-' && c != '+') {
751 				lexungetc(c);
752 				c = 'e';
753 			} else
754 				*cp++ = 'e';
			/* a second exponent marker ends the number */
755 			if (efound++)
756 				break;
757 		} else
758 			break;
759 		*cp++ = c;
760 	} while ((c = lexgetc()) != WEOF);
761 	*cp = '\0';
	/*
	 * Only a '.' was collected.
	 * NOTE(review): the character that ended the scan is not pushed
	 * back on this path -- confirm that this is intended.
	 */
762 	if (dotfound && cp == linebuf+1)
763 		return (DOT);
764 	lexungetc(c);
765 	errno = 0;
	/*
	 * Build an integer node when there was no '.' or exponent and
	 * wcstol() did not report ERANGE; otherwise convert with wcstod().
	 */
766 	if (!dotfound && !efound &&
767 	    ((number = wcstol(linebuf, (wchar_t **)0, 10)), errno != ERANGE))
768 		yylval.node = intnode(number);
769 	else
770 		yylval.node = realnode((REAL)wcstod(linebuf, (wchar_t **)0));
771 	return (CONSTANT);
772 }
773 
774 /*
 * Read an identifier.
 * Input is first character of identifier.
 *
 * The name is collected in linebuf[] and looked up with vlook(); the
 * resulting node is left in yylval.node.  Returns the keyword's token
 * type for a KEYWORD node, VAR for ARRAY, VAR and PARM nodes, and the
 * node's own type for UFUNC, FUNC and GETLINE nodes.  While `funparm'
 * is set, ARRAY, VAR and UFUNC names instead get a fresh PARM node
 * that is added to the symbol table, and VAR is returned.
 */
779 static int
780 lexid(wint_t c)
781 {
782 	wchar_t *cp;
783 	size_t i;
784 	NODE *np;
785 
786 	cp = linebuf;
787 	do {
788 		*cp++ = c;
789 		c = lexgetc();
790 	} while (iswalpha(c) || iswdigit(c) || c == '_');
791 	*cp = '\0';
	/* the character that ended the identifier belongs to the next token */
792 	lexungetc(c);
793 	yylval.node = np = vlook(linebuf);
794 
795 	switch (np->n_type) {
796 	case KEYWORD:
797 		switch (np->n_keywtype) {
798 		case PRINT:
799 		case PRINTF:
			/* yylex consults inprint for '>' and '|' */
800 			++inprint;
			/* FALLTHROUGH */
801 		default:
802 			return ((int)np->n_keywtype);
803 		}
804 		/* NOTREACHED */
805 
806 	case ARRAY:
807 	case VAR:
808 		/*
809 		 * If reading the argument list, create a dummy node
810 		 * for the duration of that function. These variables
811 		 * can be removed from the symbol table at function end
812 		 * but they must still exist because the execution tree
813 		 * knows about them.
814 		 */
815 		if (funparm) {
			/* also entered by goto from the UFUNC case below */
816 do_funparm:
817 			np = emptynode(PARM, i = (cp-linebuf));
818 			np->n_flags = FSTRING;
819 			np->n_string = _null;
820 			np->n_strlen = 0;
			/* copy the name including its terminating null */
821 			(void) memcpy(np->n_name, linebuf,
822 			    (i+1) * sizeof (wchar_t));
823 			addsymtab(np);
824 			yylval.node = np;
825 		} else if (np == varNF || (np == varFS &&
826 		    (!doing_begin || begin_getline))) {
827 			/*
828 			 * If the user program references NF or sets
829 			 * FS either outside of a begin block or
830 			 * in a begin block after a getline then the
831 			 * input line will be split immediately upon read
832 			 * rather than when a field is first referenced.
833 			 */
834 			needsplit = 1;
835 		} else if (np == varENVIRON)
836 			needenviron = 1;
		/* FALLTHROUGH */
837 	case PARM:
838 		return (VAR);
839 
840 	case UFUNC:
841 		/*
842 		 * It is ok to redefine functions as parameters
843 		 */
844 		if (funparm) goto do_funparm;
		/* FALLTHROUGH */
845 	case FUNC:
846 	case GETLINE:
847 		/*
848 		 * When a getline is encountered, clear the 'doing_begin' flag.
849 		 * This will force the 'needsplit' flag to be set, even inside
850 		 * a begin block, if FS is altered. (See VAR case above)
851 		 */
		/*
		 * NOTE(review): the code sets begin_getline rather than
		 * clearing doing_begin, and does so for UFUNC and FUNC
		 * nodes as well as GETLINE.
		 */
852 		if (doing_begin)
853 			begin_getline = 1;
854 		return (np->n_type);
855 	}
856 	/* NOTREACHED */
857 	return (0);
858 }
859 
860 /*
861  * Read a string for the lexical analyzer.
862  * `endc' terminates the string.
863  */
864 static int
865 lexstring(wint_t endc)
866 {
867 	size_t length = lexescape(endc, 0, 0);
868 
869 	yylval.node = stringnode(linebuf, FALLOC, length);
870 	return (CONSTANT);
871 }
872 
873 /*
874  * Read a regular expression.
875  */
876 static int
877 lexregexp(wint_t endc)
878 {
879 	(void) lexescape(endc, 1, 0);
880 	yylval.node = renode(linebuf);
881 	return (URE);
882 }
883 
884 /*
885  * Process a string, converting the escape characters as required by
886  * 1003.2. The processed string ends up in the global linebuf[]. This
887  * routine also changes the value of 'progfd' - the program file
888  * descriptor, so it should be used with some care. It is presently used to
889  * process -v (awk1.c) and var=str type arguments (awk2.c, nextrecord()).
890  */
891 void
892 strescape(wchar_t *str)
893 {
894 	progptr = str;
895 	proglen = wcslen(str) + 1;	/* Include \0 */
896 	(void) lexescape('\0', 0, 1);
897 	progptr = NULL;
898 }
899 
900 /*
901  * Read a string or regular expression, terminated by ``endc'',
902  * for lexical analyzer, processing escape sequences.
903  * Return string length.
904  */
905 static size_t
906 lexescape(wint_t endc, int regx, int cmd_line_operand)
907 {
908 	static char nlre[256];
909 	static char nlstr[256];
910 	static char eofre[256];
911 	static char eofstr[256];
912 	int first_time = 1;
913 	wint_t c;
914 	wchar_t *cp;
915 	int n, max;
916 
917 	if (first_time == 1) {
918 		(void) strcpy(nlre, gettext("Newline in regular expression\n"));
919 		(void) strcpy(nlstr, gettext("Newline in string\n"));
920 		(void) strcpy(eofre, gettext("EOF in regular expression\n"));
921 		(void) strcpy(eofstr, gettext("EOF in string\n"));
922 		first_time = 0;
923 	}
924 
925 	cp = linebuf;
926 	while ((c = lexgetc()) != endc) {
927 		if (c == '\n')
928 			awkerr(regx ? nlre : nlstr);
929 		if (c == '\\') {
930 			switch (c = lexgetc(), c) {
931 			case '\\':
932 				if (regx)
933 					*cp++ = '\\';
934 				break;
935 
936 			case '/':
937 				c = '/';
938 				break;
939 
940 			case 'n':
941 				c = '\n';
942 				break;
943 
944 			case 'b':
945 				c = '\b';
946 				break;
947 
948 			case 't':
949 				c = '\t';
950 				break;
951 
952 			case 'r':
953 				c = '\r';
954 				break;
955 
956 			case 'f':
957 				c = '\f';
958 				break;
959 
960 			case 'v':
961 				c = '\v';
962 				break;
963 
964 			case 'a':
965 				c = (char)0x07;
966 				break;
967 
968 			case 'x':
969 				n = 0;
970 				while (iswxdigit(c = lexgetc())) {
971 					if (iswdigit(c))
972 						c -= '0';
973 					else if (iswupper(c))
974 						c -= 'A'-10;
975 					else
976 						c -= 'a'-10;
977 					n = (n<<4) + c;
978 				}
979 				lexungetc(c);
980 				c = n;
981 				break;
982 
983 			case '0':
984 			case '1':
985 			case '2':
986 			case '3':
987 			case '4':
988 			case '5':
989 			case '6':
990 			case '7':
991 #if 0
992 /*
993  * Posix.2 draft 10 disallows the use of back-referencing - it explicitly
994  * requires processing of the octal escapes both in strings and
995  * regular expressions. The following code is disabled instead of
996  * removed as back-referencing may be reintroduced in a future draft
997  * of the standard.
998  */
999 				/*
1000 				 * For regular expressions, we disallow
1001 				 * \ooo to mean octal character, in favour
1002 				 * of back referencing.
1003 				 */
1004 				if (regx) {
1005 					*cp++ = '\\';
1006 					break;
1007 				}
1008 #endif
1009 				max = 3;
1010 				n = 0;
1011 				do {
1012 					n = (n<<3) + c-'0';
1013 					if ((c = lexgetc()) > '7' || c < '0')
1014 						break;
1015 				} while (--max);
1016 				lexungetc(c);
1017 				/*
1018 				 * an octal escape sequence must have at least
1019 				 * 2 digits after the backslash, otherwise
1020 				 * it gets passed straight thru for possible
1021 				 * use in backreferencing.
1022 				 */
1023 				if (max == 3) {
1024 					*cp++ = '\\';
1025 					n += '0';
1026 				}
1027 				c = n;
1028 				break;
1029 
1030 			case '\n':
1031 				continue;
1032 
1033 			default:
1034 				if (c != endc || cmd_line_operand) {
1035 					*cp++ = '\\';
1036 					if (c == endc)
1037 						lexungetc(c);
1038 				}
1039 			}
1040 		}
1041 		if (c == WEOF)
1042 			awkerr(regx ? eofre : eofstr);
1043 		*cp++ = c;
1044 	}
1045 	*cp = '\0';
1046 	return (cp - linebuf);
1047 }
1048 
1049 /*
1050  * Build a regular expression NODE.
1051  * Argument is the string holding the expression.
1052  */
1053 NODE *
1054 renode(wchar_t *s)
1055 {
1056 	NODE *np;
1057 	int n;
1058 
1059 	np = emptynode(RE, 0);
1060 	np->n_left = np->n_right = NNULL;
1061 	np->n_regexp = (REGEXP)emalloc(sizeof (regex_t));
1062 	if ((n = REGWCOMP(np->n_regexp, s, REG_EXTENDED)) != REG_OK) {
1063 		int m;
1064 		char *p;
1065 
1066 		m = regerror(n, np->n_regexp, NULL, 0);
1067 		p = (char *)emalloc(m);
1068 		regerror(n, np->n_regexp, p, m);
1069 		awkerr("/%S/: %s", s, p);
1070 	}
1071 	return (np);
1072 }
1073 /*
1074  * Get a character for the lexical analyser routine.
1075  */
1076 static wint_t
1077 lexgetc()
1078 {
1079 	wint_t c;
1080 	static char **files = &progfiles[0];
1081 
1082 	if (progfp != FNULL && (c = fgetwc(progfp)) != WEOF)
1083 		;
1084 	else {
1085 		if (progptr != NULL) {
1086 			if (proglen-- <= 0)
1087 				c = WEOF;
1088 			else
1089 				c = *progptr++;
1090 		} else {
1091 			if (progfp != FNULL)
1092 				if (progfp != stdin)
1093 					(void) fclose(progfp);
1094 				else
1095 					clearerr(progfp);
1096 				progfp = FNULL;
1097 			if (files < progfilep) {
1098 				filename = *files++;
1099 				lineno = 1;
1100 				if (filename[0] == '-' && filename[1] == '\0')
1101 					progfp = stdin;
1102 				else if ((progfp = fopen(filename, r))
1103 				    == FNULL) {
1104 					(void) fprintf(stderr,
1105 				gettext("script file \"%s\""), filename);
1106 					exit(1);
1107 				}
1108 				c = fgetwc(progfp);
1109 			}
1110 		}
1111 	}
1112 	if (c == '\n')
1113 		++lineno;
1114 	if (conptr >= &context[NCONTEXT])
1115 		conptr = &context[0];
1116 	if (c != WEOF)
1117 		*conptr++ = c;
1118 	return (c);
1119 }
1120 
1121 /*
1122  * Return a character for lexical analyser.
1123  * Only one returned character is (not enforced) legitimite.
1124  */
1125 static void
1126 lexungetc(wint_t c)
1127 {
1128 	if (c == '\n')
1129 		--lineno;
1130 	if (c != WEOF) {
1131 		if (conptr == &context[0])
1132 			conptr = &context[NCONTEXT];
1133 		*--conptr = '\0';
1134 	}
1135 	if (progfp != FNULL) {
1136 		(void) ungetwc(c, progfp);
1137 		return;
1138 	}
1139 	if (c == WEOF)
1140 		return;
1141 	*--progptr = c;
1142 	proglen++;
1143 }
1144 
1145 /*
1146  * Syntax errors during parsing.
1147  */
1148 void
1149 yyerror(char *s, ...)
1150 {
1151 	if (lexlast == FUNC || lexlast == GETLINE || lexlast == KEYWORD)
1152 		if (lexlast == KEYWORD)
1153 			awkerr(gettext("inadmissible use of reserved keyword"));
1154 		else
1155 			awkerr(gettext("attempt to redefine builtin function"));
1156 	awkerr(s);
1157 }
1158 
/*
 * Error routine for all awk errors.
 * Takes a printf-style format and arguments and hands them to
 * awkierr() with no errno text requested.
 */
/* ARGSUSED */
void
awkerr(char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	awkierr(0, fmt, ap);
	va_end(ap);
}
1172 
/*
 * Error routine like "awkerr", except that the message is followed
 * by an indication specific to the current value of errno.
 */
/* ARGSUSED */
void
awkperr(char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	awkierr(1, fmt, ap);
	va_end(ap);
}
1187 
1188 /*
1189  * Common internal routine for awkerr, awkperr
1190  */
1191 static void
1192 awkierr(int perr, char *fmt, va_list ap)
1193 {
1194 	static char sep1[] = "\n>>>\t";
1195 	static char sep2[] = "\t<<<";
1196 	int saveerr = errno;
1197 
1198 	(void) fprintf(stderr, "%s: ", _cmdname);
1199 	if (running) {
1200 		(void) fprintf(stderr, gettext("line %u ("),
1201 		    curnode == NNULL ? 0 : curnode->n_lineno);
1202 		if (phase == 0)
1203 			(void) fprintf(stderr, "NR=%lld): ",
1204 			    (INT)exprint(varNR));
1205 		else
1206 			(void) fprintf(stderr, "%s): ",
1207 			    phase == BEGIN ? s_BEGIN : s_END);
1208 	} else if (lineno != 0) {
1209 		(void) fprintf(stderr, gettext("file \"%s\": "), filename);
1210 		(void) fprintf(stderr, gettext("line %u: "), lineno);
1211 	}
1212 	(void) vfprintf(stderr, gettext(fmt), ap);
1213 	if (perr == 1)
1214 		(void) fprintf(stderr, ": %s", strerror(saveerr));
1215 	if (perr != 2 && !running) {
1216 		wchar_t *cp;
1217 		int n;
1218 		int c;
1219 
1220 		(void) fprintf(stderr, gettext("  Context is:%s"), sep1);
1221 		cp = conptr;
1222 		n = NCONTEXT;
1223 		do {
1224 			if (cp >= &context[NCONTEXT])
1225 				cp = &context[0];
1226 			if ((c = *cp++) != '\0')
1227 				(void) fputs(c == '\n' ? sep1 : toprint(c),
1228 				    stderr);
1229 		} while (--n != 0);
1230 		(void) fputs(sep2, stderr);
1231 	}
1232 	(void) fprintf(stderr, "\n");
1233 	exit(1);
1234 }
1235 
1236 wchar_t *
1237 emalloc(unsigned n)
1238 {
1239 	wchar_t *cp;
1240 
1241 	if ((cp = malloc(n)) == NULL)
1242 		awkerr(nomem);
1243 	return (cp);
1244 }
1245 
1246 wchar_t *
1247 erealloc(wchar_t *p, unsigned n)
1248 {
1249 	wchar_t *cp;
1250 
1251 	if ((cp = realloc(p, n)) == NULL)
1252 		awkerr(nomem);
1253 	return (cp);
1254 }
1255 
1256 
/*
 * usage message for awk
 * Writes the synopsis to stderr and returns 2, which main() uses as
 * its return value.
 */
static int
usage()
{
	const char *synopsis;

	synopsis = gettext(
"Usage:	awk [-F ERE] [-v var=val] 'program' [var=val ...] [file ...]\n"
"	awk [-F ERE] -f progfile ... [-v var=val] [var=val ...] [file ...]\n");
	(void) fprintf(stderr, synopsis);
	return (2);
}
1268 
1269 
/*
 * Convert a multibyte string to a wide string held in a single
 * function-owned buffer.  The previous result is freed on every call,
 * so the returned pointer is valid only until the next call.
 * Returns whatever mbstowcsdup() returns.
 */
static wchar_t *
mbconvert(char *str)
{
	static wchar_t *cached = NULL;

	free(cached);
	cached = mbstowcsdup(str);
	return (cached);
}
1279 
/*
 * Convert a wide string to a multibyte string held in a single
 * function-owned buffer.  The previous result is freed on every call,
 * so the returned pointer is valid only until the next call.
 * Returns whatever wcstombsdup() returns.
 */
char *
mbunconvert(wchar_t *str)
{
	static char *cached = NULL;

	free(cached);
	cached = wcstombsdup(str);
	return (cached);
}
1289 
1290 /*
1291  * Solaris port - following functions are typical MKS functions written
1292  * to work for Solaris.
1293  */
1294 
/*
 * Return a newly allocated wide-character copy of the multibyte
 * string `s'.  The caller owns the result and releases it with free().
 * Returns NULL if memory cannot be allocated or if `s' contains an
 * invalid multibyte sequence; the buffer is released in that case and
 * errno is left as set by the failing call.
 */
wchar_t *
mbstowcsdup(char *s)
{
	size_t n;
	wchar_t *w;

	/* one wide character per byte is always enough */
	n = strlen(s) + 1;
	if ((w = (wchar_t *)malloc(n * sizeof (wchar_t))) == NULL)
		return (NULL);

	if (mbstowcs(w, s, n) == (size_t)-1) {
		int saverr = errno;

		free(w);
		errno = saverr;
		return (NULL);
	}
	return (w);
}
1311 
/*
 * Return a newly allocated multibyte copy of the wide string `w'.
 * The caller owns the result and releases it with free().
 * Returns NULL if memory cannot be allocated or if `w' contains a
 * character that has no multibyte representation; errno is preserved
 * in the latter case.
 */
char *
wcstombsdup(wchar_t *w)
{
	size_t n;
	char *mb;
	char *shrunk;

	/* Fetch memory for worst case string length */
	n = (wcslen(w) + 1) * MB_CUR_MAX;
	if ((mb = (char *)malloc(n)) == NULL) {
		return (NULL);
	}

	/* Convert the string */
	if (wcstombs(mb, w, n) == (size_t)-1) {
		int saverr = errno;

		free(mb);
		errno = saverr;
		return (NULL);
	}

	/*
	 * Shrink the string down.  If that fails the original buffer
	 * is still valid, so return it rather than leak it.
	 */
	if ((shrunk = (char *)realloc(mb, strlen(mb) + 1)) != NULL)
		mb = shrunk;
	return (mb);
}
1340 
/*
 * The upe_ctrls[] table contains the printable 'control-sequences' for the
 * character values 0..31 and 127.  The first entry is for value 127, thus the
 * entry for character value n (0..31) is at index n+1.
 */
static const char *const upe_ctrls[] =
{
	"^?",
	"^@",  "^A",  "^B",  "^C",  "^D",  "^E",  "^F",  "^G",
	"^H",  "^I",  "^J",  "^K",  "^L",  "^M",  "^N",  "^O",
	"^P",  "^Q",  "^R",  "^S",  "^T",  "^U",  "^V",  "^W",
	"^X",  "^Y",  "^Z",  "^[",  "^\\", "^]",  "^^",  "^_"
};


/*
 * Return a printable string corresponding to the given character value.  If
 * the character is printable, simply return it as the string.  If it is in
 * the range specified by table 5-101 in the UPE, return the corresponding
 * string.  Otherwise, return an octal escape sequence for each byte of its
 * multibyte form.  A value with no multibyte form is shown as a backslash
 * followed by its value in hexadecimal.
 *
 * The result points at static storage or at a table entry: it must not be
 * freed and may be overwritten by the next call.
 */
static const char *
toprint(wint_t c)
{
	int n, len;
	unsigned char *ptr;
	static char mbch[MB_LEN_MAX+1];
	static char buf[5 * MB_LEN_MAX + 1];

	if ((n = wctomb(mbch, (wchar_t)c)) == -1) {
		/* Should never happen */
		(void) snprintf(buf, sizeof (buf), "\\%lx", (unsigned long)c);
		return (buf);
	}
	mbch[n] = '\0';
	if (iswprint(c)) {
		return (mbch);
	} else if (c == 127) {
		return (upe_ctrls[0]);
	} else if ((unsigned long)c < 32) {
		/*
		 * Print as in Table 5-101 in the UPE.  The unsigned
		 * comparison keeps a negative value out of the table.
		 */
		return (upe_ctrls[c+1]);
	} else {
		/* Print as an octal escape sequence */
		for (len = 0, ptr = (unsigned char *) mbch; 0 < n; --n, ++ptr)
			len += sprintf(buf+len, "\\%03o", *ptr);
	}
	return (buf);
}
1391 
/*
 * Translate a byte offset into the multibyte form of `astring' into the
 * corresponding wide character offset.
 *
 * Walks `astring' adding up the encoded length of each wide character until
 * at least `off' bytes have been accounted for, and returns the number of
 * wide characters consumed.  A character that cannot be encoded is counted
 * as a single byte.  The walk never goes beyond the terminating null, so an
 * offset past the end of the string yields the string's length.
 */
static int
wcoff(const wchar_t *astring, const int off)
{
	const wchar_t *s = astring;
	int c = 0;
	char mb[MB_LEN_MAX];

	/*
	 * wctomb() reports one byte for the null wide character rather than
	 * zero, so the end of the string has to be tested for explicitly.
	 */
	while (c < off && *s != L'\0') {
		int n = wctomb(mb, *s);

		if (n <= 0)
			n = 1;
		c += n;
		s++;
	}

	return ((int)(s - astring));
}
1411 
/*
 * Wide character front end to regcomp().
 *
 * The pattern is converted to a temporary multibyte string, compiled into
 * `r' with the caller's `uflags', and the temporary is released again.
 * Returns REG_ESPACE if the pattern could not be converted, otherwise
 * whatever regcomp() returned.
 */
int
int_regwcomp(regex_t *r, const wchar_t *pattern, int uflags)
{
	int status;
	char *narrow = wcstombsdup((wchar_t *)pattern);

	if (narrow == NULL)
		return (REG_ESPACE);

	status = regcomp(r, narrow, uflags);
	free(narrow);

	return (status);
}
1427 
1428 int
1429 int_regwexec(const regex_t *r,	/* compiled RE */
1430 	const wchar_t *astring,	/* subject string */
1431 	size_t nsub,		/* number of subexpressions */
1432 	int_regwmatch_t *sub,	/* subexpression pointers */
1433 	int flags)
1434 {
1435 	char *mbs;
1436 	regmatch_t *mbsub = NULL;
1437 	int i;
1438 
1439 	if ((mbs = wcstombsdup((wchar_t *)astring)) == NULL)
1440 		return (REG_ESPACE);
1441 
1442 	if (nsub > 0 && sub) {
1443 		if ((mbsub = malloc(nsub * sizeof (regmatch_t))) == NULL)
1444 			return (REG_ESPACE);
1445 	}
1446 
1447 	i = regexec(r, mbs, nsub, mbsub, flags);
1448 
1449 	/* Now, adjust the pointers/counts in sub */
1450 	if (i == REG_OK && nsub > 0 && mbsub) {
1451 		int j, k;
1452 
1453 		for (j = 0; j < nsub; j++) {
1454 			regmatch_t *ms = &mbsub[j];
1455 			int_regwmatch_t *ws = &sub[j];
1456 
1457 			if ((k = ms->rm_so) >= 0) {
1458 				ws->rm_so = wcoff(astring, k);
1459 				ws->rm_sp = astring + ws->rm_so;
1460 			}
1461 			if ((k = ms->rm_eo) >= 0) {
1462 				ws->rm_eo = wcoff(astring, k);
1463 				ws->rm_ep = astring + ws->rm_eo;
1464 			}
1465 		}
1466 	}
1467 
1468 	free(mbs);
1469 	if (mbsub)
1470 		free(mbsub);
1471 	return (i);
1472 }
1473 
1474 int
1475 int_regwdosuba(regex_t *rp,	/* compiled RE: Pattern */
1476 	const wchar_t *rpl,		/* replacement string: /rpl/ */
1477 	const wchar_t *src,		/* source string */
1478 	wchar_t **dstp,			/* destination string */
1479 	int len,			/* destination length */
1480 	int *globp)	/* IN: occurence, 0 for all; OUT: substitutions */
1481 {
1482 	wchar_t *dst, *odst;
1483 	const wchar_t *ip, *xp;
1484 	wchar_t *op;
1485 	int i;
1486 	wchar_t c;
1487 	int glob, iglob = *globp, oglob = 0;
1488 #define	NSUB	10
1489 	int_regwmatch_t rm[NSUB], *rmp;
1490 	int flags;
1491 	wchar_t *end;
1492 	int regerr;
1493 
1494 /* handle overflow of dst. we need "i" more bytes */
1495 #ifdef OVERFLOW
1496 #undef OVERFLOW
1497 #define	OVERFLOW(i) if (1) { \
1498 		int pos = op - dst; \
1499 		dst = (wchar_t *)realloc(odst = dst, \
1500 			(len += len + i) * sizeof (wchar_t)); \
1501 		if (dst == NULL) \
1502 			goto nospace; \
1503 		op = dst + pos; \
1504 		end = dst + len; \
1505 	} else
1506 #endif
1507 
1508 	*dstp = dst = (wchar_t *)malloc(len * sizeof (wchar_t));
1509 	if (dst == NULL)
1510 		return (REG_ESPACE);
1511 
1512 	if (rp == NULL || rpl == NULL || src == NULL || dst ==  NULL)
1513 		return (REG_EFATAL);
1514 
1515 	glob = 0;	/* match count */
1516 	ip = src;	/* source position */
1517 	op = dst;	/* destination position */
1518 	end = dst + len;
1519 
1520 	flags = 0;
1521 	while ((regerr = int_regwexec(rp, ip, NSUB, rm, flags)) == REG_OK) {
1522 		/* Copy text preceding match */
1523 		if (op + (i = rm[0].rm_sp - ip) >= end)
1524 			OVERFLOW(i);
1525 		while (i--)
1526 			*op++ = *ip++;
1527 
1528 		if (iglob == 0 || ++glob == iglob) {
1529 			oglob++;
1530 			xp = rpl;		/* do substitute */
1531 		} else
1532 			xp = L"&";		/* preserve text */
1533 
1534 		/* Perform replacement of matched substing */
1535 		while ((c = *xp++) != '\0') {
1536 			rmp = NULL;
1537 			if (c == '&')
1538 				rmp = &rm[0];
1539 			else if (c == '\\') {
1540 				if ('0' <= *xp && *xp <= '9')
1541 					rmp = &rm[*xp++ - '0'];
1542 				else if (*xp != '\0')
1543 					c = *xp++;
1544 			}
1545 
1546 			if (rmp ==  NULL) {	/* Ordinary character. */
1547 				*op++ = c;
1548 				if (op >= end)
1549 					OVERFLOW(1);
1550 			} else if (rmp->rm_sp != NULL && rmp->rm_ep != NULL) {
1551 				ip = rmp->rm_sp;
1552 				if (op + (i = rmp->rm_ep - rmp->rm_sp) >= end)
1553 					OVERFLOW(i);
1554 				while (i--)
1555 					*op++ = *ip++;
1556 			}
1557 		}
1558 
1559 		ip = rm[0].rm_ep;
1560 		if (*ip == '\0')	/* If at end break */
1561 			break;
1562 		else if (rm[0].rm_sp == rm[0].rm_ep) {
1563 			/* If empty match copy next char */
1564 			*op++ = *ip++;
1565 			if (op >= end)
1566 				OVERFLOW(1);
1567 		}
1568 		flags = REG_NOTBOL;
1569 	}
1570 
1571 	if (regerr != REG_OK && regerr != REG_NOMATCH)
1572 		return (regerr);
1573 
1574 	/* Copy rest of text */
1575 	if (op + (i =  wcslen(ip)) >= end)
1576 		OVERFLOW(i);
1577 	while (i--)
1578 		*op++ = *ip++;
1579 	*op++ = '\0';
1580 
1581 	if ((*dstp = dst = (wchar_t *)realloc(odst = dst,
1582 	    sizeof (wchar_t) * (size_t)(op - dst))) == NULL) {
1583 nospace:
1584 		free(odst);
1585 		return (REG_ESPACE);
1586 	}
1587 
1588 	*globp = oglob;
1589 
1590 	return ((oglob == 0) ? REG_NOMATCH : REG_OK);
1591 }
1592