xref: /openbsd/usr.bin/awk/run.c (revision cd993586)
1 /*	$OpenBSD: run.c,v 1.88 2024/06/04 14:40:46 millert Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 #define DEBUG
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <errno.h>
30 #include <wctype.h>
31 #include <fcntl.h>
32 #include <setjmp.h>
33 #include <limits.h>
34 #include <math.h>
35 #include <string.h>
36 #include <stdlib.h>
37 #include <time.h>
38 #include <sys/types.h>
39 #include <sys/wait.h>
40 #include "awk.h"
41 #include "awkgram.tab.h"
42 
43 
44 static void stdinit(void);
45 static void flush_all(void);
46 static char *wide_char_to_byte_str(int rune, size_t *outlen);
47 
/*
 * tempfree(x) hands x to tfree() only when istemp(x) is true; any other
 * cell is left alone.  The "#if 1" branch (the macro) is the definition
 * that is compiled.  The macro names its argument twice, so callers
 * should pass an expression without side effects.
 */
#if 1
#define tempfree(x)	do { if (istemp(x)) tfree(x); } while (/*CONSTCOND*/0)
#else
/*
 * Inactive function form: same release logic, preceded by a warning when
 * an OCELL cell carries a csub outside the CUNK..CFREE range.
 */
void tempfree(Cell *p) {
	if (p->ctype == OCELL && (p->csub < CUNK || p->csub > CFREE)) {
		WARNING("bad csub %d in Cell %d %s",
			p->csub, p->ctype, p->sval);
	}
	if (istemp(p))
		tfree(p);
}
#endif
60 
61 /* do we really need these? */
62 /* #ifdef _NFILE */
63 /* #ifndef FOPEN_MAX */
64 /* #define FOPEN_MAX _NFILE */
65 /* #endif */
66 /* #endif */
67 /*  */
68 /* #ifndef	FOPEN_MAX */
69 /* #define	FOPEN_MAX	40 */	/* max number of open files */
70 /* #endif */
71 /*  */
72 /* #ifndef RAND_MAX */
73 /* #define RAND_MAX	32767 */	/* all that ansi guarantees */
74 /* #endif */
75 
jmp_buf env;	/* armed by setjmp() in program(); jump() longjmp()s to it for exit */
extern	int	pairstack[];
extern	Awkfloat	srand_seed;

Node	*winner = NULL;	/* root of parse tree */
Cell	*tmps;		/* free temporary cells for execution (list head used by gettemp/tfree) */

/*
 * Statically allocated result cells and the public pointers to them.
 * True/False are returned by the boolean and relational operators; the
 * j* cells are returned by jump() to signal break, continue, next,
 * nextfile and return (jexit is only defined here; jump() leaves via
 * longjmp for exit).
 */
static Cell	truecell	={ OBOOL, BTRUE, 0, 0, 1.0, NUM, NULL, NULL };
Cell	*True	= &truecell;
static Cell	falsecell	={ OBOOL, BFALSE, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*False	= &falsecell;
static Cell	breakcell	={ OJUMP, JBREAK, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jbreak	= &breakcell;
static Cell	contcell	={ OJUMP, JCONT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jcont	= &contcell;
static Cell	nextcell	={ OJUMP, JNEXT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jnext	= &nextcell;
static Cell	nextfilecell	={ OJUMP, JNEXTFILE, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jnextfile	= &nextfilecell;
static Cell	exitcell	={ OJUMP, JEXIT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jexit	= &exitcell;
static Cell	retcell		={ OJUMP, JRET, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jret	= &retcell;
/* template copied over every cell that gettemp() hands out */
static Cell	tempcell	={ OCELL, CTEMP, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };

Node	*curnode = NULL;	/* the node being executed, for debugging */
102 
103 /* buffer memory management */
int adjbuf(char **pbuf, int *psiz, int minlen, int quantum, char **pbptr,
	const char *whatrtn)
/* Grow a managed buffer so that it can hold at least minlen bytes.
 *
 * pbuf:    address of pointer to buffer being managed
 * psiz:    address of buffer size variable
 * minlen:  minimum length of buffer needed
 * quantum: buffer size quantum (0 means no rounding)
 * pbptr:   address of movable pointer into buffer, or 0 if none
 * whatrtn: name of the calling routine if failure should cause fatal error
 *
 * On growth, *pbuf and *psiz are updated and *pbptr (when given) is
 * re-based so that it keeps its offset inside the new buffer.
 * A request that cannot be rounded up without overflowing int is
 * treated exactly like an allocation failure.
 *
 * return   0 for realloc failure, !=0 for success
 */
{
	char *tbuf;
	int rminlen, boff;

	if (minlen <= *psiz)
		return 1;	/* current buffer is already large enough */

	rminlen = quantum ? minlen % quantum : 0;
	boff = pbptr ? *pbptr - *pbuf : 0;
	/* round up to next multiple of quantum */
	if (rminlen) {
		int pad = quantum - rminlen;

		/* signed overflow is undefined: refuse instead of wrapping */
		if (minlen > 0 && pad > INT_MAX - minlen) {
			if (whatrtn)
				FATAL("out of memory in %s", whatrtn);
			return 0;
		}
		minlen += pad;
	}
	tbuf = (char *) realloc(*pbuf, minlen);
	/* whatrtn may legitimately be NULL, so guard the %s argument */
	DPRINTF("adjbuf %s: %d %d (pbuf=%p, tbuf=%p)\n", NN(whatrtn), *psiz, minlen, (void*)*pbuf, (void*)tbuf);
	if (tbuf == NULL) {
		if (whatrtn)
			FATAL("out of memory in %s", whatrtn);
		return 0;
	}
	*pbuf = tbuf;
	*psiz = minlen;
	if (pbptr)
		*pbptr = tbuf + boff;
	return 1;
}
137 
void run(Node *a)	/* top-level driver: the parse tree is executed from here */
{
	/* three fixed steps: stdinit(), execute the whole tree, closeall() */
	stdinit();
	(void) execute(a);	/* the resulting cell is not used */
	closeall();
}
145 
execute(Node * u)146 Cell *execute(Node *u)	/* execute a node of the parse tree */
147 {
148 	Cell *(*proc)(Node **, int);
149 	Cell *x;
150 	Node *a;
151 
152 	if (u == NULL)
153 		return(True);
154 	for (a = u; ; a = a->nnext) {
155 		curnode = a;
156 		if (isvalue(a)) {
157 			x = (Cell *) (a->narg[0]);
158 			if (isfld(x) && !donefld)
159 				fldbld();
160 			else if (isrec(x) && !donerec)
161 				recbld();
162 			return(x);
163 		}
164 		if (notlegal(a->nobj))	/* probably a Cell* but too risky to print */
165 			FATAL("illegal statement");
166 		proc = proctab[a->nobj-FIRSTTOKEN];
167 		x = (*proc)(a->narg, a->nobj);
168 		if (isfld(x) && !donefld)
169 			fldbld();
170 		else if (isrec(x) && !donerec)
171 			recbld();
172 		if (isexpr(a))
173 			return(x);
174 		if (isjump(x))
175 			return(x);
176 		if (a->nnext == NULL)
177 			return(x);
178 		tempfree(x);
179 	}
180 }
181 
182 
Cell *program(Node **a, int n)	/* execute an awk program */
{				/* a[0] = BEGIN, a[1] = body, a[2] = END */
	Cell *x;

	/*
	 * First landing pad.  jump() implements "exit" with longjmp(env, 1);
	 * while this setjmp is armed (BEGIN and the record loop) that makes
	 * setjmp return nonzero and control continues at the END stage.
	 */
	if (setjmp(env) != 0)
		goto ex;
	if (a[0]) {		/* BEGIN */
		x = execute(a[0]);
		if (isexit(x))
			return(True);
		if (isjump(x))
			FATAL("illegal break, continue, next or nextfile from BEGIN");
		tempfree(x);
	}
	/* records are read only when there is a body or an END to run */
	if (a[1] || a[2])
		while (getrec(&record, &recsize, true) > 0) {
			x = execute(a[1]);
			if (isexit(x))
				break;
			tempfree(x);
		}
  ex:
	/* re-arm env so that a longjmp taken while END runs skips the rest of END */
	if (setjmp(env) != 0)	/* handles exit within END */
		goto ex1;
	if (a[2]) {		/* END */
		x = execute(a[2]);
		if (isbreak(x) || isnext(x) || iscont(x))
			FATAL("illegal break, continue, next or nextfile from END");
		tempfree(x);
	}
  ex1:
	return(True);
}
216 
/*
 * Call-stack bookkeeping for awk function calls.  call() advances frp to a
 * fresh Frame before running a function body and steps it back afterwards;
 * arg() reads the current frame's arguments and jump() stores the value of
 * "return" in its retval cell.
 */
struct Frame {	/* stack frame for awk function calls */
	int nargs;	/* number of arguments in this call (call() stores the count from the definition) */
	Cell *fcncell;	/* pointer to Cell for function */
	Cell **args;	/* pointer to array of arguments after execute */
	Cell *retval;	/* return value */
};

#define	NARGS	50	/* max args in a call */

struct Frame *frame = NULL;	/* base of stack frames; dynamically allocated */
int	nframe = 0;		/* number of frames allocated */
struct Frame *frp = NULL;	/* frame pointer. bottom level unused */
229 
/*
 * Invoke a user-defined function.  a[0] evaluates to the function's Cell and
 * a[1] is the list of actual-argument expressions.  Arguments are evaluated
 * into args[], a new Frame is pushed, the body is executed, the argument
 * cells are released, and the frame's retval cell is returned.
 */
Cell *call(Node **a, int n)	/* function call.  very kludgy and fragile */
{
	/* template assigned to the cells created for parameters the caller did not supply */
	static const Cell newcopycell = { OCELL, CCOPY, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };
	int i, ncall, ndef;
	int freed = 0; /* handles potential double freeing when fcn & param share a tempcell */
	Node *x;
	Cell *args[NARGS], *oargs[NARGS];	/* BUG: fixed size arrays */
	Cell *y, *z, *fcn;
	char *s;

	fcn = execute(a[0]);	/* the function itself */
	s = fcn->nval;
	if (!isfcn(fcn))
		FATAL("calling undefined function %s", s);
	/* very first call: allocate the initial block of 100 frames */
	if (frame == NULL) {
		frp = frame = (struct Frame *) calloc(nframe += 100, sizeof(*frame));
		if (frame == NULL)
			FATAL("out of space for stack frames calling %s", s);
	}
	for (ncall = 0, x = a[1]; x != NULL; x = x->nnext)	/* args in call */
		ncall++;
	ndef = (int) fcn->fval;			/* args in defn */
	DPRINTF("calling %s, %d args (%d in defn), frp=%d\n", s, ncall, ndef, (int) (frp-frame));
	if (ncall > ndef)
		WARNING("function %s called with %d args, uses only %d",
			s, ncall, ndef);
	/* this bound keeps every index used below inside args[] and oargs[] */
	if (ncall + ndef > NARGS)
		FATAL("function %s has %d arguments, limit %d", s, ncall+ndef, NARGS);
	for (i = 0, x = a[1]; x != NULL; i++, x = x->nnext) {	/* get call args */
		DPRINTF("evaluate args[%d], frp=%d:\n", i, (int) (frp-frame));
		y = execute(x);
		oargs[i] = y;	/* remember the caller's own cell for the array hand-back below */
		DPRINTF("args[%d]: %s %f <%s>, t=%o\n",
			i, NN(y->nval), y->fval, isarr(y) ? "(array)" : NN(y->sval), y->tval);
		if (isfcn(y))
			FATAL("can't use function %s as argument in %s", y->nval, s);
		if (isarr(y))
			args[i] = y;	/* arrays by ref */
		else
			args[i] = copycell(y);	/* everything else is passed as a copy */
		tempfree(y);
	}
	for ( ; i < ndef; i++) {	/* add null args for ones not provided */
		args[i] = gettemp();
		*args[i] = newcopycell;
	}
	frp++;	/* now ok to up frame */
	if (frp >= frame + nframe) {
		/* frame stack is full: grow it by 100 and re-base frp into the new block */
		int dfp = frp - frame;	/* old index */
		frame = (struct Frame *) reallocarray(frame, (nframe += 100), sizeof(*frame));
		if (frame == NULL)
			FATAL("out of space for stack frames in %s", s);
		frp = frame + dfp;
	}
	frp->fcncell = fcn;
	frp->args = args;
	frp->nargs = ndef;	/* number defined with (excess are locals) */
	frp->retval = gettemp();

	DPRINTF("start exec of %s, frp=%d\n", s, (int) (frp-frame));
	y = execute((Node *)(fcn->sval));	/* execute body */
	DPRINTF("finished exec of %s, frp=%d\n", s, (int) (frp-frame));

	/* release the parameter cells; only the first ndef entries are visited */
	for (i = 0; i < ndef; i++) {
		Cell *t = frp->args[i];
		if (isarr(t)) {
			/* a CCOPY cell that is now an array was turned into one by the body */
			if (t->csub == CCOPY) {
				if (i >= ncall) {
					/* parameter had no caller argument: discard its table */
					freesymtab(t);
					t->csub = CTEMP;
					tempfree(t);
				} else {
					/* hand the new array over to the caller's original cell */
					oargs[i]->tval = t->tval;
					oargs[i]->tval &= ~(STR|NUM|DONTFREE);
					oargs[i]->sval = t->sval;
					tempfree(t);
				}
			}
		} else if (t != y) {	/* kludge to prevent freeing twice */
			t->csub = CTEMP;
			tempfree(t);
		} else if (t == y && t->csub == CCOPY) {
			/* the body's result is this parameter cell: free it here, not again below */
			t->csub = CTEMP;
			tempfree(t);
			freed = 1;
		}
	}
	tempfree(fcn);
	/* NOTE(review): this early return leaves frp un-decremented - confirm that is intended */
	if (isexit(y) || isnext(y))
		return y;
	if (freed == 0) {
		tempfree(y);	/* don't free twice! */
	}
	z = frp->retval;			/* return value */
	DPRINTF("%s returns %g |%s| %o\n", s, getfval(z), getsval(z), z->tval);
	frp--;	/* pop the frame */
	return(z);
}
328 
Cell *copycell(Cell *x)	/* duplicate x into a freshly obtained temp cell */
{
	Cell *dup = gettemp();

	/* the duplicate never carries the constant, field or record flags */
	dup->tval = x->tval & ~(CON|FLD|REC);
	dup->csub = CCOPY;	/* prevents freeing until call is over */
	dup->nval = x->nval;	/* BUG? the name pointer is shared, not duplicated */
	dup->fval = x->fval;
	if (!isstr(x)) {
		/* no string payload of its own */
		dup->tval |= DONTFREE;
		return dup;
	}
	/* string value: take a private copy that may be freed later */
	dup->sval = tostring(x->sval);
	dup->tval &= ~DONTFREE;
	return dup;
}
347 
arg(Node ** a,int n)348 Cell *arg(Node **a, int n)	/* nth argument of a function */
349 {
350 
351 	n = ptoi(a[0]);	/* argument number, counting from 0 */
352 	DPRINTF("arg(%d), frp->nargs=%d\n", n, frp->nargs);
353 	if (n+1 > frp->nargs)
354 		FATAL("argument #%d of function %s was not supplied",
355 			n+1, frp->fcncell->nval);
356 	return frp->args[n];
357 }
358 
jump(Node ** a,int n)359 Cell *jump(Node **a, int n)	/* break, continue, next, nextfile, return */
360 {
361 	Cell *y;
362 
363 	switch (n) {
364 	case EXIT:
365 		if (a[0] != NULL) {
366 			y = execute(a[0]);
367 			errorflag = (int) getfval(y);
368 			tempfree(y);
369 		}
370 		longjmp(env, 1);
371 	case RETURN:
372 		if (a[0] != NULL) {
373 			y = execute(a[0]);
374 			if ((y->tval & (STR|NUM)) == (STR|NUM)) {
375 				setsval(frp->retval, getsval(y));
376 				frp->retval->fval = getfval(y);
377 				frp->retval->tval |= NUM;
378 			}
379 			else if (y->tval & STR)
380 				setsval(frp->retval, getsval(y));
381 			else if (y->tval & NUM)
382 				setfval(frp->retval, getfval(y));
383 			else		/* can't happen */
384 				FATAL("bad type variable %d", y->tval);
385 			tempfree(y);
386 		}
387 		return(jret);
388 	case NEXT:
389 		return(jnext);
390 	case NEXTFILE:
391 		nextfile();
392 		return(jnextfile);
393 	case BREAK:
394 		return(jbreak);
395 	case CONTINUE:
396 		return(jcont);
397 	default:	/* can't happen */
398 		FATAL("illegal jump type %d", n);
399 	}
400 	return 0;	/* not reached */
401 }
402 
awkgetline(Node ** a,int n)403 Cell *awkgetline(Node **a, int n)	/* get next line from specific input */
404 {		/* a[0] is variable, a[1] is operator, a[2] is filename */
405 	Cell *r, *x;
406 	extern Cell **fldtab;
407 	FILE *fp;
408 	char *buf;
409 	int bufsize = recsize;
410 	int mode;
411 	bool newflag;
412 	double result;
413 
414 	if ((buf = (char *) malloc(bufsize)) == NULL)
415 		FATAL("out of memory in getline");
416 
417 	fflush(stdout);	/* in case someone is waiting for a prompt */
418 	r = gettemp();
419 	if (a[1] != NULL) {		/* getline < file */
420 		x = execute(a[2]);		/* filename */
421 		mode = ptoi(a[1]);
422 		if (mode == '|')		/* input pipe */
423 			mode = LE;	/* arbitrary flag */
424 		fp = openfile(mode, getsval(x), &newflag);
425 		tempfree(x);
426 		if (fp == NULL)
427 			n = -1;
428 		else
429 			n = readrec(&buf, &bufsize, fp, newflag);
430 		if (n <= 0) {
431 			;
432 		} else if (a[0] != NULL) {	/* getline var <file */
433 			x = execute(a[0]);
434 			setsval(x, buf);
435 			if (is_number(x->sval, & result)) {
436 				x->fval = result;
437 				x->tval |= NUM;
438 			}
439 			tempfree(x);
440 		} else {			/* getline <file */
441 			setsval(fldtab[0], buf);
442 			if (is_number(fldtab[0]->sval, & result)) {
443 				fldtab[0]->fval = result;
444 				fldtab[0]->tval |= NUM;
445 			}
446 		}
447 	} else {			/* bare getline; use current input */
448 		if (a[0] == NULL)	/* getline */
449 			n = getrec(&record, &recsize, true);
450 		else {			/* getline var */
451 			n = getrec(&buf, &bufsize, false);
452 			if (n > 0) {
453 				x = execute(a[0]);
454 				setsval(x, buf);
455 				if (is_number(x->sval, & result)) {
456 					x->fval = result;
457 					x->tval |= NUM;
458 				}
459 				tempfree(x);
460 			}
461 		}
462 	}
463 	setfval(r, (Awkfloat) n);
464 	free(buf);
465 	return r;
466 }
467 
getnf(Node ** a,int n)468 Cell *getnf(Node **a, int n)	/* get NF */
469 {
470 	if (!donefld)
471 		fldbld();
472 	return (Cell *) a[0];
473 }
474 
475 static char *
makearraystring(Node * p,const char * func)476 makearraystring(Node *p, const char *func)
477 {
478 	char *buf;
479 	int bufsz = recsize;
480 	size_t blen;
481 
482 	if ((buf = (char *) malloc(bufsz)) == NULL) {
483 		FATAL("%s: out of memory", func);
484 	}
485 
486 	blen = 0;
487 	buf[blen] = '\0';
488 
489 	for (; p; p = p->nnext) {
490 		Cell *x = execute(p);	/* expr */
491 		char *s = getsval(x);
492 		size_t seplen = strlen(getsval(subseploc));
493 		size_t nsub = p->nnext ? seplen : 0;
494 		size_t slen = strlen(s);
495 		size_t tlen = blen + slen + nsub;
496 
497 		if (!adjbuf(&buf, &bufsz, tlen + 1, recsize, 0, func)) {
498 			FATAL("%s: out of memory %s[%s...]",
499 			    func, x->nval, buf);
500 		}
501 		memcpy(buf + blen, s, slen);
502 		if (nsub) {
503 			memcpy(buf + blen + slen, *SUBSEP, nsub);
504 		}
505 		buf[tlen] = '\0';
506 		blen = tlen;
507 		tempfree(x);
508 	}
509 	return buf;
510 }
511 
array(Node ** a,int n)512 Cell *array(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
513 {
514 	Cell *x, *z;
515 	char *buf;
516 
517 	x = execute(a[0]);	/* Cell* for symbol table */
518 	buf = makearraystring(a[1], __func__);
519 	if (!isarr(x)) {
520 		DPRINTF("making %s into an array\n", NN(x->nval));
521 		if (freeable(x))
522 			xfree(x->sval);
523 		x->tval &= ~(STR|NUM|DONTFREE);
524 		x->tval |= ARR;
525 		x->sval = (char *) makesymtab(NSYMTAB);
526 	}
527 	z = setsymtab(buf, "", 0.0, STR|NUM, (Array *) x->sval);
528 	z->ctype = OCELL;
529 	z->csub = CVAR;
530 	tempfree(x);
531 	free(buf);
532 	return(z);
533 }
534 
awkdelete(Node ** a,int n)535 Cell *awkdelete(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
536 {
537 	Cell *x;
538 
539 	x = execute(a[0]);	/* Cell* for symbol table */
540 	if (x == symtabloc) {
541 		FATAL("cannot delete SYMTAB or its elements");
542 	}
543 	if (!isarr(x))
544 		return True;
545 	if (a[1] == NULL) {	/* delete the elements, not the table */
546 		freesymtab(x);
547 		x->tval &= ~STR;
548 		x->tval |= ARR;
549 		x->sval = (char *) makesymtab(NSYMTAB);
550 	} else {
551 		char *buf = makearraystring(a[1], __func__);
552 		freeelem(x, buf);
553 		free(buf);
554 	}
555 	tempfree(x);
556 	return True;
557 }
558 
intest(Node ** a,int n)559 Cell *intest(Node **a, int n)	/* a[0] is index (list), a[1] is symtab */
560 {
561 	Cell *ap, *k;
562 	char *buf;
563 
564 	ap = execute(a[1]);	/* array name */
565 	if (!isarr(ap)) {
566 		DPRINTF("making %s into an array\n", ap->nval);
567 		if (freeable(ap))
568 			xfree(ap->sval);
569 		ap->tval &= ~(STR|NUM|DONTFREE);
570 		ap->tval |= ARR;
571 		ap->sval = (char *) makesymtab(NSYMTAB);
572 	}
573 	buf = makearraystring(a[0], __func__);
574 	k = lookup(buf, (Array *) ap->sval);
575 	tempfree(ap);
576 	free(buf);
577 	if (k == NULL)
578 		return(False);
579 	else
580 		return(True);
581 }
582 
583 
584 /* ======== utf-8 code ========== */
585 
586 /*
587  * Awk strings can contain ascii, random 8-bit items (eg Latin-1),
588  * or utf-8.  u8_isutf tests whether a string starts with a valid
589  * utf-8 sequence, and returns 0 if not (e.g., high bit set).
590  * u8_nextlen returns length of next valid sequence, which is
591  * 1 for ascii, 2..4 for utf-8, or 1 for high bit non-utf.
592  * u8_strlen returns length of string in valid utf-8 sequences
593  * and/or high-bit bytes.  Conversion functions go between byte
594  * number and character number.
595  *
596  * In theory, this behaves the same as before for non-utf8 bytes.
597  *
598  * Limited checking! This is a potential security hole.
599  */
600 
601 /* is s the beginning of a valid utf-8 string? */
602 /* return length 1..4 if yes, 0 if no */
u8_isutf(const char * s)603 static int u8_isutf(const char *s)
604 {
605 	int ret;
606 	unsigned char c;
607 
608 	c = s[0];
609 	if (c < 128 || awk_mb_cur_max == 1) {
610 		ret = 1; /* what if it's 0? */
611 	} else if (((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
612 		ret = 2; /* 110xxxxx 10xxxxxx */
613 	} else if (((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
614 			 && (s[2] & 0xC0) == 0x80) {
615 		ret = 3; /* 1110xxxx 10xxxxxx 10xxxxxx */
616 	} else if (((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
617 			 && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
618 		ret = 4; /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
619 	} else {
620 		ret = 0;
621 	}
622 	return ret;
623 }
624 
625 /* Convert (prefix of) utf8 string to utf-32 rune. */
626 /* Sets *rune to the value, returns the length. */
627 /* No error checking: watch out. */
u8_rune(int * rune,const char * s)628 int u8_rune(int *rune, const char *s)
629 {
630 	int n, ret;
631 	unsigned char c;
632 
633 	c = s[0];
634 	if (c < 128 || awk_mb_cur_max == 1) {
635 		*rune = c;
636 		return 1;
637 	}
638 
639 	n = strlen(s);
640 	if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
641 		*rune = ((c & 0x1F) << 6) | (s[1] & 0x3F); /* 110xxxxx 10xxxxxx */
642 		ret = 2;
643 	} else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
644 			  && (s[2] & 0xC0) == 0x80) {
645 		*rune = ((c & 0xF) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
646 			/* 1110xxxx 10xxxxxx 10xxxxxx */
647 		ret = 3;
648 	} else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
649 			  && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
650 		*rune = ((c & 0x7) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
651 			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
652 		ret = 4;
653 	} else {
654 		*rune = c;
655 		ret = 1;
656 	}
657 	return ret; /* returns one byte if sequence doesn't look like utf */
658 }
659 
/* byte length of the next sequence in s: 2..4 for valid utf8, otherwise 1 */
int u8_nextlen(const char *s)
{
	int len = u8_isutf(s);

	/* u8_isutf() reports 0 for a non-utf byte; such a byte counts as length 1 */
	return (len != 0) ? len : 1;
}
670 
671 /* return number of utf characters or single non-utf bytes */
u8_strlen(const char * s)672 static int u8_strlen(const char *s)
673 {
674 	int i, len, n, totlen;
675 	unsigned char c;
676 
677 	n = strlen(s);
678 	totlen = 0;
679 	for (i = 0; i < n; i += len) {
680 		c = s[i];
681 		if (c < 128 || awk_mb_cur_max == 1) {
682 			len = 1;
683 		} else {
684 			len = u8_nextlen(&s[i]);
685 		}
686 		totlen++;
687 		if (i > n)
688 			FATAL("bad utf count [%s] n=%d i=%d\n", s, n, i);
689 	}
690 	return totlen;
691 }
692 
/* byte offset in s at which character number charnum (counting from 0) starts */
static int u8_char2byte(const char *s, int charnum)
{
	int offset = 0;

	/* step over charnum characters, adding up their byte lengths */
	for (; charnum > 0; charnum--) {
		int step = u8_nextlen(s);

		s += step;
		offset += step;
	}
	return offset;
}
707 
/* character number (origin 1) of the utf-8 character starting at byte offset bytenum of s */
/* returns -1 when bytenum lies beyond the end of s */
static int u8_byte2char(const char *s, int bytenum)
{
	int pos;
	int nchars;	/* BUG: what origin? */
	/* should be 0 to match start==0 which means no match */

	if (bytenum > (int) strlen(s))
		return -1; /* ??? */
	nchars = 0;
	/* count every character that starts at or before bytenum */
	for (pos = 0; pos <= bytenum; pos += u8_nextlen(s + pos))
		nchars++;
	return nchars;
}
725 
726 /* runetochar() adapted from rune.c in the Plan 9 distribution */
727 
/* bit-layout constants for the utf-8 encoder below */
enum
{
	Runeerror = 128, /* from somewhere else */
	Runemax = 0x10FFFF,

	/* number of payload bits in a 1-byte char, a continuation byte, and 2..5-byte lead bytes */
	Bit1    = 7,
	Bitx    = 6,
	Bit2    = 5,
	Bit3    = 4,
	Bit4    = 3,
	Bit5    = 2,

	/* tag bits that mark each kind of byte */
	T1      = ((1<<(Bit1+1))-1) ^ 0xFF,     /* 0000 0000 */
	Tx      = ((1<<(Bitx+1))-1) ^ 0xFF,     /* 1000 0000 */
	T2      = ((1<<(Bit2+1))-1) ^ 0xFF,     /* 1100 0000 */
	T3      = ((1<<(Bit3+1))-1) ^ 0xFF,     /* 1110 0000 */
	T4      = ((1<<(Bit4+1))-1) ^ 0xFF,     /* 1111 0000 */
	T5      = ((1<<(Bit5+1))-1) ^ 0xFF,     /* 1111 1000 */

	/* largest rune that fits in 1, 2, 3 and 4 bytes */
	Rune1   = (1<<(Bit1+0*Bitx))-1,	 	/* 0000 0000 0000 0000 0111 1111 */
	Rune2   = (1<<(Bit2+1*Bitx))-1,	 	/* 0000 0000 0000 0111 1111 1111 */
	Rune3   = (1<<(Bit3+2*Bitx))-1,	 	/* 0000 0000 1111 1111 1111 1111 */
	Rune4   = (1<<(Bit4+3*Bitx))-1,	 	/* 0011 1111 1111 1111 1111 1111 */

	Maskx   = (1<<Bitx)-1,		  	/* 0011 1111 */
	Testx   = Maskx ^ 0xFF,		 	/* 1100 0000 */

};

/*
 * Encode rune c as utf-8 into str (room for 4 bytes is needed; no NUL is
 * written) and return the number of bytes stored.  A rune above Runemax
 * is replaced by Runeerror before it is encoded.
 */
int runetochar(char *str, int c)
{
	int nbytes, lead, i;

	/* one character sequence 00000-0007F => 00-7F */
	if (c <= Rune1) {
		str[0] = c;
		return 1;
	}

	if (c <= Rune2) {
		/* two character sequence 00080-007FF => T2 Tx */
		nbytes = 2;
		lead = T2;
	} else {
		/* the range check sits here because the replacement takes the 3-byte path */
		if (c > Runemax)
			c = Runeerror;
		if (c <= Rune3) {
			/* three character sequence 00800-0FFFF => T3 Tx Tx */
			nbytes = 3;
			lead = T3;
		} else {
			/* four character sequence 010000-1FFFFF => T4 Tx Tx Tx */
			nbytes = 4;
			lead = T4;
		}
	}

	/* fill the continuation bytes from the last one backwards, 6 bits at a time */
	for (i = nbytes - 1; i > 0; i--) {
		str[i] = Tx | (c & Maskx);
		c >>= Bitx;
	}
	str[0] = lead | c;	/* what is left of c fits in the lead byte */
	return nbytes;
}
789 
790 
791 /* ========== end of utf8 code =========== */
792 
793 
794 
matchop(Node ** a,int n)795 Cell *matchop(Node **a, int n)	/* ~ and match() */
796 {
797 	Cell *x, *y, *z;
798 	char *s, *t;
799 	int i;
800 	int cstart, cpatlen, len;
801 	fa *pfa;
802 	int (*mf)(fa *, const char *) = match, mode = 0;
803 
804 	if (n == MATCHFCN) {
805 		mf = pmatch;
806 		mode = 1;
807 	}
808 	x = execute(a[1]);	/* a[1] = target text */
809 	s = getsval(x);
810 	if (a[0] == NULL)	/* a[1] == 0: already-compiled reg expr */
811 		i = (*mf)((fa *) a[2], s);
812 	else {
813 		y = execute(a[2]);	/* a[2] = regular expr */
814 		t = getsval(y);
815 		pfa = makedfa(t, mode);
816 		i = (*mf)(pfa, s);
817 		tempfree(y);
818 	}
819 	z = x;
820 	if (n == MATCHFCN) {
821 		int start = patbeg - s + 1; /* origin 1 */
822 		if (patlen < 0) {
823 			start = 0; /* not found */
824 		} else {
825 			cstart = u8_byte2char(s, start-1);
826 			cpatlen = 0;
827 			for (i = 0; i < patlen; i += len) {
828 				len = u8_nextlen(patbeg+i);
829 				cpatlen++;
830 			}
831 
832 			start = cstart;
833 			patlen = cpatlen;
834 		}
835 
836 		setfval(rstartloc, (Awkfloat) start);
837 		setfval(rlengthloc, (Awkfloat) patlen);
838 		x = gettemp();
839 		x->tval = NUM;
840 		x->fval = start;
841 	} else if ((n == MATCH && i == 1) || (n == NOTMATCH && i == 0))
842 		x = True;
843 	else
844 		x = False;
845 
846 	tempfree(z);
847 	return x;
848 }
849 
850 
boolop(Node ** a,int n)851 Cell *boolop(Node **a, int n)	/* a[0] || a[1], a[0] && a[1], !a[0] */
852 {
853 	Cell *x, *y;
854 	int i;
855 
856 	x = execute(a[0]);
857 	i = istrue(x);
858 	tempfree(x);
859 	switch (n) {
860 	case BOR:
861 		if (i) return(True);
862 		y = execute(a[1]);
863 		i = istrue(y);
864 		tempfree(y);
865 		if (i) return(True);
866 		else return(False);
867 	case AND:
868 		if ( !i ) return(False);
869 		y = execute(a[1]);
870 		i = istrue(y);
871 		tempfree(y);
872 		if (i) return(True);
873 		else return(False);
874 	case NOT:
875 		if (i) return(False);
876 		else return(True);
877 	default:	/* can't happen */
878 		FATAL("unknown boolean operator %d", n);
879 	}
880 	return 0;	/*NOTREACHED*/
881 }
882 
relop(Node ** a,int n)883 Cell *relop(Node **a, int n)	/* a[0 < a[1], etc. */
884 {
885 	int i;
886 	Cell *x, *y;
887 	Awkfloat j;
888 	bool x_is_nan, y_is_nan;
889 
890 	x = execute(a[0]);
891 	y = execute(a[1]);
892 	x_is_nan = isnan(x->fval);
893 	y_is_nan = isnan(y->fval);
894 	if (x->tval&NUM && y->tval&NUM) {
895 		if ((x_is_nan || y_is_nan) && n != NE)
896 			return(False);
897 		j = x->fval - y->fval;
898 		i = j<0? -1: (j>0? 1: 0);
899 	} else {
900 		i = strcmp(getsval(x), getsval(y));
901 	}
902 	tempfree(x);
903 	tempfree(y);
904 	switch (n) {
905 	case LT:	if (i<0) return(True);
906 			else return(False);
907 	case LE:	if (i<=0) return(True);
908 			else return(False);
909 	case NE:	if (x_is_nan && y_is_nan) return(True);
910 			else if (i!=0) return(True);
911 			else return(False);
912 	case EQ:	if (i == 0) return(True);
913 			else return(False);
914 	case GE:	if (i>=0) return(True);
915 			else return(False);
916 	case GT:	if (i>0) return(True);
917 			else return(False);
918 	default:	/* can't happen */
919 		FATAL("unknown relational operator %d", n);
920 	}
921 	return 0;	/*NOTREACHED*/
922 }
923 
void tfree(Cell *a)	/* put a tempcell back on the free list */
{
	/* drop the string value first when the cell owns one */
	if (freeable(a)) {
		DPRINTF("freeing %s %s %o\n", NN(a->nval), NN(a->sval), a->tval);
		xfree(a->sval);
	}
	/* a cell that is already the list head would make the list point at itself */
	if (tmps == a)
		FATAL("tempcell list is curdled");
	/* push onto the head of the list */
	a->cnext = tmps;
	tmps = a;
}
935 
gettemp(void)936 Cell *gettemp(void)	/* get a tempcell */
937 {	int i;
938 	Cell *x;
939 
940 	if (!tmps) {
941 		tmps = (Cell *) calloc(100, sizeof(*tmps));
942 		if (!tmps)
943 			FATAL("out of space for temporaries");
944 		for (i = 1; i < 100; i++)
945 			tmps[i-1].cnext = &tmps[i];
946 		tmps[i-1].cnext = NULL;
947 	}
948 	x = tmps;
949 	tmps = x->cnext;
950 	*x = tempcell;
951 	return(x);
952 }
953 
indirect(Node ** a,int n)954 Cell *indirect(Node **a, int n)	/* $( a[0] ) */
955 {
956 	Awkfloat val;
957 	Cell *x;
958 	int m;
959 	char *s;
960 
961 	x = execute(a[0]);
962 	val = getfval(x);	/* freebsd: defend against super large field numbers */
963 	if ((Awkfloat)INT_MAX < val)
964 		FATAL("trying to access out of range field %s", x->nval);
965 	m = (int) val;
966 	if (m == 0 && !is_number(s = getsval(x), NULL))	/* suspicion! */
967 		FATAL("illegal field $(%s), name \"%s\"", s, x->nval);
968 		/* BUG: can x->nval ever be null??? */
969 	tempfree(x);
970 	x = fieldadr(m);
971 	x->ctype = OCELL;	/* BUG?  why are these needed? */
972 	x->csub = CFLD;
973 	return(x);
974 }
975 
substr(Node ** a,int nnn)976 Cell *substr(Node **a, int nnn)		/* substr(a[0], a[1], a[2]) */
977 {
978 	int k, m, n;
979 	int mb, nb;
980 	char *s;
981 	int temp;
982 	Cell *x, *y, *z = NULL;
983 
984 	x = execute(a[0]);
985 	y = execute(a[1]);
986 	if (a[2] != NULL)
987 		z = execute(a[2]);
988 	s = getsval(x);
989 	k = u8_strlen(s) + 1;
990 	if (k <= 1) {
991 		tempfree(x);
992 		tempfree(y);
993 		if (a[2] != NULL) {
994 			tempfree(z);
995 		}
996 		x = gettemp();
997 		setsval(x, "");
998 		return(x);
999 	}
1000 	m = (int) getfval(y);
1001 	if (m <= 0)
1002 		m = 1;
1003 	else if (m > k)
1004 		m = k;
1005 	tempfree(y);
1006 	if (a[2] != NULL) {
1007 		n = (int) getfval(z);
1008 		tempfree(z);
1009 	} else
1010 		n = k - 1;
1011 	if (n < 0)
1012 		n = 0;
1013 	else if (n > k - m)
1014 		n = k - m;
1015 	/* m is start, n is length from there */
1016 	DPRINTF("substr: m=%d, n=%d, s=%s\n", m, n, s);
1017 	y = gettemp();
1018 	mb = u8_char2byte(s, m-1); /* byte offset of start char in s */
1019 	nb = mb + u8_char2byte(&s[mb], n);  /* byte offset of end+1 char in s */
1020 
1021 	temp = s[nb];	/* with thanks to John Linderman */
1022 	s[nb] = '\0';
1023 	setsval(y, s + mb);
1024 	s[nb] = temp;
1025 	tempfree(x);
1026 	return(y);
1027 }
1028 
sindex(Node ** a,int nnn)1029 Cell *sindex(Node **a, int nnn)		/* index(a[0], a[1]) */
1030 {
1031 	Cell *x, *y, *z;
1032 	char *s1, *s2, *p1, *p2, *q;
1033 	Awkfloat v = 0.0;
1034 
1035 	x = execute(a[0]);
1036 	s1 = getsval(x);
1037 	y = execute(a[1]);
1038 	s2 = getsval(y);
1039 
1040 	z = gettemp();
1041 	for (p1 = s1; *p1 != '\0'; p1++) {
1042 		for (q = p1, p2 = s2; *p2 != '\0' && *q == *p2; q++, p2++)
1043 			continue;
1044 		if (*p2 == '\0') {
1045 			/* v = (Awkfloat) (p1 - s1 + 1);	 origin 1 */
1046 
1047 		   /* should be a function: used in match() as well */
1048 			int i, len;
1049 			v = 0;
1050 			for (i = 0; i < p1-s1+1; i += len) {
1051 				len = u8_nextlen(s1+i);
1052 				v++;
1053 			}
1054 			break;
1055 		}
1056 	}
1057 	tempfree(x);
1058 	tempfree(y);
1059 	setfval(z, v);
1060 	return(z);
1061 }
1062 
static int has_utf8(char *s)	/* 1 if s holds at least one multi-byte (2 bytes or more) utf-8 character, else 0 */
{
	while (*s != 0) {
		int step = u8_nextlen(s);

		if (step > 1)
			return 1;	/* found a multi-byte sequence */
		s += step;
	}
	return 0;
}
1074 
1075 #define	MAXNUMSIZE	50
1076 
format(char ** pbuf,int * pbufsize,const char * s,Node * a)1077 int format(char **pbuf, int *pbufsize, const char *s, Node *a)	/* printf-like conversions */
1078 {
1079 	char *fmt;
1080 	char *p, *t;
1081 	const char *os;
1082 	Cell *x;
1083 	int flag = 0, n;
1084 	int fmtwd; /* format width */
1085 	int fmtsz = recsize;
1086 	char *buf = *pbuf;
1087 	int bufsize = *pbufsize;
1088 #define FMTSZ(a)   (fmtsz - ((a) - fmt))
1089 #define BUFSZ(a)   (bufsize - ((a) - buf))
1090 
1091 	static bool first = true;
1092 	static bool have_a_format = false;
1093 
1094 	if (first) {
1095 		char xbuf[100];
1096 
1097 		snprintf(xbuf, sizeof(xbuf), "%a", 42.0);
1098 		have_a_format = (strcmp(xbuf, "0x1.5p+5") == 0);
1099 		first = false;
1100 	}
1101 
1102 	os = s;
1103 	p = buf;
1104 	if ((fmt = (char *) malloc(fmtsz)) == NULL)
1105 		FATAL("out of memory in format()");
1106 	while (*s) {
1107 		adjbuf(&buf, &bufsize, MAXNUMSIZE+1+p-buf, recsize, &p, "format1");
1108 		if (*s != '%') {
1109 			*p++ = *s++;
1110 			continue;
1111 		}
1112 		if (*(s+1) == '%') {
1113 			*p++ = '%';
1114 			s += 2;
1115 			continue;
1116 		}
1117 		fmtwd = atoi(s+1);
1118 		if (fmtwd < 0)
1119 			fmtwd = -fmtwd;
1120 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format2");
1121 		for (t = fmt; (*t++ = *s) != '\0'; s++) {
1122 			if (!adjbuf(&fmt, &fmtsz, MAXNUMSIZE+1+t-fmt, recsize, &t, "format3"))
1123 				FATAL("format item %.30s... ran format() out of memory", os);
1124 			/* Ignore size specifiers */
1125 			if (strchr("hjLlqtz", *s) != NULL) {	/* the ansi panoply */
1126 				t--;
1127 				continue;
1128 			}
1129 			if (isalpha((uschar)*s))
1130 				break;
1131 			if (*s == '$') {
1132 				FATAL("'$' not permitted in awk formats");
1133 			}
1134 			if (*s == '*') {
1135 				if (a == NULL) {
1136 					FATAL("not enough args in printf(%s)", os);
1137 				}
1138 				x = execute(a);
1139 				a = a->nnext;
1140 				snprintf(t - 1, FMTSZ(t - 1),
1141 				    "%d", fmtwd=(int) getfval(x));
1142 				if (fmtwd < 0)
1143 					fmtwd = -fmtwd;
1144 				adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format");
1145 				t = fmt + strlen(fmt);
1146 				tempfree(x);
1147 			}
1148 		}
1149 		*t = '\0';
1150 		if (fmtwd < 0)
1151 			fmtwd = -fmtwd;
1152 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format4");
1153 		switch (*s) {
1154 		case 'a': case 'A':
1155 			if (have_a_format)
1156 				flag = *s;
1157 			else
1158 				flag = 'f';
1159 			break;
1160 		case 'f': case 'e': case 'g': case 'E': case 'G':
1161 			flag = 'f';
1162 			break;
1163 		case 'd': case 'i': case 'o': case 'x': case 'X': case 'u':
1164 			flag = (*s == 'd' || *s == 'i') ? 'd' : 'u';
1165 			*(t-1) = 'j';
1166 			*t = *s;
1167 			*++t = '\0';
1168 			break;
1169 		case 's':
1170 			flag = 's';
1171 			break;
1172 		case 'c':
1173 			flag = 'c';
1174 			break;
1175 		default:
1176 			WARNING("weird printf conversion %s", fmt);
1177 			flag = '?';
1178 			break;
1179 		}
1180 		if (a == NULL)
1181 			FATAL("not enough args in printf(%s)", os);
1182 		x = execute(a);
1183 		a = a->nnext;
1184 		n = MAXNUMSIZE;
1185 		if (fmtwd > n)
1186 			n = fmtwd;
1187 		adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format5");
1188 		switch (flag) {
1189 		case '?':
1190 			snprintf(p, BUFSZ(p), "%s", fmt);	/* unknown, so dump it too */
1191 			t = getsval(x);
1192 			n = strlen(t);
1193 			if (fmtwd > n)
1194 				n = fmtwd;
1195 			adjbuf(&buf, &bufsize, 1+strlen(p)+n+p-buf, recsize, &p, "format6");
1196 			p += strlen(p);
1197 			snprintf(p, BUFSZ(p), "%s", t);
1198 			break;
1199 		case 'a':
1200 		case 'A':
1201 		case 'f':	snprintf(p, BUFSZ(p), fmt, getfval(x)); break;
1202 		case 'd':	snprintf(p, BUFSZ(p), fmt, (intmax_t) getfval(x)); break;
1203 		case 'u':	snprintf(p, BUFSZ(p), fmt, (uintmax_t) getfval(x)); break;
1204 
1205 		case 's': {
1206 			t = getsval(x);
1207 			n = strlen(t);
1208 			/* if simple format or no utf-8 in the string, sprintf works */
1209 			if (!has_utf8(t) || strcmp(fmt,"%s") == 0) {
1210 				if (fmtwd > n)
1211 					n = fmtwd;
1212 				if (!adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format7"))
1213 					FATAL("huge string/format (%d chars) in printf %.30s..." \
1214 						" ran format() out of memory", n, t);
1215 				snprintf(p, BUFSZ(p), fmt, t);
1216 				break;
1217 			}
1218 
1219 			/* get here if string has utf-8 chars and fmt is not plain %s */
1220 			/* "%-w.ps", where -, w and .p are all optional */
1221 			/* '0' before the w is a flag character */
1222 			/* fmt points at % */
1223 			int ljust = 0, wid = 0, prec = n, pad = 0;
1224 			char *f = fmt+1;
1225 			if (f[0] == '-') {
1226 				ljust = 1;
1227 				f++;
1228 			}
1229 			// flags '0' and '+' are recognized but skipped
1230 			if (f[0] == '0') {
1231 				f++;
1232 				if (f[0] == '+')
1233 					f++;
1234 			}
1235 			if (f[0] == '+') {
1236 				f++;
1237 				if (f[0] == '0')
1238 					f++;
1239 			}
1240 			if (isdigit((uschar)f[0])) { /* there is a wid */
1241 				wid = strtol(f, &f, 10);
1242 			}
1243 			if (f[0] == '.') { /* there is a .prec */
1244 				prec = strtol(++f, &f, 10);
1245 			}
1246 			if (prec > u8_strlen(t))
1247 				prec = u8_strlen(t);
1248 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1249 			int i, precb;
1250 
1251 			if (ljust) { // print prec chars from t, then pad blanks
1252 				precb = u8_char2byte(t, prec);
1253 				for (i = 0; i < precb; i++) {
1254 					//putchar(t[i]);
1255 					*p++ = t[i];
1256 				}
1257 				for (i = 0; i < pad; i++) {
1258 					//printf(" ");
1259 					*p++ = ' ';
1260 				}
1261 			} else { // print pad blanks, then prec chars from t
1262 				for (i = 0; i < pad; i++) {
1263 					//printf(" ");
1264 					*p++ = ' ';
1265 				}
1266 				precb = u8_char2byte(t, prec);
1267 				for (i = 0; i < precb; i++) {
1268 					//putchar(t[i]);
1269 					*p++ = t[i];
1270 				}
1271 			}
1272 			*p = 0;
1273 			break;
1274 		}
1275 
1276                case 'c': {
1277 			/*
1278 			 * If a numeric value is given, awk should just turn
1279 			 * it into a character and print it:
1280 			 *      BEGIN { printf("%c\n", 65) }
1281 			 * prints "A".
1282 			 *
1283 			 * But what if the numeric value is > 128 and
1284 			 * represents a valid Unicode code point?!? We do
1285 			 * our best to convert it back into UTF-8. If we
1286 			 * can't, we output the encoding of the Unicode
1287 			 * "invalid character", 0xFFFD.
1288 			 */
1289 			if (isnum(x)) {
1290 				int charval = (int) getfval(x);
1291 
1292 				if (charval != 0) {
1293 					if (charval < 128 || awk_mb_cur_max == 1)
1294 						snprintf(p, BUFSZ(p), fmt, charval);
1295 					else {
1296 						// possible unicode character
1297 						size_t count;
1298 						char *bs = wide_char_to_byte_str(charval, &count);
1299 
1300 						if (bs == NULL)	{ // invalid character
1301 							// use unicode invalid character, 0xFFFD
1302 							static char invalid_char[] = "\357\277\275";
1303 							bs = invalid_char;
1304 							count = 3;
1305 						}
1306 						t = bs;
1307 						n = count;
1308 						goto format_percent_c;
1309 					}
1310 				} else {
1311 					*p++ = '\0'; /* explicit null byte */
1312 					*p = '\0';   /* next output will start here */
1313 				}
1314 				break;
1315 			}
1316 			t = getsval(x);
1317 			n = u8_nextlen(t);
1318 		format_percent_c:
1319 			if (n < 2) { /* not utf8 */
1320 				snprintf(p, BUFSZ(p), fmt, getsval(x)[0]);
1321 				break;
1322 			}
1323 
1324 			// utf8 character, almost same song and dance as for %s
1325 			int ljust = 0, wid = 0, prec = n, pad = 0;
1326 			char *f = fmt+1;
1327 			if (f[0] == '-') {
1328 				ljust = 1;
1329 				f++;
1330 			}
1331 			// flags '0' and '+' are recognized but skipped
1332 			if (f[0] == '0') {
1333 				f++;
1334 				if (f[0] == '+')
1335 					f++;
1336 			}
1337 			if (f[0] == '+') {
1338 				f++;
1339 				if (f[0] == '0')
1340 					f++;
1341 			}
1342 			if (isdigit((uschar)f[0])) { /* there is a wid */
1343 				wid = strtol(f, &f, 10);
1344 			}
1345 			if (f[0] == '.') { /* there is a .prec */
1346 				prec = strtol(++f, &f, 10);
1347 			}
1348 			if (prec > 1)           // %c --> only one character
1349 				prec = 1;
1350 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1351 			int i;
1352 
1353 			if (ljust) { // print one char from t, then pad blanks
1354 				for (i = 0; i < n; i++)
1355 					*p++ = t[i];
1356 				for (i = 0; i < pad; i++) {
1357 					//printf(" ");
1358 					*p++ = ' ';
1359 				}
1360 			} else { // print pad blanks, then prec chars from t
1361 				for (i = 0; i < pad; i++) {
1362 					//printf(" ");
1363 					*p++ = ' ';
1364 				}
1365 				for (i = 0; i < n; i++)
1366 					*p++ = t[i];
1367 			}
1368 			*p = 0;
1369 			break;
1370 		}
1371 		default:
1372 			FATAL("can't happen: bad conversion %c in format()", flag);
1373 		}
1374 
1375 		tempfree(x);
1376 		p += strlen(p);
1377 		s++;
1378 	}
1379 	*p = '\0';
1380 	free(fmt);
1381 	for ( ; a; a = a->nnext) {		/* evaluate any remaining args */
1382 		x = execute(a);
1383 		tempfree(x);
1384 	}
1385 	*pbuf = buf;
1386 	*pbufsize = bufsize;
1387 	return p - buf;
1388 }
1389 
awksprintf(Node ** a,int n)1390 Cell *awksprintf(Node **a, int n)		/* sprintf(a[0]) */
1391 {
1392 	Cell *x;
1393 	Node *y;
1394 	char *buf;
1395 	int bufsz=3*recsize;
1396 
1397 	if ((buf = (char *) malloc(bufsz)) == NULL)
1398 		FATAL("out of memory in awksprintf");
1399 	y = a[0]->nnext;
1400 	x = execute(a[0]);
1401 	if (format(&buf, &bufsz, getsval(x), y) == -1)
1402 		FATAL("sprintf string %.30s... too long.  can't happen.", buf);
1403 	tempfree(x);
1404 	x = gettemp();
1405 	x->sval = buf;
1406 	x->tval = STR;
1407 	return(x);
1408 }
1409 
awkprintf(Node ** a,int n)1410 Cell *awkprintf(Node **a, int n)		/* printf */
1411 {	/* a[0] is list of args, starting with format string */
1412 	/* a[1] is redirection operator, a[2] is redirection file */
1413 	FILE *fp;
1414 	Cell *x;
1415 	Node *y;
1416 	char *buf;
1417 	int len;
1418 	int bufsz=3*recsize;
1419 
1420 	if ((buf = (char *) malloc(bufsz)) == NULL)
1421 		FATAL("out of memory in awkprintf");
1422 	y = a[0]->nnext;
1423 	x = execute(a[0]);
1424 	if ((len = format(&buf, &bufsz, getsval(x), y)) == -1)
1425 		FATAL("printf string %.30s... too long.  can't happen.", buf);
1426 	tempfree(x);
1427 	if (a[1] == NULL) {
1428 		/* fputs(buf, stdout); */
1429 		fwrite(buf, len, 1, stdout);
1430 		if (ferror(stdout))
1431 			FATAL("write error on stdout");
1432 	} else {
1433 		fp = redirect(ptoi(a[1]), a[2]);
1434 		/* fputs(buf, fp); */
1435 		fwrite(buf, len, 1, fp);
1436 		fflush(fp);
1437 		if (ferror(fp))
1438 			FATAL("write error on %s", filename(fp));
1439 	}
1440 	free(buf);
1441 	return(True);
1442 }
1443 
arith(Node ** a,int n)1444 Cell *arith(Node **a, int n)	/* a[0] + a[1], etc.  also -a[0] */
1445 {
1446 	Awkfloat i, j = 0;
1447 	double v;
1448 	Cell *x, *y, *z;
1449 
1450 	x = execute(a[0]);
1451 	i = getfval(x);
1452 	tempfree(x);
1453 	if (n != UMINUS && n != UPLUS) {
1454 		y = execute(a[1]);
1455 		j = getfval(y);
1456 		tempfree(y);
1457 	}
1458 	z = gettemp();
1459 	switch (n) {
1460 	case ADD:
1461 		i += j;
1462 		break;
1463 	case MINUS:
1464 		i -= j;
1465 		break;
1466 	case MULT:
1467 		i *= j;
1468 		break;
1469 	case DIVIDE:
1470 		if (j == 0)
1471 			FATAL("division by zero");
1472 		i /= j;
1473 		break;
1474 	case MOD:
1475 		if (j == 0)
1476 			FATAL("division by zero in mod");
1477 		modf(i/j, &v);
1478 		i = i - j * v;
1479 		break;
1480 	case UMINUS:
1481 		i = -i;
1482 		break;
1483 	case UPLUS: /* handled by getfval(), above */
1484 		break;
1485 	case POWER:
1486 		if (j >= 0 && modf(j, &v) == 0.0)	/* pos integer exponent */
1487 			i = ipow(i, (int) j);
1488                else {
1489 			errno = 0;
1490 			i = errcheck(pow(i, j), "pow");
1491                }
1492 		break;
1493 	default:	/* can't happen */
1494 		FATAL("illegal arithmetic operator %d", n);
1495 	}
1496 	setfval(z, i);
1497 	return(z);
1498 }
1499 
/*
 * ipow: x**n for integer n by repeated squaring.
 * Returns 1 for any n <= 0.
 *
 * The bits of n are consumed from the most significant one down, which
 * performs the same multiplications, in the same order, as the obvious
 * recursive formulation (v = x**(n/2); result = v*v or x*v*v).
 */
double ipow(double x, int n)
{
	double acc = 1;
	int bit;

	if (n <= 0)
		return 1;

	/* find the highest power of two that does not exceed n */
	for (bit = 1; bit <= n / 2; bit *= 2)
		continue;

	for ( ; bit > 0; bit /= 2) {
		if (n & bit)
			acc = x * acc * acc;
		else
			acc = acc * acc;
	}
	return acc;
}
1512 
incrdecr(Node ** a,int n)1513 Cell *incrdecr(Node **a, int n)		/* a[0]++, etc. */
1514 {
1515 	Cell *x, *z;
1516 	int k;
1517 	Awkfloat xf;
1518 
1519 	x = execute(a[0]);
1520 	xf = getfval(x);
1521 	k = (n == PREINCR || n == POSTINCR) ? 1 : -1;
1522 	if (n == PREINCR || n == PREDECR) {
1523 		setfval(x, xf + k);
1524 		return(x);
1525 	}
1526 	z = gettemp();
1527 	setfval(z, xf);
1528 	setfval(x, xf + k);
1529 	tempfree(x);
1530 	return(z);
1531 }
1532 
assign(Node ** a,int n)1533 Cell *assign(Node **a, int n)	/* a[0] = a[1], a[0] += a[1], etc. */
1534 {		/* this is subtle; don't muck with it. */
1535 	Cell *x, *y;
1536 	Awkfloat xf, yf;
1537 	double v;
1538 
1539 	y = execute(a[1]);
1540 	x = execute(a[0]);
1541 	if (n == ASSIGN) {	/* ordinary assignment */
1542 		if (x == y && !(x->tval & (FLD|REC)) && x != nfloc)
1543 			;	/* self-assignment: leave alone unless it's a field or NF */
1544 		else if ((y->tval & (STR|NUM)) == (STR|NUM)) {
1545 			yf = getfval(y);
1546 			setsval(x, getsval(y));
1547 			x->fval = yf;
1548 			x->tval |= NUM;
1549 		}
1550 		else if (isstr(y))
1551 			setsval(x, getsval(y));
1552 		else if (isnum(y))
1553 			setfval(x, getfval(y));
1554 		else
1555 			funnyvar(y, "read value of");
1556 		tempfree(y);
1557 		return(x);
1558 	}
1559 	xf = getfval(x);
1560 	yf = getfval(y);
1561 	switch (n) {
1562 	case ADDEQ:
1563 		xf += yf;
1564 		break;
1565 	case SUBEQ:
1566 		xf -= yf;
1567 		break;
1568 	case MULTEQ:
1569 		xf *= yf;
1570 		break;
1571 	case DIVEQ:
1572 		if (yf == 0)
1573 			FATAL("division by zero in /=");
1574 		xf /= yf;
1575 		break;
1576 	case MODEQ:
1577 		if (yf == 0)
1578 			FATAL("division by zero in %%=");
1579 		modf(xf/yf, &v);
1580 		xf = xf - yf * v;
1581 		break;
1582 	case POWEQ:
1583 		if (yf >= 0 && modf(yf, &v) == 0.0)	/* pos integer exponent */
1584 			xf = ipow(xf, (int) yf);
1585                else {
1586 			errno = 0;
1587 			xf = errcheck(pow(xf, yf), "pow");
1588                }
1589 		break;
1590 	default:
1591 		FATAL("illegal assignment operator %d", n);
1592 		break;
1593 	}
1594 	tempfree(y);
1595 	setfval(x, xf);
1596 	return(x);
1597 }
1598 
cat(Node ** a,int q)1599 Cell *cat(Node **a, int q)	/* a[0] cat a[1] */
1600 {
1601 	Cell *x, *y, *z;
1602 	int n1, n2;
1603 	char *s = NULL;
1604 	int ssz = 0;
1605 
1606 	x = execute(a[0]);
1607 	n1 = strlen(getsval(x));
1608 	adjbuf(&s, &ssz, n1 + 1, recsize, 0, "cat1");
1609 	memcpy(s, x->sval, n1);
1610 
1611 	tempfree(x);
1612 
1613 	y = execute(a[1]);
1614 	n2 = strlen(getsval(y));
1615 	adjbuf(&s, &ssz, n1 + n2 + 1, recsize, 0, "cat2");
1616 	memcpy(s + n1, y->sval, n2);
1617 	s[n1 + n2] = '\0';
1618 
1619 	tempfree(y);
1620 
1621 	z = gettemp();
1622 	z->sval = s;
1623 	z->tval = STR;
1624 
1625 	return(z);
1626 }
1627 
pastat(Node ** a,int n)1628 Cell *pastat(Node **a, int n)	/* a[0] { a[1] } */
1629 {
1630 	Cell *x;
1631 
1632 	if (a[0] == NULL)
1633 		x = execute(a[1]);
1634 	else {
1635 		x = execute(a[0]);
1636 		if (istrue(x)) {
1637 			tempfree(x);
1638 			x = execute(a[1]);
1639 		}
1640 	}
1641 	return x;
1642 }
1643 
dopa2(Node ** a,int n)1644 Cell *dopa2(Node **a, int n)	/* a[0], a[1] { a[2] } */
1645 {
1646 	Cell *x;
1647 	int pair;
1648 
1649 	pair = ptoi(a[3]);
1650 	if (pairstack[pair] == 0) {
1651 		x = execute(a[0]);
1652 		if (istrue(x))
1653 			pairstack[pair] = 1;
1654 		tempfree(x);
1655 	}
1656 	if (pairstack[pair] == 1) {
1657 		x = execute(a[1]);
1658 		if (istrue(x))
1659 			pairstack[pair] = 0;
1660 		tempfree(x);
1661 		x = execute(a[2]);
1662 		return(x);
1663 	}
1664 	return(False);
1665 }
1666 
split(Node ** a,int nnn)1667 Cell *split(Node **a, int nnn)	/* split(a[0], a[1], a[2]); a[3] is type */
1668 {
1669 	Cell *x = NULL, *y, *ap;
1670 	const char *s, *origs, *t;
1671 	const char *fs = NULL;
1672 	char *origfs = NULL;
1673 	int sep;
1674 	char temp, num[50];
1675 	int j, n, tempstat, arg3type;
1676 	double result;
1677 
1678 	y = execute(a[0]);	/* source string */
1679 	origs = s = strdup(getsval(y));
1680 	if (s == NULL)
1681 		FATAL("out of space in split");
1682 	tempfree(y);
1683 	arg3type = ptoi(a[3]);
1684 	if (a[2] == NULL) {		/* BUG: CSV should override implicit fs but not explicit */
1685 		fs = getsval(fsloc);
1686 	} else if (arg3type == STRING) {	/* split(str,arr,"string") */
1687 		x = execute(a[2]);
1688 		fs = origfs = strdup(getsval(x));
1689 		if (fs == NULL)
1690 			FATAL("out of space in split");
1691 		tempfree(x);
1692 	} else if (arg3type == REGEXPR) {
1693 		fs = "(regexpr)";	/* split(str,arr,/regexpr/) */
1694 	} else {
1695 		FATAL("illegal type of split");
1696 	}
1697 	sep = *fs;
1698 	ap = execute(a[1]);	/* array name */
1699 	/* BUG 7/26/22: this appears not to reset array: see C1/asplit */
1700 	freesymtab(ap);
1701 	DPRINTF("split: s=|%s|, a=%s, sep=|%s|\n", s, NN(ap->nval), fs);
1702 	ap->tval &= ~STR;
1703 	ap->tval |= ARR;
1704 	ap->sval = (char *) makesymtab(NSYMTAB);
1705 
1706 	n = 0;
1707         if (arg3type == REGEXPR && strlen((char*)((fa*)a[2])->restr) == 0) {
1708 		/* split(s, a, //); have to arrange that it looks like empty sep */
1709 		arg3type = 0;
1710 		fs = "";
1711 		sep = 0;
1712 	}
1713 	if (*s != '\0' && (strlen(fs) > 1 || arg3type == REGEXPR)) {	/* reg expr */
1714 		fa *pfa;
1715 		if (arg3type == REGEXPR) {	/* it's ready already */
1716 			pfa = (fa *) a[2];
1717 		} else {
1718 			pfa = makedfa(fs, 1);
1719 		}
1720 		if (nematch(pfa,s)) {
1721 			tempstat = pfa->initstat;
1722 			pfa->initstat = 2;
1723 			do {
1724 				n++;
1725 				snprintf(num, sizeof(num), "%d", n);
1726 				temp = *patbeg;
1727 				setptr(patbeg, '\0');
1728 				if (is_number(s, & result))
1729 					setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1730 				else
1731 					setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1732 				setptr(patbeg, temp);
1733 				s = patbeg + patlen;
1734 				if (*(patbeg+patlen-1) == '\0' || *s == '\0') {
1735 					n++;
1736 					snprintf(num, sizeof(num), "%d", n);
1737 					setsymtab(num, "", 0.0, STR, (Array *) ap->sval);
1738 					pfa->initstat = tempstat;
1739 					goto spdone;
1740 				}
1741 			} while (nematch(pfa,s));
1742 			pfa->initstat = tempstat; 	/* bwk: has to be here to reset */
1743 							/* cf gsub and refldbld */
1744 		}
1745 		n++;
1746 		snprintf(num, sizeof(num), "%d", n);
1747 		if (is_number(s, & result))
1748 			setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1749 		else
1750 			setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1751   spdone:
1752 		pfa = NULL;
1753 
1754 	} else if (a[2] == NULL && CSV) {	/* CSV only if no explicit separator */
1755 		char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */
1756 		for (;;) {
1757 			char *fr = newt;
1758 			n++;
1759 			if (*s == '"' ) { /* start of "..." */
1760 				for (s++ ; *s != '\0'; ) {
1761 					if (*s == '"' && s[1] != '\0' && s[1] == '"') {
1762 						s += 2; /* doubled quote */
1763 						*fr++ = '"';
1764 					} else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) {
1765 						s++; /* skip over closing quote */
1766 						break;
1767 					} else {
1768 						*fr++ = *s++;
1769 					}
1770 				}
1771 				*fr++ = 0;
1772 			} else {	/* unquoted field */
1773 				while (*s != ',' && *s != '\0')
1774 					*fr++ = *s++;
1775 				*fr++ = 0;
1776 			}
1777 			snprintf(num, sizeof(num), "%d", n);
1778 			if (is_number(newt, &result))
1779 				setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval);
1780 			else
1781 				setsymtab(num, newt, 0.0, STR, (Array *) ap->sval);
1782 			if (*s++ == '\0')
1783 				break;
1784 		}
1785 		free(newt);
1786 
1787 	} else if (!CSV && sep == ' ') { /* usual case: split on white space */
1788 		for (n = 0; ; ) {
1789 #define ISWS(c)	((c) == ' ' || (c) == '\t' || (c) == '\n')
1790 			while (ISWS(*s))
1791 				s++;
1792 			if (*s == '\0')
1793 				break;
1794 			n++;
1795 			t = s;
1796 			do
1797 				s++;
1798 			while (*s != '\0' && !ISWS(*s));
1799 			temp = *s;
1800 			setptr(s, '\0');
1801 			snprintf(num, sizeof(num), "%d", n);
1802 			if (is_number(t, & result))
1803 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1804 			else
1805 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1806 			setptr(s, temp);
1807 			if (*s != '\0')
1808 				s++;
1809 		}
1810 
1811 	} else if (sep == 0) {	/* new: split(s, a, "") => 1 char/elem */
1812 		for (n = 0; *s != '\0'; s += u8_nextlen(s)) {
1813 			char buf[10];
1814 			n++;
1815 			snprintf(num, sizeof(num), "%d", n);
1816 
1817 			for (j = 0; j < u8_nextlen(s); j++) {
1818 				buf[j] = s[j];
1819 			}
1820 			buf[j] = '\0';
1821 
1822 			if (isdigit((uschar)buf[0]))
1823 				setsymtab(num, buf, atof(buf), STR|NUM, (Array *) ap->sval);
1824 			else
1825 				setsymtab(num, buf, 0.0, STR, (Array *) ap->sval);
1826 		}
1827 
1828 	} else if (*s != '\0') {  /* some random single character */
1829 		for (;;) {
1830 			n++;
1831 			t = s;
1832 			while (*s != sep && *s != '\0')
1833 				s++;
1834 			temp = *s;
1835 			setptr(s, '\0');
1836 			snprintf(num, sizeof(num), "%d", n);
1837 			if (is_number(t, & result))
1838 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1839 			else
1840 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1841 			setptr(s, temp);
1842 			if (*s++ == '\0')
1843 				break;
1844 		}
1845 	}
1846 	tempfree(ap);
1847 	xfree(origs);
1848 	xfree(origfs);
1849 	x = gettemp();
1850 	x->tval = NUM;
1851 	x->fval = n;
1852 	return(x);
1853 }
1854 
condexpr(Node ** a,int n)1855 Cell *condexpr(Node **a, int n)	/* a[0] ? a[1] : a[2] */
1856 {
1857 	Cell *x;
1858 
1859 	x = execute(a[0]);
1860 	if (istrue(x)) {
1861 		tempfree(x);
1862 		x = execute(a[1]);
1863 	} else {
1864 		tempfree(x);
1865 		x = execute(a[2]);
1866 	}
1867 	return(x);
1868 }
1869 
ifstat(Node ** a,int n)1870 Cell *ifstat(Node **a, int n)	/* if (a[0]) a[1]; else a[2] */
1871 {
1872 	Cell *x;
1873 
1874 	x = execute(a[0]);
1875 	if (istrue(x)) {
1876 		tempfree(x);
1877 		x = execute(a[1]);
1878 	} else if (a[2] != NULL) {
1879 		tempfree(x);
1880 		x = execute(a[2]);
1881 	}
1882 	return(x);
1883 }
1884 
whilestat(Node ** a,int n)1885 Cell *whilestat(Node **a, int n)	/* while (a[0]) a[1] */
1886 {
1887 	Cell *x;
1888 
1889 	for (;;) {
1890 		x = execute(a[0]);
1891 		if (!istrue(x))
1892 			return(x);
1893 		tempfree(x);
1894 		x = execute(a[1]);
1895 		if (isbreak(x)) {
1896 			x = True;
1897 			return(x);
1898 		}
1899 		if (isnext(x) || isexit(x) || isret(x))
1900 			return(x);
1901 		tempfree(x);
1902 	}
1903 }
1904 
dostat(Node ** a,int n)1905 Cell *dostat(Node **a, int n)	/* do a[0]; while(a[1]) */
1906 {
1907 	Cell *x;
1908 
1909 	for (;;) {
1910 		x = execute(a[0]);
1911 		if (isbreak(x))
1912 			return True;
1913 		if (isnext(x) || isexit(x) || isret(x))
1914 			return(x);
1915 		tempfree(x);
1916 		x = execute(a[1]);
1917 		if (!istrue(x))
1918 			return(x);
1919 		tempfree(x);
1920 	}
1921 }
1922 
forstat(Node ** a,int n)1923 Cell *forstat(Node **a, int n)	/* for (a[0]; a[1]; a[2]) a[3] */
1924 {
1925 	Cell *x;
1926 
1927 	x = execute(a[0]);
1928 	tempfree(x);
1929 	for (;;) {
1930 		if (a[1]!=NULL) {
1931 			x = execute(a[1]);
1932 			if (!istrue(x)) return(x);
1933 			else tempfree(x);
1934 		}
1935 		x = execute(a[3]);
1936 		if (isbreak(x))		/* turn off break */
1937 			return True;
1938 		if (isnext(x) || isexit(x) || isret(x))
1939 			return(x);
1940 		tempfree(x);
1941 		x = execute(a[2]);
1942 		tempfree(x);
1943 	}
1944 }
1945 
instat(Node ** a,int n)1946 Cell *instat(Node **a, int n)	/* for (a[0] in a[1]) a[2] */
1947 {
1948 	Cell *x, *vp, *arrayp, *cp, *ncp;
1949 	Array *tp;
1950 	int i;
1951 
1952 	vp = execute(a[0]);
1953 	arrayp = execute(a[1]);
1954 	if (!isarr(arrayp)) {
1955 		return True;
1956 	}
1957 	tp = (Array *) arrayp->sval;
1958 	tempfree(arrayp);
1959 	for (i = 0; i < tp->size; i++) {	/* this routine knows too much */
1960 		for (cp = tp->tab[i]; cp != NULL; cp = ncp) {
1961 			setsval(vp, cp->nval);
1962 			ncp = cp->cnext;
1963 			x = execute(a[2]);
1964 			if (isbreak(x)) {
1965 				tempfree(vp);
1966 				return True;
1967 			}
1968 			if (isnext(x) || isexit(x) || isret(x)) {
1969 				tempfree(vp);
1970 				return(x);
1971 			}
1972 			tempfree(x);
1973 		}
1974 	}
1975 	return True;
1976 }
1977 
nawk_convert(const char * s,int (* fun_c)(int),wint_t (* fun_wc)(wint_t))1978 static char *nawk_convert(const char *s, int (*fun_c)(int),
1979     wint_t (*fun_wc)(wint_t))
1980 {
1981 	char *buf      = NULL;
1982 	char *pbuf     = NULL;
1983 	const char *ps = NULL;
1984 	size_t n       = 0;
1985 	wchar_t wc;
1986 	const size_t sz = awk_mb_cur_max;
1987 
1988 	if (sz == 1) {
1989 		buf = tostring(s);
1990 
1991 		for (pbuf = buf; *pbuf; pbuf++)
1992 			*pbuf = fun_c((uschar)*pbuf);
1993 
1994 		return buf;
1995 	} else {
1996 		/* upper/lower character may be shorter/longer */
1997 		buf = tostringN(s, strlen(s) * sz + 1);
1998 
1999 		/* reset internal state */
2000 		if (mbtowc(NULL, NULL, 0) == -1 || wctomb(NULL, L'\0') == -1)
2001 			FATAL("unable to reset character conversion state");
2002 
2003 		ps   = s;
2004 		pbuf = buf;
2005 		while (n = mbtowc(&wc, ps, sz),
2006 		       n > 0 && n != (size_t)-1 && n != (size_t)-2)
2007 		{
2008 			ps += n;
2009 
2010 			n = wctomb(pbuf, fun_wc(wc));
2011 			if (n == (size_t)-1)
2012 				FATAL("illegal wide character %s", s);
2013 
2014 			pbuf += n;
2015 		}
2016 
2017 		*pbuf = '\0';
2018 
2019 		if (n)
2020 			FATAL("illegal byte sequence %s", s);
2021 
2022 		return buf;
2023 	}
2024 }
2025 
#ifdef __DJGPP__
/*
 * Replacements compiled only when __DJGPP__ is defined: values in
 * 0..255 are mapped with the single-byte toupper()/tolower(), anything
 * else is returned unchanged.
 */
static wint_t towupper(wint_t wc)
{
	if (!(wc >= 0 && wc < 256))
		return wc;

	return toupper(wc & 0xFF);
}

static wint_t towlower(wint_t wc)
{
	if (!(wc >= 0 && wc < 256))
		return wc;

	return tolower(wc & 0xFF);
}
#endif
2043 
/* nawk_toupper: newly allocated upper-case copy of s, via nawk_convert(). */
static char *nawk_toupper(const char *s)
{
	char *upper = nawk_convert(s, toupper, towupper);

	return upper;
}
2048 
/* nawk_tolower: newly allocated lower-case copy of s, via nawk_convert(). */
static char *nawk_tolower(const char *s)
{
	char *lower = nawk_convert(s, tolower, towlower);

	return lower;
}
2053 
bltin(Node ** a,int n)2054 Cell *bltin(Node **a, int n)	/* builtin functions. a[0] is type, a[1] is arg list */
2055 {
2056 	Cell *x, *y;
2057 	Awkfloat u = 0;
2058 	int t, sz;
2059 	Awkfloat tmp;
2060 	char *buf, *fmt;
2061 	Node *nextarg;
2062 	FILE *fp;
2063 	int status = 0;
2064 	time_t tv;
2065 	struct tm *tm, tmbuf;
2066 	int estatus = 0;
2067 
2068 	t = ptoi(a[0]);
2069 	x = execute(a[1]);
2070 	nextarg = a[1]->nnext;
2071 	switch (t) {
2072 	case FLENGTH:
2073 		if (isarr(x))
2074 			u = ((Array *) x->sval)->nelem;	/* GROT.  should be function*/
2075 		else
2076 			u = u8_strlen(getsval(x));
2077 		break;
2078 	case FLOG:
2079 		errno = 0;
2080 		u = errcheck(log(getfval(x)), "log");
2081 		break;
2082 	case FINT:
2083 		modf(getfval(x), &u); break;
2084 	case FEXP:
2085 		errno = 0;
2086 		u = errcheck(exp(getfval(x)), "exp");
2087 		break;
2088 	case FSQRT:
2089 		errno = 0;
2090 		u = errcheck(sqrt(getfval(x)), "sqrt");
2091 		break;
2092 	case FSIN:
2093 		u = sin(getfval(x)); break;
2094 	case FCOS:
2095 		u = cos(getfval(x)); break;
2096 	case FATAN:
2097 		if (nextarg == NULL) {
2098 			WARNING("atan2 requires two arguments; returning 1.0");
2099 			u = 1.0;
2100 		} else {
2101 			y = execute(a[1]->nnext);
2102 			u = atan2(getfval(x), getfval(y));
2103 			tempfree(y);
2104 			nextarg = nextarg->nnext;
2105 		}
2106 		break;
2107 	case FCOMPL:
2108 		u = ~((int)getfval(x));
2109 		break;
2110 	case FAND:
2111 		if (nextarg == 0) {
2112 			WARNING("and requires two arguments; returning 0");
2113 			u = 0;
2114 			break;
2115 		}
2116 		y = execute(a[1]->nnext);
2117 		u = ((int)getfval(x)) & ((int)getfval(y));
2118 		tempfree(y);
2119 		nextarg = nextarg->nnext;
2120 		break;
2121 	case FFOR:
2122 		if (nextarg == 0) {
2123 			WARNING("or requires two arguments; returning 0");
2124 			u = 0;
2125 			break;
2126 		}
2127 		y = execute(a[1]->nnext);
2128 		u = ((int)getfval(x)) | ((int)getfval(y));
2129 		tempfree(y);
2130 		nextarg = nextarg->nnext;
2131 		break;
2132 	case FXOR:
2133 		if (nextarg == 0) {
2134 			WARNING("xor requires two arguments; returning 0");
2135 			u = 0;
2136 			break;
2137 		}
2138 		y = execute(a[1]->nnext);
2139 		u = ((int)getfval(x)) ^ ((int)getfval(y));
2140 		tempfree(y);
2141 		nextarg = nextarg->nnext;
2142 		break;
2143 	case FLSHIFT:
2144 		if (nextarg == 0) {
2145 			WARNING("lshift requires two arguments; returning 0");
2146 			u = 0;
2147 			break;
2148 		}
2149 		y = execute(a[1]->nnext);
2150 		u = ((int)getfval(x)) << ((int)getfval(y));
2151 		tempfree(y);
2152 		nextarg = nextarg->nnext;
2153 		break;
2154 	case FRSHIFT:
2155 		if (nextarg == 0) {
2156 			WARNING("rshift requires two arguments; returning 0");
2157 			u = 0;
2158 			break;
2159 		}
2160 		y = execute(a[1]->nnext);
2161 		u = ((int)getfval(x)) >> ((int)getfval(y));
2162 		tempfree(y);
2163 		nextarg = nextarg->nnext;
2164 		break;
2165 	case FSYSTEM:
2166 		fflush(stdout);		/* in case something is buffered already */
2167 		estatus = status = system(getsval(x));
2168 		if (status != -1) {
2169 			if (WIFEXITED(status)) {
2170 				estatus = WEXITSTATUS(status);
2171 			} else if (WIFSIGNALED(status)) {
2172 				estatus = WTERMSIG(status) + 256;
2173 #ifdef WCOREDUMP
2174 				if (WCOREDUMP(status))
2175 					estatus += 256;
2176 #endif
2177 			} else	/* something else?!? */
2178 				estatus = 0;
2179 		}
2180 		/* else estatus was set to -1 */
2181 		u = estatus;
2182 		break;
2183 	case FRAND:
2184 		/* random() returns numbers in [0..2^31-1]
2185 		 * in order to get a number in [0, 1), divide it by 2^31
2186 		 */
2187 		u = (Awkfloat) random() / (0x7fffffffL + 0x1UL);
2188 		break;
2189 	case FSRAND:
2190 		if (isrec(x)) {		/* no argument provided */
2191 			u = time(NULL);
2192 			tmp = u;
2193 			srandom((unsigned int) u);
2194 		} else {
2195 			u = getfval(x);
2196 			tmp = u;
2197 			srandom_deterministic((unsigned int) u);
2198 		}
2199 		u = srand_seed;
2200 		srand_seed = tmp;
2201 		break;
2202 	case FTOUPPER:
2203 	case FTOLOWER:
2204 		if (t == FTOUPPER)
2205 			buf = nawk_toupper(getsval(x));
2206 		else
2207 			buf = nawk_tolower(getsval(x));
2208 		tempfree(x);
2209 		x = gettemp();
2210 		setsval(x, buf);
2211 		free(buf);
2212 		return x;
2213 	case FFLUSH:
2214 		if (isrec(x) || strlen(getsval(x)) == 0) {
2215 			flush_all();	/* fflush() or fflush("") -> all */
2216 			u = 0;
2217 		} else if ((fp = openfile(FFLUSH, getsval(x), NULL)) == NULL)
2218 			u = EOF;
2219 		else
2220 			u = fflush(fp);
2221 		break;
2222 	case FMKTIME:
2223 		memset(&tmbuf, 0, sizeof(tmbuf));
2224 		tm = &tmbuf;
2225 		t = sscanf(getsval(x), "%d %d %d %d %d %d %d",
2226 		    &tm->tm_year, &tm->tm_mon, &tm->tm_mday, &tm->tm_hour,
2227 		    &tm->tm_min, &tm->tm_sec, &tm->tm_isdst);
2228 		switch (t) {
2229 		case 6:
2230 			tm->tm_isdst = -1;	/* let mktime figure it out */
2231 			/* FALLTHROUGH */
2232 		case 7:
2233 			tm->tm_year -= 1900;
2234 			tm->tm_mon--;
2235 			u = mktime(tm);
2236 			break;
2237 		default:
2238 			u = -1;
2239 			break;
2240 		}
2241 		break;
2242 	case FSYSTIME:
2243 		u = time((time_t *) 0);
2244 		break;
2245 	case FSTRFTIME:
2246 		/* strftime([format [,timestamp]]) */
2247 		if (nextarg) {
2248 			y = execute(nextarg);
2249 			nextarg = nextarg->nnext;
2250 			tv = (time_t) getfval(y);
2251 			tempfree(y);
2252 		} else
2253 			tv = time((time_t *) 0);
2254 		tm = localtime(&tv);
2255 		if (tm == NULL)
2256 			FATAL("bad time %ld", (long)tv);
2257 
2258 		if (isrec(x)) {
2259 			/* format argument not provided, use default */
2260 			fmt = tostring("%a %b %d %H:%M:%S %Z %Y");
2261 		} else
2262 			fmt = tostring(getsval(x));
2263 
2264 		sz = 32;
2265 		buf = NULL;
2266 		do {
2267 			if ((buf = (char *) reallocarray(buf, 2, sz)) == NULL)
2268 				FATAL("out of memory in strftime");
2269 			sz *= 2;
2270 		} while (strftime(buf, sz, fmt, tm) == 0 && fmt[0] != '\0');
2271 
2272 		y = gettemp();
2273 		setsval(y, buf);
2274 		free(fmt);
2275 		free(buf);
2276 
2277 		return y;
2278 	default:	/* can't happen */
2279 		FATAL("illegal function type %d", t);
2280 		break;
2281 	}
2282 	tempfree(x);
2283 	x = gettemp();
2284 	setfval(x, u);
2285 	if (nextarg != NULL) {
2286 		WARNING("warning: function has too many arguments");
2287 		for ( ; nextarg; nextarg = nextarg->nnext) {
2288 			y = execute(nextarg);
2289 			tempfree(y);
2290 		}
2291 	}
2292 	return(x);
2293 }
2294 
printstat(Node ** a,int n)2295 Cell *printstat(Node **a, int n)	/* print a[0] */
2296 {
2297 	Node *x;
2298 	Cell *y;
2299 	FILE *fp;
2300 
2301 	if (a[1] == NULL)	/* a[1] is redirection operator, a[2] is file */
2302 		fp = stdout;
2303 	else
2304 		fp = redirect(ptoi(a[1]), a[2]);
2305 	for (x = a[0]; x != NULL; x = x->nnext) {
2306 		y = execute(x);
2307 		fputs(getpssval(y), fp);
2308 		tempfree(y);
2309 		if (x->nnext == NULL)
2310 			fputs(getsval(orsloc), fp);
2311 		else
2312 			fputs(getsval(ofsloc), fp);
2313 	}
2314 	if (a[1] != NULL)
2315 		fflush(fp);
2316 	if (ferror(fp))
2317 		FATAL("write error on %s", filename(fp));
2318 	return(True);
2319 }
2320 
nullproc(Node ** a,int n)2321 Cell *nullproc(Node **a, int n)
2322 {
2323 	return 0;
2324 }
2325 
2326 
/*
 * redirect - set up an i/o redirection.
 *
 * a is the redirection operator handed on to openfile(); b is the
 * expression whose string value names the target.  Returns the stream
 * from openfile(); dies via FATAL when the target cannot be opened.
 */
FILE *redirect(int a, Node *b)	/* set up all i/o redirections */
{
	Cell *target = execute(b);
	char *path = getsval(target);
	FILE *stream = openfile(a, path, NULL);

	if (stream == NULL)
		FATAL("can't open file %s", path);
	tempfree(target);
	return stream;
}
2341 
/* One slot per stream that awk has opened (or inherited). */
struct files {
	FILE	*fp;		/* stdio stream; openfile() treats NULL as a free slot */
	const char	*fname;	/* name the stream was opened under */
	int	mode;		/* '|', 'a', 'w' => LE/LT, GT */
};

struct files *files;	/* table of stream slots, allocated by stdinit() */

size_t nfiles;		/* number of slots in files[] */
2349 
stdinit(void)2350 static void stdinit(void)	/* in case stdin, etc., are not constants */
2351 {
2352 	nfiles = FOPEN_MAX;
2353 	files = (struct files *) calloc(nfiles, sizeof(*files));
2354 	if (files == NULL)
2355 		FATAL("can't allocate file memory for %zu files", nfiles);
2356         files[0].fp = stdin;
2357 	files[0].fname = tostring("/dev/stdin");
2358 	files[0].mode = LT;
2359         files[1].fp = stdout;
2360 	files[1].fname = tostring("/dev/stdout");
2361 	files[1].mode = GT;
2362         files[2].fp = stderr;
2363 	files[2].fname = tostring("/dev/stderr");
2364 	files[2].mode = GT;
2365 }
2366 
openfile(int a,const char * us,bool * pnewflag)2367 FILE *openfile(int a, const char *us, bool *pnewflag)
2368 {
2369 	const char *s = us;
2370 	size_t i;
2371 	int m;
2372 	FILE *fp = NULL;
2373 
2374 	if (*s == '\0')
2375 		FATAL("null file name in print or getline");
2376 	for (i = 0; i < nfiles; i++)
2377 		if (files[i].fname && strcmp(s, files[i].fname) == 0 &&
2378 		    (a == files[i].mode || (a==APPEND && files[i].mode==GT) ||
2379 		     a == FFLUSH)) {
2380 			if (pnewflag)
2381 				*pnewflag = false;
2382 			return files[i].fp;
2383 		}
2384 	if (a == FFLUSH)	/* didn't find it, so don't create it! */
2385 		return NULL;
2386 
2387 	for (i = 0; i < nfiles; i++)
2388 		if (files[i].fp == NULL)
2389 			break;
2390 	if (i >= nfiles) {
2391 		struct files *nf;
2392 		size_t nnf = nfiles + FOPEN_MAX;
2393 		nf = (struct files *) reallocarray(files, nnf, sizeof(*nf));
2394 		if (nf == NULL)
2395 			FATAL("cannot grow files for %s and %zu files", s, nnf);
2396 		memset(&nf[nfiles], 0, FOPEN_MAX * sizeof(*nf));
2397 		nfiles = nnf;
2398 		files = nf;
2399 	}
2400 	fflush(stdout);	/* force a semblance of order */
2401 	m = a;
2402 	if (a == GT) {
2403 		fp = fopen(s, "w");
2404 	} else if (a == APPEND) {
2405 		fp = fopen(s, "a");
2406 		m = GT;	/* so can mix > and >> */
2407 	} else if (a == '|') {	/* output pipe */
2408 		fp = popen(s, "w");
2409 	} else if (a == LE) {	/* input pipe */
2410 		fp = popen(s, "r");
2411 	} else if (a == LT) {	/* getline <file */
2412 		fp = strcmp(s, "-") == 0 ? stdin : fopen(s, "r");	/* "-" is stdin */
2413 	} else	/* can't happen */
2414 		FATAL("illegal redirection %d", a);
2415 	if (fp != NULL) {
2416 		files[i].fname = tostring(s);
2417 		files[i].fp = fp;
2418 		files[i].mode = m;
2419 		if (pnewflag)
2420 			*pnewflag = true;
2421 		if (fp != stdin && fp != stdout && fp != stderr)
2422 			(void) fcntl(fileno(fp), F_SETFD, FD_CLOEXEC);
2423 	}
2424 	return fp;
2425 }
2426 
filename(FILE * fp)2427 const char *filename(FILE *fp)
2428 {
2429 	size_t i;
2430 
2431 	for (i = 0; i < nfiles; i++)
2432 		if (fp == files[i].fp)
2433 			return files[i].fname;
2434 	return "???";
2435 }
2436 
closefile(Node ** a,int n)2437 Cell *closefile(Node **a, int n)
2438 {
2439  	Cell *x;
2440 	size_t i;
2441 	bool stat;
2442 
2443  	x = execute(a[0]);
2444  	getsval(x);
2445 	stat = true;
2446  	for (i = 0; i < nfiles; i++) {
2447 		if (!files[i].fname || strcmp(x->sval, files[i].fname) != 0)
2448 			continue;
2449 		if (files[i].mode == GT || files[i].mode == '|')
2450 			fflush(files[i].fp);
2451 		if (ferror(files[i].fp)) {
2452 			if ((files[i].mode == GT && files[i].fp != stderr)
2453 			  || files[i].mode == '|')
2454 				FATAL("write error on %s", files[i].fname);
2455 			else
2456 				WARNING("i/o error occurred on %s", files[i].fname);
2457 		}
2458 		if (files[i].fp == stdin || files[i].fp == stdout ||
2459 		    files[i].fp == stderr)
2460 			stat = freopen("/dev/null", "r+", files[i].fp) == NULL;
2461 		else if (files[i].mode == '|' || files[i].mode == LE)
2462 			stat = pclose(files[i].fp) == -1;
2463 		else
2464 			stat = fclose(files[i].fp) == EOF;
2465 		if (stat)
2466 			WARNING("i/o error occurred closing %s", files[i].fname);
2467 		xfree(files[i].fname);
2468 		files[i].fname = NULL;	/* watch out for ref thru this */
2469 		files[i].fp = NULL;
2470 		break;
2471  	}
2472  	tempfree(x);
2473  	x = gettemp();
2474 	setfval(x, (Awkfloat) (stat ? -1 : 0));
2475  	return(x);
2476 }
2477 
closeall(void)2478 void closeall(void)
2479 {
2480 	size_t i;
2481 	bool stat = false;
2482 
2483 	for (i = 0; i < nfiles; i++) {
2484 		if (! files[i].fp)
2485 			continue;
2486 		if (files[i].mode == GT || files[i].mode == '|')
2487 			fflush(files[i].fp);
2488 		if (ferror(files[i].fp)) {
2489 			if ((files[i].mode == GT && files[i].fp != stderr)
2490 			  || files[i].mode == '|')
2491 				FATAL("write error on %s", files[i].fname);
2492 			else
2493 				WARNING("i/o error occurred on %s", files[i].fname);
2494 		}
2495 		if (files[i].fp == stdin || files[i].fp == stdout ||
2496 		    files[i].fp == stderr)
2497 			continue;
2498 		if (files[i].mode == '|' || files[i].mode == LE)
2499 			stat = pclose(files[i].fp) == -1;
2500 		else
2501 			stat = fclose(files[i].fp) == EOF;
2502 		if (stat)
2503 			WARNING("i/o error occurred while closing %s", files[i].fname);
2504 	}
2505 }
2506 
flush_all(void)2507 static void flush_all(void)
2508 {
2509 	size_t i;
2510 
2511 	for (i = 0; i < nfiles; i++)
2512 		if (files[i].fp)
2513 			fflush(files[i].fp);
2514 }
2515 
2516 void backsub(char **pb_ptr, const char **sptr_ptr);
2517 
dosub(Node ** a,int subop)2518 Cell *dosub(Node **a, int subop)        /* sub and gsub */
2519 {
2520 	fa *pfa;
2521 	int tempstat = 0;
2522 	char *repl;
2523 	Cell *x;
2524 
2525 	char *buf = NULL;
2526 	char *pb = NULL;
2527 	int bufsz = recsize;
2528 
2529 	const char *r, *s;
2530 	const char *start;
2531 	const char *noempty = NULL;      /* empty match disallowed here */
2532 	size_t m = 0;                    /* match count */
2533 	size_t whichm = 0;               /* which match to select, 0 = global */
2534 	int mtype;                       /* match type */
2535 
2536 	if (a[0] == NULL) {	/* 0 => a[1] is already-compiled regexpr */
2537 		pfa = (fa *) a[1];
2538 	} else {
2539 		x = execute(a[1]);
2540 		pfa = makedfa(getsval(x), 1);
2541 		tempfree(x);
2542 	}
2543 
2544 	x = execute(a[2]);	/* replacement string */
2545 	repl = tostring(getsval(x));
2546 	tempfree(x);
2547 
2548 	switch (subop) {
2549 	case SUB:
2550 		whichm = 1;
2551 		x = execute(a[3]);    /* source string */
2552 		break;
2553 	case GSUB:
2554 		whichm = 0;
2555 		x = execute(a[3]);    /* source string */
2556 		break;
2557 	default:
2558 		FATAL("dosub: unrecognized subop: %d", subop);
2559 	}
2560 
2561 	start = getsval(x);
2562 	while (pmatch(pfa, start)) {
2563 		if (buf == NULL) {
2564 			if ((pb = buf = (char *) malloc(bufsz)) == NULL)
2565 				FATAL("out of memory in dosub");
2566 			tempstat = pfa->initstat;
2567 			pfa->initstat = 2;
2568 		}
2569 
2570 		/* match types */
2571 		#define	MT_IGNORE  0  /* unselected or invalid */
2572 		#define MT_INSERT  1  /* selected, empty */
2573 		#define MT_REPLACE 2  /* selected, not empty */
2574 
2575 		/* an empty match just after replacement is invalid */
2576 
2577 		if (patbeg == noempty && patlen == 0) {
2578 			mtype = MT_IGNORE;    /* invalid, not counted */
2579 		} else if (whichm == ++m || whichm == 0) {
2580 			mtype = patlen ? MT_REPLACE : MT_INSERT;
2581 		} else {
2582 			mtype = MT_IGNORE;    /* unselected, but counted */
2583 		}
2584 
2585 		/* leading text: */
2586 		if (patbeg > start) {
2587 			adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - start),
2588 				recsize, &pb, "dosub");
2589 			s = start;
2590 			while (s < patbeg)
2591 				*pb++ = *s++;
2592 		}
2593 
2594 		if (mtype == MT_IGNORE)
2595 			goto matching_text;  /* skip replacement text */
2596 
2597 		r = repl;
2598 		while (*r != 0) {
2599 			adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "dosub");
2600 			if (*r == '\\') {
2601 				backsub(&pb, &r);
2602 			} else if (*r == '&') {
2603 				r++;
2604 				adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize,
2605 					&pb, "dosub");
2606 				for (s = patbeg; s < patbeg+patlen; )
2607 					*pb++ = *s++;
2608 			} else {
2609 				*pb++ = *r++;
2610 			}
2611 		}
2612 
2613 matching_text:
2614 		if (mtype == MT_REPLACE || *patbeg == '\0')
2615 			goto next_search;  /* skip matching text */
2616 
2617 		if (patlen == 0)
2618 			patlen = u8_nextlen(patbeg);
2619 		adjbuf(&buf, &bufsz, (pb-buf) + patlen, recsize, &pb, "dosub");
2620 		s = patbeg;
2621 		while (s < patbeg + patlen)
2622 			*pb++ = *s++;
2623 
2624 next_search:
2625 		start = patbeg + patlen;
2626 		if (m == whichm || *patbeg == '\0')
2627 			break;
2628 		if (mtype == MT_REPLACE)
2629 			noempty = start;
2630 
2631 		#undef MT_IGNORE
2632 		#undef MT_INSERT
2633 		#undef MT_REPLACE
2634 	}
2635 
2636 	xfree(repl);
2637 
2638 	if (buf != NULL) {
2639 		pfa->initstat = tempstat;
2640 
2641 		/* trailing text */
2642 		adjbuf(&buf, &bufsz, 1+strlen(start)+pb-buf, 0, &pb, "dosub");
2643 		while ((*pb++ = *start++) != '\0')
2644 			;
2645 
2646 		setsval(x, buf);
2647 		free(buf);
2648 	}
2649 
2650 	tempfree(x);
2651 	x = gettemp();
2652 	x->tval = NUM;
2653 	x->fval = m;
2654 	return x;
2655 }
2656 
gensub(Node ** a,int nnn)2657 Cell *gensub(Node **a, int nnn)	/* global selective substitute */
2658 	/* XXX incomplete - doesn't support backreferences \0 ... \9 */
2659 {
2660 	Cell *x, *y, *res, *h;
2661 	char *rptr;
2662 	const char *sptr;
2663 	char *buf, *pb;
2664 	const char *t, *q;
2665 	fa *pfa;
2666 	int mflag, tempstat, num, whichm;
2667 	int bufsz = recsize;
2668 
2669 	if ((buf = (char *) malloc(bufsz)) == NULL)
2670 		FATAL("out of memory in gensub");
2671 	mflag = 0;	/* if mflag == 0, can replace empty string */
2672 	num = 0;
2673 	x = execute(a[4]);	/* source string */
2674 	t = getsval(x);
2675 	res = copycell(x);	/* target string - initially copy of source */
2676 	res->csub = CTEMP;	/* result values are temporary */
2677 	if (a[0] == 0)		/* 0 => a[1] is already-compiled regexpr */
2678 		pfa = (fa *) a[1];	/* regular expression */
2679 	else {
2680 		y = execute(a[1]);
2681 		pfa = makedfa(getsval(y), 1);
2682 		tempfree(y);
2683 	}
2684 	y = execute(a[2]);	/* replacement string */
2685 	h = execute(a[3]);	/* which matches should be replaced */
2686 	sptr = getsval(h);
2687 	if (sptr[0] == 'g' || sptr[0] == 'G')
2688 		whichm = -1;
2689 	else {
2690 		/*
2691 		 * The specified number is index of replacement, starting
2692 		 * from 1. GNU awk treats index lower than 0 same as
2693 		 * 1, we do same for compatibility.
2694 		 */
2695 		whichm = (int) getfval(h) - 1;
2696 		if (whichm < 0)
2697 			whichm = 0;
2698 	}
2699 	tempfree(h);
2700 
2701 	if (pmatch(pfa, t)) {
2702 		char *sl;
2703 
2704 		tempstat = pfa->initstat;
2705 		pfa->initstat = 2;
2706 		pb = buf;
2707 		rptr = getsval(y);
2708 		/*
2709 		 * XXX if there are any backreferences in subst string,
2710 		 * complain now.
2711 		 */
2712 		for (sl = rptr; (sl = strchr(sl, '\\')) && sl[1]; sl++) {
2713 			if (strchr("0123456789", sl[1])) {
2714 				FATAL("gensub doesn't support backreferences (subst \"%s\")", rptr);
2715 			}
2716 		}
2717 
2718 		do {
2719 			if (whichm >= 0 && whichm != num) {
2720 				num++;
2721 				adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - t) + patlen, recsize, &pb, "gensub");
2722 
2723 				/* copy the part of string up to and including
2724 				 * match to output buffer */
2725 				while (t < patbeg + patlen)
2726 					*pb++ = *t++;
2727 				continue;
2728 			}
2729 
2730 			if (patlen == 0 && *patbeg != 0) {	/* matched empty string */
2731 				if (mflag == 0) {	/* can replace empty */
2732 					num++;
2733 					sptr = rptr;
2734 					while (*sptr != 0) {
2735 						adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
2736 						if (*sptr == '\\') {
2737 							backsub(&pb, &sptr);
2738 						} else if (*sptr == '&') {
2739 							sptr++;
2740 							adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
2741 							for (q = patbeg; q < patbeg+patlen; )
2742 								*pb++ = *q++;
2743 						} else
2744 							*pb++ = *sptr++;
2745 					}
2746 				}
2747 				if (*t == 0)	/* at end */
2748 					goto done;
2749 				adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gensub");
2750 				*pb++ = *t++;
2751 				if (pb > buf + bufsz)	/* BUG: not sure of this test */
2752 					FATAL("gensub result0 %.30s too big; can't happen", buf);
2753 				mflag = 0;
2754 			}
2755 			else {	/* matched nonempty string */
2756 				num++;
2757 				sptr = t;
2758 				adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gensub");
2759 				while (sptr < patbeg)
2760 					*pb++ = *sptr++;
2761 				sptr = rptr;
2762 				while (*sptr != 0) {
2763 					adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
2764 					if (*sptr == '\\') {
2765 						backsub(&pb, &sptr);
2766 					} else if (*sptr == '&') {
2767 						sptr++;
2768 						adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
2769 						for (q = patbeg; q < patbeg+patlen; )
2770 							*pb++ = *q++;
2771 					} else
2772 						*pb++ = *sptr++;
2773 				}
2774 				t = patbeg + patlen;
2775 				if (patlen == 0 || *t == 0 || *(t-1) == 0)
2776 					goto done;
2777 				if (pb > buf + bufsz)
2778 					FATAL("gensub result1 %.30s too big; can't happen", buf);
2779 				mflag = 1;
2780 			}
2781 		} while (pmatch(pfa,t));
2782 		sptr = t;
2783 		adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gensub");
2784 		while ((*pb++ = *sptr++) != 0)
2785 			;
2786 	done:	if (pb > buf + bufsz)
2787 			FATAL("gensub result2 %.30s too big; can't happen", buf);
2788 		*pb = '\0';
2789 		setsval(res, buf);
2790 		pfa->initstat = tempstat;
2791 	}
2792 	tempfree(x);
2793 	tempfree(y);
2794 	free(buf);
2795 	return(res);
2796 }
2797 
backsub(char ** pb_ptr,const char ** sptr_ptr)2798 void backsub(char **pb_ptr, const char **sptr_ptr)	/* handle \\& variations */
2799 {						/* sptr[0] == '\\' */
2800 	char *pb = *pb_ptr;
2801 	const char *sptr = *sptr_ptr;
2802 
2803 	if (sptr[1] == '\\') {
2804 		if (sptr[2] == '\\' && sptr[3] == '&') { /* \\\& -> \& */
2805 			*pb++ = '\\';
2806 			*pb++ = '&';
2807 			sptr += 4;
2808 		} else if (sptr[2] == '&') {	/* \\& -> \ + matched */
2809 			*pb++ = '\\';
2810 			sptr += 2;
2811 		} else if (do_posix) {		/* \\x -> \x */
2812 			sptr++;
2813 			*pb++ = *sptr++;
2814 		} else {			/* \\x -> \\x */
2815 			*pb++ = *sptr++;
2816 			*pb++ = *sptr++;
2817 		}
2818 	} else if (sptr[1] == '&') {	/* literal & */
2819 		sptr++;
2820 		*pb++ = *sptr++;
2821 	} else				/* literal \ */
2822 		*pb++ = *sptr++;
2823 
2824 	*pb_ptr = pb;
2825 	*sptr_ptr = sptr;
2826 }
2827 
/*
 * wide_char_to_byte_str - encode a code point as UTF-8.
 *
 * Returns a pointer to a static, NUL-terminated buffer holding the
 * encoding of rune and stores the number of encoded bytes (excluding
 * the NUL) in *outlen.  The buffer is overwritten by the next call.
 * Returns NULL, leaving *outlen untouched, when rune is negative or
 * greater than 0x10FFFF.
 */
static char *wide_char_to_byte_str(int rune, size_t *outlen)
{
	static char utf8[5];	/* at most 4 encoded bytes plus the NUL */
	int nbytes;

	if (rune < 0 || rune > 0x10FFFF)
		return NULL;

	memset(utf8, 0, sizeof(utf8));

	if (rune < 0x80) {
		/* 0xxxxxxx */
		utf8[0] = rune;
		nbytes = 1;
	} else if (rune < 0x800) {
		/* 110xxxxx 10xxxxxx */
		utf8[0] = 0xC0 | (rune >> 6);
		utf8[1] = 0x80 | (rune & 0x3F);
		nbytes = 2;
	} else if (rune < 0x10000) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		utf8[0] = 0xE0 | (rune >> 12);
		utf8[1] = 0x80 | ((rune >> 6) & 0x3F);
		utf8[2] = 0x80 | (rune & 0x3F);
		nbytes = 3;
	} else {
		/* 0x00010000 - 0x10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		utf8[0] = 0xF0 | (rune >> 18);
		utf8[1] = 0x80 | ((rune >> 12) & 0x3F);
		utf8[2] = 0x80 | ((rune >> 6) & 0x3F);
		utf8[3] = 0x80 | (rune & 0x3F);
		nbytes = 4;
	}

	utf8[nbytes] = '\0';
	*outlen = nbytes;

	return utf8;
}
2865