xref: /freebsd/contrib/one-true-awk/run.c (revision eb690a05)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #define DEBUG
26 #include <stdio.h>
27 #include <ctype.h>
28 #include <errno.h>
29 #include <wctype.h>
30 #include <fcntl.h>
31 #include <setjmp.h>
32 #include <limits.h>
33 #include <math.h>
34 #include <string.h>
35 #include <stdlib.h>
36 #include <time.h>
37 #include <sys/types.h>
38 #include <sys/wait.h>
39 #include "awk.h"
40 #include "awkgram.tab.h"
41 
42 
43 static void stdinit(void);
44 static void flush_all(void);
45 static char *wide_char_to_byte_str(int rune, size_t *outlen);
46 
/*
 * tempfree(x): if istemp(x) holds, pass x to tfree(); otherwise do nothing.
 * The macro form is the one that gets compiled.  The function form in the
 * #else branch is a checking variant that additionally warns when an OCELL
 * carries a csub outside CUNK..CFREE.
 */
#if 1
#define tempfree(x)	do { if (istemp(x)) tfree(x); } while (/*CONSTCOND*/0)
#else
void tempfree(Cell *p)
{
	if (p->ctype == OCELL && (p->csub < CUNK || p->csub > CFREE)) {
		WARNING("bad csub %d in Cell %d %s",
			p->csub, p->ctype, p->sval);
	}
	if (istemp(p))
		tfree(p);
}
#endif
59 
60 /* do we really need these? */
61 /* #ifdef _NFILE */
62 /* #ifndef FOPEN_MAX */
63 /* #define FOPEN_MAX _NFILE */
64 /* #endif */
65 /* #endif */
66 /*  */
67 /* #ifndef	FOPEN_MAX */
68 /* #define	FOPEN_MAX	40 */	/* max number of open files */
69 /* #endif */
70 /*  */
71 /* #ifndef RAND_MAX */
72 /* #define RAND_MAX	32767 */	/* all that ansi guarantees */
73 /* #endif */
74 
75 jmp_buf env;
76 extern	int	pairstack[];
77 extern	Awkfloat	srand_seed;
78 
79 Node	*winner = NULL;	/* root of parse tree */
80 Cell	*tmps;		/* free temporary cells for execution */
81 
82 static Cell	truecell	={ OBOOL, BTRUE, 0, 0, 1.0, NUM, NULL, NULL };
83 Cell	*True	= &truecell;
84 static Cell	falsecell	={ OBOOL, BFALSE, 0, 0, 0.0, NUM, NULL, NULL };
85 Cell	*False	= &falsecell;
86 static Cell	breakcell	={ OJUMP, JBREAK, 0, 0, 0.0, NUM, NULL, NULL };
87 Cell	*jbreak	= &breakcell;
88 static Cell	contcell	={ OJUMP, JCONT, 0, 0, 0.0, NUM, NULL, NULL };
89 Cell	*jcont	= &contcell;
90 static Cell	nextcell	={ OJUMP, JNEXT, 0, 0, 0.0, NUM, NULL, NULL };
91 Cell	*jnext	= &nextcell;
92 static Cell	nextfilecell	={ OJUMP, JNEXTFILE, 0, 0, 0.0, NUM, NULL, NULL };
93 Cell	*jnextfile	= &nextfilecell;
94 static Cell	exitcell	={ OJUMP, JEXIT, 0, 0, 0.0, NUM, NULL, NULL };
95 Cell	*jexit	= &exitcell;
96 static Cell	retcell		={ OJUMP, JRET, 0, 0, 0.0, NUM, NULL, NULL };
97 Cell	*jret	= &retcell;
98 static Cell	tempcell	={ OCELL, CTEMP, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };
99 
100 Node	*curnode = NULL;	/* the node being executed, for debugging */
101 
102 /* buffer memory management */
/*
 * adjbuf: make sure a managed buffer holds at least minlen bytes.
 *
 * pbuf:    address of pointer to buffer being managed
 * psiz:    address of buffer size variable
 * minlen:  minimum length of buffer needed
 * quantum: buffer size quantum (0 means no rounding)
 * pbptr:   address of movable pointer into buffer, or 0 if none
 * whatrtn: name of the calling routine if failure should cause fatal error
 *
 * Nothing is touched when the buffer is already large enough.  Otherwise
 * the request is rounded up to a multiple of quantum, the buffer is
 * realloc'ed, and *pbuf, *psiz and (when given) *pbptr are updated so that
 * *pbptr keeps its offset inside the buffer.
 *
 * return   0 for realloc failure, !=0 for success
 */
int adjbuf(char **pbuf, int *psiz, int minlen, int quantum, char **pbptr,
	const char *whatrtn)
{
	char *grown;
	int excess, offset;

	if (minlen <= *psiz)
		return 1;		/* already big enough */

	excess = quantum ? minlen % quantum : 0;
	offset = pbptr ? *pbptr - *pbuf : 0;
	if (excess)			/* round up to next multiple of quantum */
		minlen += quantum - excess;

	grown = (char *) realloc(*pbuf, minlen);
	DPRINTF("adjbuf %s: %d %d (pbuf=%p, tbuf=%p)\n", whatrtn, *psiz, minlen, (void*)*pbuf, (void*)grown);
	if (grown == NULL) {
		if (whatrtn)
			FATAL("out of memory in %s", whatrtn);
		return 0;
	}

	*pbuf = grown;
	*psiz = minlen;
	if (pbptr)
		*pbptr = grown + offset;
	return 1;
}
136 
/*
 * run: execution of the parse tree starts here.
 * Calls stdinit(), executes the tree rooted at a, then calls closeall().
 */
void run(Node *a)
{
	stdinit();
	execute(a);
	closeall();
}
144 
execute(Node * u)145 Cell *execute(Node *u)	/* execute a node of the parse tree */
146 {
147 	Cell *(*proc)(Node **, int);
148 	Cell *x;
149 	Node *a;
150 
151 	if (u == NULL)
152 		return(True);
153 	for (a = u; ; a = a->nnext) {
154 		curnode = a;
155 		if (isvalue(a)) {
156 			x = (Cell *) (a->narg[0]);
157 			if (isfld(x) && !donefld)
158 				fldbld();
159 			else if (isrec(x) && !donerec)
160 				recbld();
161 			return(x);
162 		}
163 		if (notlegal(a->nobj))	/* probably a Cell* but too risky to print */
164 			FATAL("illegal statement");
165 		proc = proctab[a->nobj-FIRSTTOKEN];
166 		x = (*proc)(a->narg, a->nobj);
167 		if (isfld(x) && !donefld)
168 			fldbld();
169 		else if (isrec(x) && !donerec)
170 			recbld();
171 		if (isexpr(a))
172 			return(x);
173 		if (isjump(x))
174 			return(x);
175 		if (a->nnext == NULL)
176 			return(x);
177 		tempfree(x);
178 	}
179 }
180 
181 
program(Node ** a,int n)182 Cell *program(Node **a, int n)	/* execute an awk program */
183 {				/* a[0] = BEGIN, a[1] = body, a[2] = END */
184 	Cell *x;
185 
186 	if (setjmp(env) != 0)
187 		goto ex;
188 	if (a[0]) {		/* BEGIN */
189 		x = execute(a[0]);
190 		if (isexit(x))
191 			return(True);
192 		if (isjump(x))
193 			FATAL("illegal break, continue, next or nextfile from BEGIN");
194 		tempfree(x);
195 	}
196 	if (a[1] || a[2])
197 		while (getrec(&record, &recsize, true) > 0) {
198 			x = execute(a[1]);
199 			if (isexit(x))
200 				break;
201 			tempfree(x);
202 		}
203   ex:
204 	if (setjmp(env) != 0)	/* handles exit within END */
205 		goto ex1;
206 	if (a[2]) {		/* END */
207 		x = execute(a[2]);
208 		if (isbreak(x) || isnext(x) || iscont(x))
209 			FATAL("illegal break, continue, next or nextfile from END");
210 		tempfree(x);
211 	}
212   ex1:
213 	return(True);
214 }
215 
216 struct Frame {	/* stack frame for awk function calls */
217 	int nargs;	/* number of arguments in this call */
218 	Cell *fcncell;	/* pointer to Cell for function */
219 	Cell **args;	/* pointer to array of arguments after execute */
220 	Cell *retval;	/* return value */
221 };
222 
223 #define	NARGS	50	/* max args in a call */
224 
225 struct Frame *frame = NULL;	/* base of stack frames; dynamically allocated */
226 int	nframe = 0;		/* number of frames allocated */
227 struct Frame *frp = NULL;	/* frame pointer. bottom level unused */
228 
call(Node ** a,int n)229 Cell *call(Node **a, int n)	/* function call.  very kludgy and fragile */
230 {
231 	static const Cell newcopycell = { OCELL, CCOPY, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };
232 	int i, ncall, ndef;
233 	int freed = 0; /* handles potential double freeing when fcn & param share a tempcell */
234 	Node *x;
235 	Cell *args[NARGS], *oargs[NARGS];	/* BUG: fixed size arrays */
236 	Cell *y, *z, *fcn;
237 	char *s;
238 
239 	fcn = execute(a[0]);	/* the function itself */
240 	s = fcn->nval;
241 	if (!isfcn(fcn))
242 		FATAL("calling undefined function %s", s);
243 	if (frame == NULL) {
244 		frp = frame = (struct Frame *) calloc(nframe += 100, sizeof(*frame));
245 		if (frame == NULL)
246 			FATAL("out of space for stack frames calling %s", s);
247 	}
248 	for (ncall = 0, x = a[1]; x != NULL; x = x->nnext)	/* args in call */
249 		ncall++;
250 	ndef = (int) fcn->fval;			/* args in defn */
251 	DPRINTF("calling %s, %d args (%d in defn), frp=%d\n", s, ncall, ndef, (int) (frp-frame));
252 	if (ncall > ndef)
253 		WARNING("function %s called with %d args, uses only %d",
254 			s, ncall, ndef);
255 	if (ncall + ndef > NARGS)
256 		FATAL("function %s has %d arguments, limit %d", s, ncall+ndef, NARGS);
257 	for (i = 0, x = a[1]; x != NULL; i++, x = x->nnext) {	/* get call args */
258 		DPRINTF("evaluate args[%d], frp=%d:\n", i, (int) (frp-frame));
259 		y = execute(x);
260 		oargs[i] = y;
261 		DPRINTF("args[%d]: %s %f <%s>, t=%o\n",
262 			i, NN(y->nval), y->fval, isarr(y) ? "(array)" : NN(y->sval), y->tval);
263 		if (isfcn(y))
264 			FATAL("can't use function %s as argument in %s", y->nval, s);
265 		if (isarr(y))
266 			args[i] = y;	/* arrays by ref */
267 		else
268 			args[i] = copycell(y);
269 		tempfree(y);
270 	}
271 	for ( ; i < ndef; i++) {	/* add null args for ones not provided */
272 		args[i] = gettemp();
273 		*args[i] = newcopycell;
274 	}
275 	frp++;	/* now ok to up frame */
276 	if (frp >= frame + nframe) {
277 		int dfp = frp - frame;	/* old index */
278 		frame = (struct Frame *) realloc(frame, (nframe += 100) * sizeof(*frame));
279 		if (frame == NULL)
280 			FATAL("out of space for stack frames in %s", s);
281 		frp = frame + dfp;
282 	}
283 	frp->fcncell = fcn;
284 	frp->args = args;
285 	frp->nargs = ndef;	/* number defined with (excess are locals) */
286 	frp->retval = gettemp();
287 
288 	DPRINTF("start exec of %s, frp=%d\n", s, (int) (frp-frame));
289 	y = execute((Node *)(fcn->sval));	/* execute body */
290 	DPRINTF("finished exec of %s, frp=%d\n", s, (int) (frp-frame));
291 
292 	for (i = 0; i < ndef; i++) {
293 		Cell *t = frp->args[i];
294 		if (isarr(t)) {
295 			if (t->csub == CCOPY) {
296 				if (i >= ncall) {
297 					freesymtab(t);
298 					t->csub = CTEMP;
299 					tempfree(t);
300 				} else {
301 					oargs[i]->tval = t->tval;
302 					oargs[i]->tval &= ~(STR|NUM|DONTFREE);
303 					oargs[i]->sval = t->sval;
304 					tempfree(t);
305 				}
306 			}
307 		} else if (t != y) {	/* kludge to prevent freeing twice */
308 			t->csub = CTEMP;
309 			tempfree(t);
310 		} else if (t == y && t->csub == CCOPY) {
311 			t->csub = CTEMP;
312 			tempfree(t);
313 			freed = 1;
314 		}
315 	}
316 	tempfree(fcn);
317 	if (isexit(y) || isnext(y))
318 		return y;
319 	if (freed == 0) {
320 		tempfree(y);	/* don't free twice! */
321 	}
322 	z = frp->retval;			/* return value */
323 	DPRINTF("%s returns %g |%s| %o\n", s, getfval(z), getsval(z), z->tval);
324 	frp--;
325 	return(z);
326 }
327 
/*
 * copycell: make a copy of a cell in a temp.
 *
 * The copy drops the CON, FLD and REC flags (copy is not constant or
 * field) and is marked CCOPY, which prevents freeing until the call is
 * over.  A string value is duplicated with tostring() and becomes
 * freeable; otherwise the copy is marked DONTFREE.
 */
Cell *copycell(Cell *x)
{
	Cell *dup = gettemp();

	dup->tval = x->tval & ~(CON|FLD|REC);
	dup->csub = CCOPY;
	dup->nval = x->nval;	/* BUG? */
	dup->fval = x->fval;
	if (isstr(x) /* || x->ctype == OCELL */) {
		dup->sval = tostring(x->sval);
		dup->tval &= ~DONTFREE;
	} else {
		dup->tval |= DONTFREE;
	}
	return dup;
}
346 
arg(Node ** a,int n)347 Cell *arg(Node **a, int n)	/* nth argument of a function */
348 {
349 
350 	n = ptoi(a[0]);	/* argument number, counting from 0 */
351 	DPRINTF("arg(%d), frp->nargs=%d\n", n, frp->nargs);
352 	if (n+1 > frp->nargs)
353 		FATAL("argument #%d of function %s was not supplied",
354 			n+1, frp->fcncell->nval);
355 	return frp->args[n];
356 }
357 
jump(Node ** a,int n)358 Cell *jump(Node **a, int n)	/* break, continue, next, nextfile, return */
359 {
360 	Cell *y;
361 
362 	switch (n) {
363 	case EXIT:
364 		if (a[0] != NULL) {
365 			y = execute(a[0]);
366 			errorflag = (int) getfval(y);
367 			tempfree(y);
368 		}
369 		longjmp(env, 1);
370 	case RETURN:
371 		if (a[0] != NULL) {
372 			y = execute(a[0]);
373 			if ((y->tval & (STR|NUM)) == (STR|NUM)) {
374 				setsval(frp->retval, getsval(y));
375 				frp->retval->fval = getfval(y);
376 				frp->retval->tval |= NUM;
377 			}
378 			else if (y->tval & STR)
379 				setsval(frp->retval, getsval(y));
380 			else if (y->tval & NUM)
381 				setfval(frp->retval, getfval(y));
382 			else		/* can't happen */
383 				FATAL("bad type variable %d", y->tval);
384 			tempfree(y);
385 		}
386 		return(jret);
387 	case NEXT:
388 		return(jnext);
389 	case NEXTFILE:
390 		nextfile();
391 		return(jnextfile);
392 	case BREAK:
393 		return(jbreak);
394 	case CONTINUE:
395 		return(jcont);
396 	default:	/* can't happen */
397 		FATAL("illegal jump type %d", n);
398 	}
399 	return 0;	/* not reached */
400 }
401 
awkgetline(Node ** a,int n)402 Cell *awkgetline(Node **a, int n)	/* get next line from specific input */
403 {		/* a[0] is variable, a[1] is operator, a[2] is filename */
404 	Cell *r, *x;
405 	extern Cell **fldtab;
406 	FILE *fp;
407 	char *buf;
408 	int bufsize = recsize;
409 	int mode;
410 	bool newflag;
411 	double result;
412 
413 	if ((buf = (char *) malloc(bufsize)) == NULL)
414 		FATAL("out of memory in getline");
415 
416 	fflush(stdout);	/* in case someone is waiting for a prompt */
417 	r = gettemp();
418 	if (a[1] != NULL) {		/* getline < file */
419 		x = execute(a[2]);		/* filename */
420 		mode = ptoi(a[1]);
421 		if (mode == '|')		/* input pipe */
422 			mode = LE;	/* arbitrary flag */
423 		fp = openfile(mode, getsval(x), &newflag);
424 		tempfree(x);
425 		if (fp == NULL)
426 			n = -1;
427 		else
428 			n = readrec(&buf, &bufsize, fp, newflag);
429 		if (n <= 0) {
430 			;
431 		} else if (a[0] != NULL) {	/* getline var <file */
432 			x = execute(a[0]);
433 			setsval(x, buf);
434 			if (is_number(x->sval, & result)) {
435 				x->fval = result;
436 				x->tval |= NUM;
437 			}
438 			tempfree(x);
439 		} else {			/* getline <file */
440 			setsval(fldtab[0], buf);
441 			if (is_number(fldtab[0]->sval, & result)) {
442 				fldtab[0]->fval = result;
443 				fldtab[0]->tval |= NUM;
444 			}
445 		}
446 	} else {			/* bare getline; use current input */
447 		if (a[0] == NULL)	/* getline */
448 			n = getrec(&record, &recsize, true);
449 		else {			/* getline var */
450 			n = getrec(&buf, &bufsize, false);
451 			if (n > 0) {
452 				x = execute(a[0]);
453 				setsval(x, buf);
454 				if (is_number(x->sval, & result)) {
455 					x->fval = result;
456 					x->tval |= NUM;
457 				}
458 				tempfree(x);
459 			}
460 		}
461 	}
462 	setfval(r, (Awkfloat) n);
463 	free(buf);
464 	return r;
465 }
466 
getnf(Node ** a,int n)467 Cell *getnf(Node **a, int n)	/* get NF */
468 {
469 	if (!donefld)
470 		fldbld();
471 	return (Cell *) a[0];
472 }
473 
474 static char *
makearraystring(Node * p,const char * func)475 makearraystring(Node *p, const char *func)
476 {
477 	char *buf;
478 	int bufsz = recsize;
479 	size_t blen;
480 
481 	if ((buf = (char *) malloc(bufsz)) == NULL) {
482 		FATAL("%s: out of memory", func);
483 	}
484 
485 	blen = 0;
486 	buf[blen] = '\0';
487 
488 	for (; p; p = p->nnext) {
489 		Cell *x = execute(p);	/* expr */
490 		char *s = getsval(x);
491 		size_t seplen = strlen(getsval(subseploc));
492 		size_t nsub = p->nnext ? seplen : 0;
493 		size_t slen = strlen(s);
494 		size_t tlen = blen + slen + nsub;
495 
496 		if (!adjbuf(&buf, &bufsz, tlen + 1, recsize, 0, func)) {
497 			FATAL("%s: out of memory %s[%s...]",
498 			    func, x->nval, buf);
499 		}
500 		memcpy(buf + blen, s, slen);
501 		if (nsub) {
502 			memcpy(buf + blen + slen, *SUBSEP, nsub);
503 		}
504 		buf[tlen] = '\0';
505 		blen = tlen;
506 		tempfree(x);
507 	}
508 	return buf;
509 }
510 
array(Node ** a,int n)511 Cell *array(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
512 {
513 	Cell *x, *z;
514 	char *buf;
515 
516 	x = execute(a[0]);	/* Cell* for symbol table */
517 	buf = makearraystring(a[1], __func__);
518 	if (!isarr(x)) {
519 		DPRINTF("making %s into an array\n", NN(x->nval));
520 		if (freeable(x))
521 			xfree(x->sval);
522 		x->tval &= ~(STR|NUM|DONTFREE);
523 		x->tval |= ARR;
524 		x->sval = (char *) makesymtab(NSYMTAB);
525 	}
526 	z = setsymtab(buf, "", 0.0, STR|NUM, (Array *) x->sval);
527 	z->ctype = OCELL;
528 	z->csub = CVAR;
529 	tempfree(x);
530 	free(buf);
531 	return(z);
532 }
533 
awkdelete(Node ** a,int n)534 Cell *awkdelete(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
535 {
536 	Cell *x;
537 
538 	x = execute(a[0]);	/* Cell* for symbol table */
539 	if (x == symtabloc) {
540 		FATAL("cannot delete SYMTAB or its elements");
541 	}
542 	if (!isarr(x))
543 		return True;
544 	if (a[1] == NULL) {	/* delete the elements, not the table */
545 		freesymtab(x);
546 		x->tval &= ~STR;
547 		x->tval |= ARR;
548 		x->sval = (char *) makesymtab(NSYMTAB);
549 	} else {
550 		char *buf = makearraystring(a[1], __func__);
551 		freeelem(x, buf);
552 		free(buf);
553 	}
554 	tempfree(x);
555 	return True;
556 }
557 
intest(Node ** a,int n)558 Cell *intest(Node **a, int n)	/* a[0] is index (list), a[1] is symtab */
559 {
560 	Cell *ap, *k;
561 	char *buf;
562 
563 	ap = execute(a[1]);	/* array name */
564 	if (!isarr(ap)) {
565 		DPRINTF("making %s into an array\n", ap->nval);
566 		if (freeable(ap))
567 			xfree(ap->sval);
568 		ap->tval &= ~(STR|NUM|DONTFREE);
569 		ap->tval |= ARR;
570 		ap->sval = (char *) makesymtab(NSYMTAB);
571 	}
572 	buf = makearraystring(a[0], __func__);
573 	k = lookup(buf, (Array *) ap->sval);
574 	tempfree(ap);
575 	free(buf);
576 	if (k == NULL)
577 		return(False);
578 	else
579 		return(True);
580 }
581 
582 
583 /* ======== utf-8 code ========== */
584 
585 /*
586  * Awk strings can contain ascii, random 8-bit items (eg Latin-1),
587  * or utf-8.  u8_isutf tests whether a string starts with a valid
588  * utf-8 sequence, and returns 0 if not (e.g., high bit set).
589  * u8_nextlen returns length of next valid sequence, which is
590  * 1 for ascii, 2..4 for utf-8, or 1 for high bit non-utf.
591  * u8_strlen returns length of string in valid utf-8 sequences
592  * and/or high-bit bytes.  Conversion functions go between byte
593  * number and character number.
594  *
595  * In theory, this behaves the same as before for non-utf8 bytes.
596  *
597  * Limited checking! This is a potential security hole.
598  */
599 
600 /* is s the beginning of a valid utf-8 string? */
601 /* return length 1..4 if yes, 0 if no */
u8_isutf(const char * s)602 int u8_isutf(const char *s)
603 {
604 	int n, ret;
605 	unsigned char c;
606 
607 	c = s[0];
608 	if (c < 128 || awk_mb_cur_max == 1)
609 		return 1; /* what if it's 0? */
610 
611 	n = strlen(s);
612 	if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
613 		ret = 2; /* 110xxxxx 10xxxxxx */
614 	} else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
615 			 && (s[2] & 0xC0) == 0x80) {
616 		ret = 3; /* 1110xxxx 10xxxxxx 10xxxxxx */
617 	} else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
618 			 && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
619 		ret = 4; /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
620 	} else {
621 		ret = 0;
622 	}
623 	return ret;
624 }
625 
626 /* Convert (prefix of) utf8 string to utf-32 rune. */
627 /* Sets *rune to the value, returns the length. */
628 /* No error checking: watch out. */
u8_rune(int * rune,const char * s)629 int u8_rune(int *rune, const char *s)
630 {
631 	int n, ret;
632 	unsigned char c;
633 
634 	c = s[0];
635 	if (c < 128 || awk_mb_cur_max == 1) {
636 		*rune = c;
637 		return 1;
638 	}
639 
640 	n = strlen(s);
641 	if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
642 		*rune = ((c & 0x1F) << 6) | (s[1] & 0x3F); /* 110xxxxx 10xxxxxx */
643 		ret = 2;
644 	} else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
645 			  && (s[2] & 0xC0) == 0x80) {
646 		*rune = ((c & 0xF) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
647 			/* 1110xxxx 10xxxxxx 10xxxxxx */
648 		ret = 3;
649 	} else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
650 			  && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
651 		*rune = ((c & 0x7) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
652 			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
653 		ret = 4;
654 	} else {
655 		*rune = c;
656 		ret = 1;
657 	}
658 	return ret; /* returns one byte if sequence doesn't look like utf */
659 }
660 
/* return length of next sequence: 1 for ascii or random, 2..4 for valid utf8 */
/* (a byte that u8_isutf rejects is counted as a sequence of length 1) */
int u8_nextlen(const char *s)
{
	int len = u8_isutf(s);

	return len == 0 ? 1 : len;
}
671 
672 /* return number of utf characters or single non-utf bytes */
u8_strlen(const char * s)673 int u8_strlen(const char *s)
674 {
675 	int i, len, n, totlen;
676 	unsigned char c;
677 
678 	n = strlen(s);
679 	totlen = 0;
680 	for (i = 0; i < n; i += len) {
681 		c = s[i];
682 		if (c < 128 || awk_mb_cur_max == 1) {
683 			len = 1;
684 		} else {
685 			len = u8_nextlen(&s[i]);
686 		}
687 		totlen++;
688 		if (i > n)
689 			FATAL("bad utf count [%s] n=%d i=%d\n", s, n, i);
690 	}
691 	return totlen;
692 }
693 
/* convert utf-8 char number in a string to its byte offset */
/*
 * Walks charnum sequences forward from the start of s and returns the
 * number of bytes covered.  The walk stops at the terminating NUL, so a
 * charnum larger than the number of characters in s yields strlen(s)
 * instead of reading beyond the end of the string.  charnum <= 0 gives 0.
 */
int u8_char2byte(const char *s, int charnum)
{
	int n;
	int bytenum = 0;

	while (charnum > 0 && *s != '\0') {
		n = u8_nextlen(s);
		s += n;
		bytenum += n;
		charnum--;
	}
	return bytenum;
}
708 
/* convert byte offset in s to utf-8 char number that starts there */
/*
 * The result is 1-origin: byte offset 0 gives 1.  An offset beyond the
 * end of s gives -1; a negative offset gives 0.
 */
int u8_byte2char(const char *s, int bytenum)
{
	int pos;
	int charnum = 0; /* BUG: what origin? */
	/* should be 0 to match start==0 which means no match */
	int slen = strlen(s);

	if (bytenum > slen) {
		return -1; /* ??? */
	}
	pos = 0;
	while (pos <= bytenum) {
		pos += u8_nextlen(s+pos);
		charnum++;
	}
	return charnum;
}
726 
/* runetochar() adapted from rune.c in the Plan 9 distribution */

enum
{
	/*
	 * U+FFFD REPLACEMENT CHARACTER.  It encodes to three bytes, which
	 * is why runetochar() substitutes it just before the three-byte
	 * case.  (The former value 128 went through that same three-byte
	 * path and produced the overlong, invalid sequence E0 82 80.)
	 */
	Runeerror = 0xFFFD,
	Runemax = 0x10FFFF,

	Bit1    = 7,
	Bitx    = 6,
	Bit2    = 5,
	Bit3    = 4,
	Bit4    = 3,
	Bit5    = 2,

	T1      = ((1<<(Bit1+1))-1) ^ 0xFF,     /* 0000 0000 */
	Tx      = ((1<<(Bitx+1))-1) ^ 0xFF,     /* 1000 0000 */
	T2      = ((1<<(Bit2+1))-1) ^ 0xFF,     /* 1100 0000 */
	T3      = ((1<<(Bit3+1))-1) ^ 0xFF,     /* 1110 0000 */
	T4      = ((1<<(Bit4+1))-1) ^ 0xFF,     /* 1111 0000 */
	T5      = ((1<<(Bit5+1))-1) ^ 0xFF,     /* 1111 1000 */

	Rune1   = (1<<(Bit1+0*Bitx))-1,	 	/* 0000 0000 0000 0000 0111 1111 */
	Rune2   = (1<<(Bit2+1*Bitx))-1,	 	/* 0000 0000 0000 0111 1111 1111 */
	Rune3   = (1<<(Bit3+2*Bitx))-1,	 	/* 0000 0000 1111 1111 1111 1111 */
	Rune4   = (1<<(Bit4+3*Bitx))-1,	 	/* 0011 1111 1111 1111 1111 1111 */

	Maskx   = (1<<Bitx)-1,		  	/* 0011 1111 */
	Testx   = Maskx ^ 0xFF,		 	/* 1100 0000 */

};

/*
 * runetochar: store the utf-8 encoding of code point c at str.
 *
 * str must have room for up to 4 bytes; no terminator is written.
 * Returns the number of bytes stored (1..4).  A value above Runemax is
 * replaced by Runeerror.
 */
int runetochar(char *str, int c)
{
	/* one character sequence 00000-0007F => 00-7F */
	if (c <= Rune1) {
		str[0] = c;
		return 1;
	}

	/* two character sequence 00080-007FF => T2 Tx */
	if (c <= Rune2) {
		str[0] = T2 | (c >> 1*Bitx);
		str[1] = Tx | (c & Maskx);
		return 2;
	}

	/*
	 * If the rune is out of range, convert it to the error rune.
	 * Do this test here because the error rune encodes to three bytes.
	 */
	if (c > Runemax)
		c = Runeerror;

	/* three character sequence 00800-0FFFF => T3 Tx Tx */
	if (c <= Rune3) {
		str[0] = T3 |  (c >> 2*Bitx);
		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
		str[2] = Tx |  (c & Maskx);
		return 3;
	}

	/* four character sequence 010000-1FFFFF => T4 Tx Tx Tx */
	str[0] = T4 |  (c >> 3*Bitx);
	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
	str[3] = Tx |  (c & Maskx);
	return 4;
}
790 
791 
792 /* ========== end of utf8 code =========== */
793 
794 
795 
matchop(Node ** a,int n)796 Cell *matchop(Node **a, int n)	/* ~ and match() */
797 {
798 	Cell *x, *y, *z;
799 	char *s, *t;
800 	int i;
801 	int cstart, cpatlen, len;
802 	fa *pfa;
803 	int (*mf)(fa *, const char *) = match, mode = 0;
804 
805 	if (n == MATCHFCN) {
806 		mf = pmatch;
807 		mode = 1;
808 	}
809 	x = execute(a[1]);	/* a[1] = target text */
810 	s = getsval(x);
811 	if (a[0] == NULL)	/* a[1] == 0: already-compiled reg expr */
812 		i = (*mf)((fa *) a[2], s);
813 	else {
814 		y = execute(a[2]);	/* a[2] = regular expr */
815 		t = getsval(y);
816 		pfa = makedfa(t, mode);
817 		i = (*mf)(pfa, s);
818 		tempfree(y);
819 	}
820 	z = x;
821 	if (n == MATCHFCN) {
822 		int start = patbeg - s + 1; /* origin 1 */
823 		if (patlen < 0) {
824 			start = 0; /* not found */
825 		} else {
826 			cstart = u8_byte2char(s, start-1);
827 			cpatlen = 0;
828 			for (i = 0; i < patlen; i += len) {
829 				len = u8_nextlen(patbeg+i);
830 				cpatlen++;
831 			}
832 
833 			start = cstart;
834 			patlen = cpatlen;
835 		}
836 
837 		setfval(rstartloc, (Awkfloat) start);
838 		setfval(rlengthloc, (Awkfloat) patlen);
839 		x = gettemp();
840 		x->tval = NUM;
841 		x->fval = start;
842 	} else if ((n == MATCH && i == 1) || (n == NOTMATCH && i == 0))
843 		x = True;
844 	else
845 		x = False;
846 
847 	tempfree(z);
848 	return x;
849 }
850 
851 
boolop(Node ** a,int n)852 Cell *boolop(Node **a, int n)	/* a[0] || a[1], a[0] && a[1], !a[0] */
853 {
854 	Cell *x, *y;
855 	int i;
856 
857 	x = execute(a[0]);
858 	i = istrue(x);
859 	tempfree(x);
860 	switch (n) {
861 	case BOR:
862 		if (i) return(True);
863 		y = execute(a[1]);
864 		i = istrue(y);
865 		tempfree(y);
866 		if (i) return(True);
867 		else return(False);
868 	case AND:
869 		if ( !i ) return(False);
870 		y = execute(a[1]);
871 		i = istrue(y);
872 		tempfree(y);
873 		if (i) return(True);
874 		else return(False);
875 	case NOT:
876 		if (i) return(False);
877 		else return(True);
878 	default:	/* can't happen */
879 		FATAL("unknown boolean operator %d", n);
880 	}
881 	return 0;	/*NOTREACHED*/
882 }
883 
relop(Node ** a,int n)884 Cell *relop(Node **a, int n)	/* a[0 < a[1], etc. */
885 {
886 	int i;
887 	Cell *x, *y;
888 	Awkfloat j;
889 	bool x_is_nan, y_is_nan;
890 
891 	x = execute(a[0]);
892 	y = execute(a[1]);
893 	x_is_nan = isnan(x->fval);
894 	y_is_nan = isnan(y->fval);
895 	if (x->tval&NUM && y->tval&NUM) {
896 		if ((x_is_nan || y_is_nan) && n != NE)
897 			return(False);
898 		j = x->fval - y->fval;
899 		i = j<0? -1: (j>0? 1: 0);
900 	} else {
901 		i = strcmp(getsval(x), getsval(y));
902 	}
903 	tempfree(x);
904 	tempfree(y);
905 	switch (n) {
906 	case LT:	if (i<0) return(True);
907 			else return(False);
908 	case LE:	if (i<=0) return(True);
909 			else return(False);
910 	case NE:	if (x_is_nan && y_is_nan) return(True);
911 			else if (i!=0) return(True);
912 			else return(False);
913 	case EQ:	if (i == 0) return(True);
914 			else return(False);
915 	case GE:	if (i>=0) return(True);
916 			else return(False);
917 	case GT:	if (i>0) return(True);
918 			else return(False);
919 	default:	/* can't happen */
920 		FATAL("unknown relational operator %d", n);
921 	}
922 	return 0;	/*NOTREACHED*/
923 }
924 
/*
 * tfree: free a tempcell.
 * Releases the cell's string if it is freeable, then pushes the cell onto
 * the front of the tmps free list.  Pushing the cell that is already at
 * the head of the list is a fatal error.
 */
void tfree(Cell *a)
{
	if (freeable(a)) {
		DPRINTF("freeing %s %s %o\n", NN(a->nval), NN(a->sval), a->tval);
		xfree(a->sval);
	}
	if (tmps == a) {
		FATAL("tempcell list is curdled");
	}
	a->cnext = tmps;
	tmps = a;
}
936 
gettemp(void)937 Cell *gettemp(void)	/* get a tempcell */
938 {	int i;
939 	Cell *x;
940 
941 	if (!tmps) {
942 		tmps = (Cell *) calloc(100, sizeof(*tmps));
943 		if (!tmps)
944 			FATAL("out of space for temporaries");
945 		for (i = 1; i < 100; i++)
946 			tmps[i-1].cnext = &tmps[i];
947 		tmps[i-1].cnext = NULL;
948 	}
949 	x = tmps;
950 	tmps = x->cnext;
951 	*x = tempcell;
952 	return(x);
953 }
954 
indirect(Node ** a,int n)955 Cell *indirect(Node **a, int n)	/* $( a[0] ) */
956 {
957 	Awkfloat val;
958 	Cell *x;
959 	int m;
960 	char *s;
961 
962 	x = execute(a[0]);
963 	val = getfval(x);	/* freebsd: defend against super large field numbers */
964 	if ((Awkfloat)INT_MAX < val)
965 		FATAL("trying to access out of range field %s", x->nval);
966 	m = (int) val;
967 	if (m == 0 && !is_number(s = getsval(x), NULL))	/* suspicion! */
968 		FATAL("illegal field $(%s), name \"%s\"", s, x->nval);
969 		/* BUG: can x->nval ever be null??? */
970 	tempfree(x);
971 	x = fieldadr(m);
972 	x->ctype = OCELL;	/* BUG?  why are these needed? */
973 	x->csub = CFLD;
974 	return(x);
975 }
976 
substr(Node ** a,int nnn)977 Cell *substr(Node **a, int nnn)		/* substr(a[0], a[1], a[2]) */
978 {
979 	int k, m, n;
980 	int mb, nb;
981 	char *s;
982 	int temp;
983 	Cell *x, *y, *z = NULL;
984 
985 	x = execute(a[0]);
986 	y = execute(a[1]);
987 	if (a[2] != NULL)
988 		z = execute(a[2]);
989 	s = getsval(x);
990 	k = u8_strlen(s) + 1;
991 	if (k <= 1) {
992 		tempfree(x);
993 		tempfree(y);
994 		if (a[2] != NULL) {
995 			tempfree(z);
996 		}
997 		x = gettemp();
998 		setsval(x, "");
999 		return(x);
1000 	}
1001 	m = (int) getfval(y);
1002 	if (m <= 0)
1003 		m = 1;
1004 	else if (m > k)
1005 		m = k;
1006 	tempfree(y);
1007 	if (a[2] != NULL) {
1008 		n = (int) getfval(z);
1009 		tempfree(z);
1010 	} else
1011 		n = k - 1;
1012 	if (n < 0)
1013 		n = 0;
1014 	else if (n > k - m)
1015 		n = k - m;
1016 	/* m is start, n is length from there */
1017 	DPRINTF("substr: m=%d, n=%d, s=%s\n", m, n, s);
1018 	y = gettemp();
1019 	mb = u8_char2byte(s, m-1); /* byte offset of start char in s */
1020 	nb = u8_char2byte(s, m-1+n);  /* byte offset of end+1 char in s */
1021 
1022 	temp = s[nb];	/* with thanks to John Linderman */
1023 	s[nb] = '\0';
1024 	setsval(y, s + mb);
1025 	s[nb] = temp;
1026 	tempfree(x);
1027 	return(y);
1028 }
1029 
sindex(Node ** a,int nnn)1030 Cell *sindex(Node **a, int nnn)		/* index(a[0], a[1]) */
1031 {
1032 	Cell *x, *y, *z;
1033 	char *s1, *s2, *p1, *p2, *q;
1034 	Awkfloat v = 0.0;
1035 
1036 	x = execute(a[0]);
1037 	s1 = getsval(x);
1038 	y = execute(a[1]);
1039 	s2 = getsval(y);
1040 
1041 	z = gettemp();
1042 	for (p1 = s1; *p1 != '\0'; p1++) {
1043 		for (q = p1, p2 = s2; *p2 != '\0' && *q == *p2; q++, p2++)
1044 			continue;
1045 		if (*p2 == '\0') {
1046 			/* v = (Awkfloat) (p1 - s1 + 1);	 origin 1 */
1047 
1048 		   /* should be a function: used in match() as well */
1049 			int i, len;
1050 			v = 0;
1051 			for (i = 0; i < p1-s1+1; i += len) {
1052 				len = u8_nextlen(s1+i);
1053 				v++;
1054 			}
1055 			break;
1056 		}
1057 	}
1058 	tempfree(x);
1059 	tempfree(y);
1060 	setfval(z, v);
1061 	return(z);
1062 }
1063 
/* return 1 if s contains any utf-8 (2 bytes or more) character */
int has_utf8(char *s)
{
	int len;

	while (*s != 0) {
		len = u8_nextlen(s);
		if (len > 1)
			return 1;
		s += len;
	}
	return 0;
}
1075 
1076 #define	MAXNUMSIZE	50
1077 
format(char ** pbuf,int * pbufsize,const char * s,Node * a)1078 int format(char **pbuf, int *pbufsize, const char *s, Node *a)	/* printf-like conversions */
1079 {
1080 	char *fmt;
1081 	char *p, *t;
1082 	const char *os;
1083 	Cell *x;
1084 	int flag = 0, n;
1085 	int fmtwd; /* format width */
1086 	int fmtsz = recsize;
1087 	char *buf = *pbuf;
1088 	int bufsize = *pbufsize;
1089 #define FMTSZ(a)   (fmtsz - ((a) - fmt))
1090 #define BUFSZ(a)   (bufsize - ((a) - buf))
1091 
1092 	static bool first = true;
1093 	static bool have_a_format = false;
1094 
1095 	if (first) {
1096 		char xbuf[100];
1097 
1098 		snprintf(xbuf, sizeof(xbuf), "%a", 42.0);
1099 		have_a_format = (strcmp(xbuf, "0x1.5p+5") == 0);
1100 		first = false;
1101 	}
1102 
1103 	os = s;
1104 	p = buf;
1105 	if ((fmt = (char *) malloc(fmtsz)) == NULL)
1106 		FATAL("out of memory in format()");
1107 	while (*s) {
1108 		adjbuf(&buf, &bufsize, MAXNUMSIZE+1+p-buf, recsize, &p, "format1");
1109 		if (*s != '%') {
1110 			*p++ = *s++;
1111 			continue;
1112 		}
1113 		if (*(s+1) == '%') {
1114 			*p++ = '%';
1115 			s += 2;
1116 			continue;
1117 		}
1118 		fmtwd = atoi(s+1);
1119 		if (fmtwd < 0)
1120 			fmtwd = -fmtwd;
1121 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format2");
1122 		for (t = fmt; (*t++ = *s) != '\0'; s++) {
1123 			if (!adjbuf(&fmt, &fmtsz, MAXNUMSIZE+1+t-fmt, recsize, &t, "format3"))
1124 				FATAL("format item %.30s... ran format() out of memory", os);
1125 			/* Ignore size specifiers */
1126 			if (strchr("hjLlqtz", *s) != NULL) {	/* the ansi panoply */
1127 				t--;
1128 				continue;
1129 			}
1130 			if (isalpha((uschar)*s))
1131 				break;
1132 			if (*s == '$') {
1133 				FATAL("'$' not permitted in awk formats");
1134 			}
1135 			if (*s == '*') {
1136 				if (a == NULL) {
1137 					FATAL("not enough args in printf(%s)", os);
1138 				}
1139 				x = execute(a);
1140 				a = a->nnext;
1141 				snprintf(t - 1, FMTSZ(t - 1),
1142 				    "%d", fmtwd=(int) getfval(x));
1143 				if (fmtwd < 0)
1144 					fmtwd = -fmtwd;
1145 				adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format");
1146 				t = fmt + strlen(fmt);
1147 				tempfree(x);
1148 			}
1149 		}
1150 		*t = '\0';
1151 		if (fmtwd < 0)
1152 			fmtwd = -fmtwd;
1153 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format4");
1154 		switch (*s) {
1155 		case 'a': case 'A':
1156 			if (have_a_format)
1157 				flag = *s;
1158 			else
1159 				flag = 'f';
1160 			break;
1161 		case 'f': case 'e': case 'g': case 'E': case 'G':
1162 			flag = 'f';
1163 			break;
1164 		case 'd': case 'i': case 'o': case 'x': case 'X': case 'u':
1165 			flag = (*s == 'd' || *s == 'i') ? 'd' : 'u';
1166 			*(t-1) = 'j';
1167 			*t = *s;
1168 			*++t = '\0';
1169 			break;
1170 		case 's':
1171 			flag = 's';
1172 			break;
1173 		case 'c':
1174 			flag = 'c';
1175 			break;
1176 		default:
1177 			WARNING("weird printf conversion %s", fmt);
1178 			flag = '?';
1179 			break;
1180 		}
1181 		if (a == NULL)
1182 			FATAL("not enough args in printf(%s)", os);
1183 		x = execute(a);
1184 		a = a->nnext;
1185 		n = MAXNUMSIZE;
1186 		if (fmtwd > n)
1187 			n = fmtwd;
1188 		adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format5");
1189 		switch (flag) {
1190 		case '?':
1191 			snprintf(p, BUFSZ(p), "%s", fmt);	/* unknown, so dump it too */
1192 			t = getsval(x);
1193 			n = strlen(t);
1194 			if (fmtwd > n)
1195 				n = fmtwd;
1196 			adjbuf(&buf, &bufsize, 1+strlen(p)+n+p-buf, recsize, &p, "format6");
1197 			p += strlen(p);
1198 			snprintf(p, BUFSZ(p), "%s", t);
1199 			break;
1200 		case 'a':
1201 		case 'A':
1202 		case 'f':	snprintf(p, BUFSZ(p), fmt, getfval(x)); break;
1203 		case 'd':	snprintf(p, BUFSZ(p), fmt, (intmax_t) getfval(x)); break;
1204 		case 'u':	snprintf(p, BUFSZ(p), fmt, (uintmax_t) getfval(x)); break;
1205 
1206 		case 's': {
1207 			t = getsval(x);
1208 			n = strlen(t);
1209 			/* if simple format or no utf-8 in the string, sprintf works */
1210 			if (!has_utf8(t) || strcmp(fmt,"%s") == 0) {
1211 				if (fmtwd > n)
1212 					n = fmtwd;
1213 				if (!adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format7"))
1214 					FATAL("huge string/format (%d chars) in printf %.30s..." \
1215 						" ran format() out of memory", n, t);
1216 				snprintf(p, BUFSZ(p), fmt, t);
1217 				break;
1218 			}
1219 
1220 			/* get here if string has utf-8 chars and fmt is not plain %s */
1221 			/* "%-w.ps", where -, w and .p are all optional */
1222 			/* '0' before the w is a flag character */
1223 			/* fmt points at % */
1224 			int ljust = 0, wid = 0, prec = n, pad = 0;
1225 			char *f = fmt+1;
1226 			if (f[0] == '-') {
1227 				ljust = 1;
1228 				f++;
1229 			}
1230 			// flags '0' and '+' are recognized but skipped
1231 			if (f[0] == '0') {
1232 				f++;
1233 				if (f[0] == '+')
1234 					f++;
1235 			}
1236 			if (f[0] == '+') {
1237 				f++;
1238 				if (f[0] == '0')
1239 					f++;
1240 			}
1241 			if (isdigit(f[0])) { /* there is a wid */
1242 				wid = strtol(f, &f, 10);
1243 			}
1244 			if (f[0] == '.') { /* there is a .prec */
1245 				prec = strtol(++f, &f, 10);
1246 			}
1247 			if (prec > u8_strlen(t))
1248 				prec = u8_strlen(t);
1249 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1250 			int i, k, n;
1251 
1252 			if (ljust) { // print prec chars from t, then pad blanks
1253 				n = u8_char2byte(t, prec);
1254 				for (k = 0; k < n; k++) {
1255 					//putchar(t[k]);
1256 					*p++ = t[k];
1257 				}
1258 				for (i = 0; i < pad; i++) {
1259 					//printf(" ");
1260 					*p++ = ' ';
1261 				}
1262 			} else { // print pad blanks, then prec chars from t
1263 				for (i = 0; i < pad; i++) {
1264 					//printf(" ");
1265 					*p++ = ' ';
1266 				}
1267 				n = u8_char2byte(t, prec);
1268 				for (k = 0; k < n; k++) {
1269 					//putchar(t[k]);
1270 					*p++ = t[k];
1271 				}
1272 			}
1273 			*p = 0;
1274 			break;
1275 		}
1276 
1277                case 'c': {
1278 			/*
1279 			 * If a numeric value is given, awk should just turn
1280 			 * it into a character and print it:
1281 			 *      BEGIN { printf("%c\n", 65) }
1282 			 * prints "A".
1283 			 *
1284 			 * But what if the numeric value is > 128 and
1285 			 * represents a valid Unicode code point?!? We do
1286 			 * our best to convert it back into UTF-8. If we
1287 			 * can't, we output the encoding of the Unicode
1288 			 * "invalid character", 0xFFFD.
1289 			 */
1290 			if (isnum(x)) {
1291 				int charval = (int) getfval(x);
1292 
1293 				if (charval != 0) {
1294 					if (charval < 128 || awk_mb_cur_max == 1)
1295 						snprintf(p, BUFSZ(p), fmt, charval);
1296 					else {
1297 						// possible unicode character
1298 						size_t count;
1299 						char *bs = wide_char_to_byte_str(charval, &count);
1300 
1301 						if (bs == NULL)	{ // invalid character
1302 							// use unicode invalid character, 0xFFFD
1303 							static char invalid_char[] = "\357\277\275";
1304 							bs = invalid_char;
1305 							count = 3;
1306 						}
1307 						t = bs;
1308 						n = count;
1309 						goto format_percent_c;
1310 					}
1311 				} else {
1312 					*p++ = '\0'; /* explicit null byte */
1313 					*p = '\0';   /* next output will start here */
1314 				}
1315 				break;
1316 			}
1317 			t = getsval(x);
1318 			n = u8_nextlen(t);
1319 		format_percent_c:
1320 			if (n < 2) { /* not utf8 */
1321 				snprintf(p, BUFSZ(p), fmt, getsval(x)[0]);
1322 				break;
1323 			}
1324 
1325 			// utf8 character, almost same song and dance as for %s
1326 			int ljust = 0, wid = 0, prec = n, pad = 0;
1327 			char *f = fmt+1;
1328 			if (f[0] == '-') {
1329 				ljust = 1;
1330 				f++;
1331 			}
1332 			// flags '0' and '+' are recognized but skipped
1333 			if (f[0] == '0') {
1334 				f++;
1335 				if (f[0] == '+')
1336 					f++;
1337 			}
1338 			if (f[0] == '+') {
1339 				f++;
1340 				if (f[0] == '0')
1341 					f++;
1342 			}
1343 			if (isdigit(f[0])) { /* there is a wid */
1344 				wid = strtol(f, &f, 10);
1345 			}
1346 			if (f[0] == '.') { /* there is a .prec */
1347 				prec = strtol(++f, &f, 10);
1348 			}
1349 			if (prec > 1)           // %c --> only one character
1350 				prec = 1;
1351 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1352 			int i;
1353 
1354 			if (ljust) { // print one char from t, then pad blanks
1355 				for (i = 0; i < n; i++)
1356 					*p++ = t[i];
1357 				for (i = 0; i < pad; i++) {
1358 					//printf(" ");
1359 					*p++ = ' ';
1360 				}
1361 			} else { // print pad blanks, then prec chars from t
1362 				for (i = 0; i < pad; i++) {
1363 					//printf(" ");
1364 					*p++ = ' ';
1365 				}
1366 				for (i = 0; i < n; i++)
1367 					*p++ = t[i];
1368 			}
1369 			*p = 0;
1370 			break;
1371 		}
1372 		default:
1373 			FATAL("can't happen: bad conversion %c in format()", flag);
1374 		}
1375 
1376 		tempfree(x);
1377 		p += strlen(p);
1378 		s++;
1379 	}
1380 	*p = '\0';
1381 	free(fmt);
1382 	for ( ; a; a = a->nnext) {		/* evaluate any remaining args */
1383 		x = execute(a);
1384 		tempfree(x);
1385 	}
1386 	*pbuf = buf;
1387 	*pbufsize = bufsize;
1388 	return p - buf;
1389 }
1390 
awksprintf(Node ** a,int n)1391 Cell *awksprintf(Node **a, int n)		/* sprintf(a[0]) */
1392 {
1393 	Cell *x;
1394 	Node *y;
1395 	char *buf;
1396 	int bufsz=3*recsize;
1397 
1398 	if ((buf = (char *) malloc(bufsz)) == NULL)
1399 		FATAL("out of memory in awksprintf");
1400 	y = a[0]->nnext;
1401 	x = execute(a[0]);
1402 	if (format(&buf, &bufsz, getsval(x), y) == -1)
1403 		FATAL("sprintf string %.30s... too long.  can't happen.", buf);
1404 	tempfree(x);
1405 	x = gettemp();
1406 	x->sval = buf;
1407 	x->tval = STR;
1408 	return(x);
1409 }
1410 
awkprintf(Node ** a,int n)1411 Cell *awkprintf(Node **a, int n)		/* printf */
1412 {	/* a[0] is list of args, starting with format string */
1413 	/* a[1] is redirection operator, a[2] is redirection file */
1414 	FILE *fp;
1415 	Cell *x;
1416 	Node *y;
1417 	char *buf;
1418 	int len;
1419 	int bufsz=3*recsize;
1420 
1421 	if ((buf = (char *) malloc(bufsz)) == NULL)
1422 		FATAL("out of memory in awkprintf");
1423 	y = a[0]->nnext;
1424 	x = execute(a[0]);
1425 	if ((len = format(&buf, &bufsz, getsval(x), y)) == -1)
1426 		FATAL("printf string %.30s... too long.  can't happen.", buf);
1427 	tempfree(x);
1428 	if (a[1] == NULL) {
1429 		/* fputs(buf, stdout); */
1430 		fwrite(buf, len, 1, stdout);
1431 		if (ferror(stdout))
1432 			FATAL("write error on stdout");
1433 	} else {
1434 		fp = redirect(ptoi(a[1]), a[2]);
1435 		/* fputs(buf, fp); */
1436 		fwrite(buf, len, 1, fp);
1437 		fflush(fp);
1438 		if (ferror(fp))
1439 			FATAL("write error on %s", filename(fp));
1440 	}
1441 	free(buf);
1442 	return(True);
1443 }
1444 
arith(Node ** a,int n)1445 Cell *arith(Node **a, int n)	/* a[0] + a[1], etc.  also -a[0] */
1446 {
1447 	Awkfloat i, j = 0;
1448 	double v;
1449 	Cell *x, *y, *z;
1450 
1451 	x = execute(a[0]);
1452 	i = getfval(x);
1453 	tempfree(x);
1454 	if (n != UMINUS && n != UPLUS) {
1455 		y = execute(a[1]);
1456 		j = getfval(y);
1457 		tempfree(y);
1458 	}
1459 	z = gettemp();
1460 	switch (n) {
1461 	case ADD:
1462 		i += j;
1463 		break;
1464 	case MINUS:
1465 		i -= j;
1466 		break;
1467 	case MULT:
1468 		i *= j;
1469 		break;
1470 	case DIVIDE:
1471 		if (j == 0)
1472 			FATAL("division by zero");
1473 		i /= j;
1474 		break;
1475 	case MOD:
1476 		if (j == 0)
1477 			FATAL("division by zero in mod");
1478 		modf(i/j, &v);
1479 		i = i - j * v;
1480 		break;
1481 	case UMINUS:
1482 		i = -i;
1483 		break;
1484 	case UPLUS: /* handled by getfval(), above */
1485 		break;
1486 	case POWER:
1487 		if (j >= 0 && modf(j, &v) == 0.0)	/* pos integer exponent */
1488 			i = ipow(i, (int) j);
1489                else {
1490 			errno = 0;
1491 			i = errcheck(pow(i, j), "pow");
1492                }
1493 		break;
1494 	default:	/* can't happen */
1495 		FATAL("illegal arithmetic operator %d", n);
1496 	}
1497 	setfval(z, i);
1498 	return(z);
1499 }
1500 
/*
 * ipow: x**n by repeated squaring.  ought to be done by pow, but isn't always.
 * Returns 1 for n <= 0.
 *
 * The bits of n are consumed from the most significant one downwards,
 * squaring the accumulator at each step and multiplying in x (as
 * x * acc * acc) whenever the bit is set.
 */
double ipow(double x, int n)
{
	unsigned int bits, top;
	double acc = 1.0;

	if (n <= 0)
		return 1;

	/* locate the highest set bit of n */
	bits = (unsigned int) n;
	top = 1;
	while (top <= (bits >> 1))
		top <<= 1;

	for (; top != 0; top >>= 1) {
		if (bits & top)
			acc = x * acc * acc;
		else
			acc = acc * acc;
	}
	return acc;
}
1513 
incrdecr(Node ** a,int n)1514 Cell *incrdecr(Node **a, int n)		/* a[0]++, etc. */
1515 {
1516 	Cell *x, *z;
1517 	int k;
1518 	Awkfloat xf;
1519 
1520 	x = execute(a[0]);
1521 	xf = getfval(x);
1522 	k = (n == PREINCR || n == POSTINCR) ? 1 : -1;
1523 	if (n == PREINCR || n == PREDECR) {
1524 		setfval(x, xf + k);
1525 		return(x);
1526 	}
1527 	z = gettemp();
1528 	setfval(z, xf);
1529 	setfval(x, xf + k);
1530 	tempfree(x);
1531 	return(z);
1532 }
1533 
assign(Node ** a,int n)1534 Cell *assign(Node **a, int n)	/* a[0] = a[1], a[0] += a[1], etc. */
1535 {		/* this is subtle; don't muck with it. */
1536 	Cell *x, *y;
1537 	Awkfloat xf, yf;
1538 	double v;
1539 
1540 	y = execute(a[1]);
1541 	x = execute(a[0]);
1542 	if (n == ASSIGN) {	/* ordinary assignment */
1543 		if (x == y && !(x->tval & (FLD|REC)) && x != nfloc)
1544 			;	/* self-assignment: leave alone unless it's a field or NF */
1545 		else if ((y->tval & (STR|NUM)) == (STR|NUM)) {
1546 			yf = getfval(y);
1547 			setsval(x, getsval(y));
1548 			x->fval = yf;
1549 			x->tval |= NUM;
1550 		}
1551 		else if (isstr(y))
1552 			setsval(x, getsval(y));
1553 		else if (isnum(y))
1554 			setfval(x, getfval(y));
1555 		else
1556 			funnyvar(y, "read value of");
1557 		tempfree(y);
1558 		return(x);
1559 	}
1560 	xf = getfval(x);
1561 	yf = getfval(y);
1562 	switch (n) {
1563 	case ADDEQ:
1564 		xf += yf;
1565 		break;
1566 	case SUBEQ:
1567 		xf -= yf;
1568 		break;
1569 	case MULTEQ:
1570 		xf *= yf;
1571 		break;
1572 	case DIVEQ:
1573 		if (yf == 0)
1574 			FATAL("division by zero in /=");
1575 		xf /= yf;
1576 		break;
1577 	case MODEQ:
1578 		if (yf == 0)
1579 			FATAL("division by zero in %%=");
1580 		modf(xf/yf, &v);
1581 		xf = xf - yf * v;
1582 		break;
1583 	case POWEQ:
1584 		if (yf >= 0 && modf(yf, &v) == 0.0)	/* pos integer exponent */
1585 			xf = ipow(xf, (int) yf);
1586                else {
1587 			errno = 0;
1588 			xf = errcheck(pow(xf, yf), "pow");
1589                }
1590 		break;
1591 	default:
1592 		FATAL("illegal assignment operator %d", n);
1593 		break;
1594 	}
1595 	tempfree(y);
1596 	setfval(x, xf);
1597 	return(x);
1598 }
1599 
cat(Node ** a,int q)1600 Cell *cat(Node **a, int q)	/* a[0] cat a[1] */
1601 {
1602 	Cell *x, *y, *z;
1603 	int n1, n2;
1604 	char *s = NULL;
1605 	int ssz = 0;
1606 
1607 	x = execute(a[0]);
1608 	n1 = strlen(getsval(x));
1609 	adjbuf(&s, &ssz, n1 + 1, recsize, 0, "cat1");
1610 	memcpy(s, x->sval, n1);
1611 
1612 	tempfree(x);
1613 
1614 	y = execute(a[1]);
1615 	n2 = strlen(getsval(y));
1616 	adjbuf(&s, &ssz, n1 + n2 + 1, recsize, 0, "cat2");
1617 	memcpy(s + n1, y->sval, n2);
1618 	s[n1 + n2] = '\0';
1619 
1620 	tempfree(y);
1621 
1622 	z = gettemp();
1623 	z->sval = s;
1624 	z->tval = STR;
1625 
1626 	return(z);
1627 }
1628 
pastat(Node ** a,int n)1629 Cell *pastat(Node **a, int n)	/* a[0] { a[1] } */
1630 {
1631 	Cell *x;
1632 
1633 	if (a[0] == NULL)
1634 		x = execute(a[1]);
1635 	else {
1636 		x = execute(a[0]);
1637 		if (istrue(x)) {
1638 			tempfree(x);
1639 			x = execute(a[1]);
1640 		}
1641 	}
1642 	return x;
1643 }
1644 
dopa2(Node ** a,int n)1645 Cell *dopa2(Node **a, int n)	/* a[0], a[1] { a[2] } */
1646 {
1647 	Cell *x;
1648 	int pair;
1649 
1650 	pair = ptoi(a[3]);
1651 	if (pairstack[pair] == 0) {
1652 		x = execute(a[0]);
1653 		if (istrue(x))
1654 			pairstack[pair] = 1;
1655 		tempfree(x);
1656 	}
1657 	if (pairstack[pair] == 1) {
1658 		x = execute(a[1]);
1659 		if (istrue(x))
1660 			pairstack[pair] = 0;
1661 		tempfree(x);
1662 		x = execute(a[2]);
1663 		return(x);
1664 	}
1665 	return(False);
1666 }
1667 
split(Node ** a,int nnn)1668 Cell *split(Node **a, int nnn)	/* split(a[0], a[1], a[2]); a[3] is type */
1669 {
1670 	Cell *x = NULL, *y, *ap;
1671 	const char *s, *origs, *t;
1672 	const char *fs = NULL;
1673 	char *origfs = NULL;
1674 	int sep;
1675 	char temp, num[50];
1676 	int n, tempstat, arg3type;
1677 	int j;
1678 	double result;
1679 
1680 	y = execute(a[0]);	/* source string */
1681 	origs = s = strdup(getsval(y));
1682 	tempfree(y);
1683 	arg3type = ptoi(a[3]);
1684 	if (a[2] == NULL) {		/* BUG: CSV should override implicit fs but not explicit */
1685 		fs = getsval(fsloc);
1686 	} else if (arg3type == STRING) {	/* split(str,arr,"string") */
1687 		x = execute(a[2]);
1688 		fs = origfs = strdup(getsval(x));
1689 		tempfree(x);
1690 	} else if (arg3type == REGEXPR) {
1691 		fs = "(regexpr)";	/* split(str,arr,/regexpr/) */
1692 	} else {
1693 		FATAL("illegal type of split");
1694 	}
1695 	sep = *fs;
1696 	ap = execute(a[1]);	/* array name */
1697 /* BUG 7/26/22: this appears not to reset array: see C1/asplit */
1698 	freesymtab(ap);
1699 	DPRINTF("split: s=|%s|, a=%s, sep=|%s|\n", s, NN(ap->nval), fs);
1700 	ap->tval &= ~STR;
1701 	ap->tval |= ARR;
1702 	ap->sval = (char *) makesymtab(NSYMTAB);
1703 
1704 	n = 0;
1705         if (arg3type == REGEXPR && strlen((char*)((fa*)a[2])->restr) == 0) {
1706 		/* split(s, a, //); have to arrange that it looks like empty sep */
1707 		arg3type = 0;
1708 		fs = "";
1709 		sep = 0;
1710 	}
1711 	if (*s != '\0' && (strlen(fs) > 1 || arg3type == REGEXPR)) {	/* reg expr */
1712 		fa *pfa;
1713 		if (arg3type == REGEXPR) {	/* it's ready already */
1714 			pfa = (fa *) a[2];
1715 		} else {
1716 			pfa = makedfa(fs, 1);
1717 		}
1718 		if (nematch(pfa,s)) {
1719 			tempstat = pfa->initstat;
1720 			pfa->initstat = 2;
1721 			do {
1722 				n++;
1723 				snprintf(num, sizeof(num), "%d", n);
1724 				temp = *patbeg;
1725 				setptr(patbeg, '\0');
1726 				if (is_number(s, & result))
1727 					setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1728 				else
1729 					setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1730 				setptr(patbeg, temp);
1731 				s = patbeg + patlen;
1732 				if (*(patbeg+patlen-1) == '\0' || *s == '\0') {
1733 					n++;
1734 					snprintf(num, sizeof(num), "%d", n);
1735 					setsymtab(num, "", 0.0, STR, (Array *) ap->sval);
1736 					pfa->initstat = tempstat;
1737 					goto spdone;
1738 				}
1739 			} while (nematch(pfa,s));
1740 			pfa->initstat = tempstat; 	/* bwk: has to be here to reset */
1741 							/* cf gsub and refldbld */
1742 		}
1743 		n++;
1744 		snprintf(num, sizeof(num), "%d", n);
1745 		if (is_number(s, & result))
1746 			setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1747 		else
1748 			setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1749   spdone:
1750 		pfa = NULL;
1751 
1752 	} else if (a[2] == NULL && CSV) {	/* CSV only if no explicit separator */
1753 		char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */
1754 		for (;;) {
1755 			char *fr = newt;
1756 			n++;
1757 			if (*s == '"' ) { /* start of "..." */
1758 				for (s++ ; *s != '\0'; ) {
1759 					if (*s == '"' && s[1] != '\0' && s[1] == '"') {
1760 						s += 2; /* doubled quote */
1761 						*fr++ = '"';
1762 					} else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) {
1763 						s++; /* skip over closing quote */
1764 						break;
1765 					} else {
1766 						*fr++ = *s++;
1767 					}
1768 				}
1769 				*fr++ = 0;
1770 			} else {	/* unquoted field */
1771 				while (*s != ',' && *s != '\0')
1772 					*fr++ = *s++;
1773 				*fr++ = 0;
1774 			}
1775 			snprintf(num, sizeof(num), "%d", n);
1776 			if (is_number(newt, &result))
1777 				setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval);
1778 			else
1779 				setsymtab(num, newt, 0.0, STR, (Array *) ap->sval);
1780 			if (*s++ == '\0')
1781 				break;
1782 		}
1783 		free(newt);
1784 
1785 	} else if (!CSV && sep == ' ') { /* usual case: split on white space */
1786 		for (n = 0; ; ) {
1787 #define ISWS(c)	((c) == ' ' || (c) == '\t' || (c) == '\n')
1788 			while (ISWS(*s))
1789 				s++;
1790 			if (*s == '\0')
1791 				break;
1792 			n++;
1793 			t = s;
1794 			do
1795 				s++;
1796 			while (*s != '\0' && !ISWS(*s));
1797 			temp = *s;
1798 			setptr(s, '\0');
1799 			snprintf(num, sizeof(num), "%d", n);
1800 			if (is_number(t, & result))
1801 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1802 			else
1803 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1804 			setptr(s, temp);
1805 			if (*s != '\0')
1806 				s++;
1807 		}
1808 
1809 	} else if (sep == 0) {	/* new: split(s, a, "") => 1 char/elem */
1810 		for (n = 0; *s != '\0'; s += u8_nextlen(s)) {
1811 			char buf[10];
1812 			n++;
1813 			snprintf(num, sizeof(num), "%d", n);
1814 
1815 			for (j = 0; j < u8_nextlen(s); j++) {
1816 				buf[j] = s[j];
1817 			}
1818 			buf[j] = '\0';
1819 
1820 			if (isdigit((uschar)buf[0]))
1821 				setsymtab(num, buf, atof(buf), STR|NUM, (Array *) ap->sval);
1822 			else
1823 				setsymtab(num, buf, 0.0, STR, (Array *) ap->sval);
1824 		}
1825 
1826 	} else if (*s != '\0') {  /* some random single character */
1827 		for (;;) {
1828 			n++;
1829 			t = s;
1830 			while (*s != sep && *s != '\0')
1831 				s++;
1832 			temp = *s;
1833 			setptr(s, '\0');
1834 			snprintf(num, sizeof(num), "%d", n);
1835 			if (is_number(t, & result))
1836 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1837 			else
1838 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1839 			setptr(s, temp);
1840 			if (*s++ == '\0')
1841 				break;
1842 		}
1843 	}
1844 	tempfree(ap);
1845 	xfree(origs);
1846 	xfree(origfs);
1847 	x = gettemp();
1848 	x->tval = NUM;
1849 	x->fval = n;
1850 	return(x);
1851 }
1852 
condexpr(Node ** a,int n)1853 Cell *condexpr(Node **a, int n)	/* a[0] ? a[1] : a[2] */
1854 {
1855 	Cell *x;
1856 
1857 	x = execute(a[0]);
1858 	if (istrue(x)) {
1859 		tempfree(x);
1860 		x = execute(a[1]);
1861 	} else {
1862 		tempfree(x);
1863 		x = execute(a[2]);
1864 	}
1865 	return(x);
1866 }
1867 
ifstat(Node ** a,int n)1868 Cell *ifstat(Node **a, int n)	/* if (a[0]) a[1]; else a[2] */
1869 {
1870 	Cell *x;
1871 
1872 	x = execute(a[0]);
1873 	if (istrue(x)) {
1874 		tempfree(x);
1875 		x = execute(a[1]);
1876 	} else if (a[2] != NULL) {
1877 		tempfree(x);
1878 		x = execute(a[2]);
1879 	}
1880 	return(x);
1881 }
1882 
whilestat(Node ** a,int n)1883 Cell *whilestat(Node **a, int n)	/* while (a[0]) a[1] */
1884 {
1885 	Cell *x;
1886 
1887 	for (;;) {
1888 		x = execute(a[0]);
1889 		if (!istrue(x))
1890 			return(x);
1891 		tempfree(x);
1892 		x = execute(a[1]);
1893 		if (isbreak(x)) {
1894 			x = True;
1895 			return(x);
1896 		}
1897 		if (isnext(x) || isexit(x) || isret(x))
1898 			return(x);
1899 		tempfree(x);
1900 	}
1901 }
1902 
dostat(Node ** a,int n)1903 Cell *dostat(Node **a, int n)	/* do a[0]; while(a[1]) */
1904 {
1905 	Cell *x;
1906 
1907 	for (;;) {
1908 		x = execute(a[0]);
1909 		if (isbreak(x))
1910 			return True;
1911 		if (isnext(x) || isexit(x) || isret(x))
1912 			return(x);
1913 		tempfree(x);
1914 		x = execute(a[1]);
1915 		if (!istrue(x))
1916 			return(x);
1917 		tempfree(x);
1918 	}
1919 }
1920 
forstat(Node ** a,int n)1921 Cell *forstat(Node **a, int n)	/* for (a[0]; a[1]; a[2]) a[3] */
1922 {
1923 	Cell *x;
1924 
1925 	x = execute(a[0]);
1926 	tempfree(x);
1927 	for (;;) {
1928 		if (a[1]!=NULL) {
1929 			x = execute(a[1]);
1930 			if (!istrue(x)) return(x);
1931 			else tempfree(x);
1932 		}
1933 		x = execute(a[3]);
1934 		if (isbreak(x))		/* turn off break */
1935 			return True;
1936 		if (isnext(x) || isexit(x) || isret(x))
1937 			return(x);
1938 		tempfree(x);
1939 		x = execute(a[2]);
1940 		tempfree(x);
1941 	}
1942 }
1943 
instat(Node ** a,int n)1944 Cell *instat(Node **a, int n)	/* for (a[0] in a[1]) a[2] */
1945 {
1946 	Cell *x, *vp, *arrayp, *cp, *ncp;
1947 	Array *tp;
1948 	int i;
1949 
1950 	vp = execute(a[0]);
1951 	arrayp = execute(a[1]);
1952 	if (!isarr(arrayp)) {
1953 		return True;
1954 	}
1955 	tp = (Array *) arrayp->sval;
1956 	tempfree(arrayp);
1957 	for (i = 0; i < tp->size; i++) {	/* this routine knows too much */
1958 		for (cp = tp->tab[i]; cp != NULL; cp = ncp) {
1959 			setsval(vp, cp->nval);
1960 			ncp = cp->cnext;
1961 			x = execute(a[2]);
1962 			if (isbreak(x)) {
1963 				tempfree(vp);
1964 				return True;
1965 			}
1966 			if (isnext(x) || isexit(x) || isret(x)) {
1967 				tempfree(vp);
1968 				return(x);
1969 			}
1970 			tempfree(x);
1971 		}
1972 	}
1973 	return True;
1974 }
1975 
nawk_convert(const char * s,int (* fun_c)(int),wint_t (* fun_wc)(wint_t))1976 static char *nawk_convert(const char *s, int (*fun_c)(int),
1977     wint_t (*fun_wc)(wint_t))
1978 {
1979 	char *buf      = NULL;
1980 	char *pbuf     = NULL;
1981 	const char *ps = NULL;
1982 	size_t n       = 0;
1983 	wchar_t wc;
1984 	const size_t sz = awk_mb_cur_max;
1985 	int unused;
1986 
1987 	if (sz == 1) {
1988 		buf = tostring(s);
1989 
1990 		for (pbuf = buf; *pbuf; pbuf++)
1991 			*pbuf = fun_c((uschar)*pbuf);
1992 
1993 		return buf;
1994 	} else {
1995 		/* upper/lower character may be shorter/longer */
1996 		buf = tostringN(s, strlen(s) * sz + 1);
1997 
1998 		(void) mbtowc(NULL, NULL, 0);	/* reset internal state */
1999 		/*
2000 		 * Reset internal state here too.
2001 		 * Assign result to avoid a compiler warning. (Casting to void
2002 		 * doesn't work.)
2003 		 * Increment said variable to avoid a different warning.
2004 		 */
2005 		unused = wctomb(NULL, L'\0');
2006 		unused++;
2007 
2008 		ps   = s;
2009 		pbuf = buf;
2010 		while (n = mbtowc(&wc, ps, sz),
2011 		       n > 0 && n != (size_t)-1 && n != (size_t)-2)
2012 		{
2013 			ps += n;
2014 
2015 			n = wctomb(pbuf, fun_wc(wc));
2016 			if (n == (size_t)-1)
2017 				FATAL("illegal wide character %s", s);
2018 
2019 			pbuf += n;
2020 		}
2021 
2022 		*pbuf = '\0';
2023 
2024 		if (n)
2025 			FATAL("illegal byte sequence %s", s);
2026 
2027 		return buf;
2028 	}
2029 }
2030 
#ifdef __DJGPP__
/*
 * Stand-ins for towupper()/towlower(), compiled only under DJGPP:
 * values in 0..255 go through the byte-oriented toupper()/tolower(),
 * anything else is returned unchanged.
 */
static wint_t towupper(wint_t wc)
{
	return (wc >= 0 && wc < 256) ? toupper(wc & 0xFF) : wc;
}

static wint_t towlower(wint_t wc)
{
	return (wc >= 0 && wc < 256) ? tolower(wc & 0xFF) : wc;
}
#endif
2048 
/* nawk_toupper: upper-cased copy of s, produced by nawk_convert(). */
static char *nawk_toupper(const char *s)
{
	char *upper = nawk_convert(s, toupper, towupper);

	return upper;
}
2053 
/* nawk_tolower: lower-cased copy of s, produced by nawk_convert(). */
static char *nawk_tolower(const char *s)
{
	char *lower = nawk_convert(s, tolower, towlower);

	return lower;
}
2058 
2059 
2060 
bltin(Node ** a,int n)2061 Cell *bltin(Node **a, int n)	/* builtin functions. a[0] is type, a[1] is arg list */
2062 {
2063 	Cell *x, *y;
2064 	Awkfloat u;
2065 	int t, sz;
2066 	Awkfloat tmp;
2067 	char *buf, *fmt;
2068 	Node *nextarg;
2069 	FILE *fp;
2070 	int status = 0;
2071 	time_t tv;
2072 	struct tm *tm;
2073 	int estatus = 0;
2074 
2075 	t = ptoi(a[0]);
2076 	x = execute(a[1]);
2077 	nextarg = a[1]->nnext;
2078 	switch (t) {
2079 	case FLENGTH:
2080 		if (isarr(x))
2081 			u = ((Array *) x->sval)->nelem;	/* GROT.  should be function*/
2082 		else
2083 			u = u8_strlen(getsval(x));
2084 		break;
2085 	case FLOG:
2086 		errno = 0;
2087 		u = errcheck(log(getfval(x)), "log");
2088 		break;
2089 	case FINT:
2090 		modf(getfval(x), &u); break;
2091 	case FEXP:
2092 		errno = 0;
2093 		u = errcheck(exp(getfval(x)), "exp");
2094 		break;
2095 	case FSQRT:
2096 		errno = 0;
2097 		u = errcheck(sqrt(getfval(x)), "sqrt");
2098 		break;
2099 	case FSIN:
2100 		u = sin(getfval(x)); break;
2101 	case FCOS:
2102 		u = cos(getfval(x)); break;
2103 	case FATAN:
2104 		if (nextarg == NULL) {
2105 			WARNING("atan2 requires two arguments; returning 1.0");
2106 			u = 1.0;
2107 		} else {
2108 			y = execute(a[1]->nnext);
2109 			u = atan2(getfval(x), getfval(y));
2110 			tempfree(y);
2111 			nextarg = nextarg->nnext;
2112 		}
2113 		break;
2114 	case FCOMPL:
2115 		u = ~((int)getfval(x));
2116 		break;
2117 	case FAND:
2118 		if (nextarg == 0) {
2119 			WARNING("and requires two arguments; returning 0");
2120 			u = 0;
2121 			break;
2122 		}
2123 		y = execute(a[1]->nnext);
2124 		u = ((int)getfval(x)) & ((int)getfval(y));
2125 		tempfree(y);
2126 		nextarg = nextarg->nnext;
2127 		break;
2128 	case FFOR:
2129 		if (nextarg == 0) {
2130 			WARNING("or requires two arguments; returning 0");
2131 			u = 0;
2132 			break;
2133 		}
2134 		y = execute(a[1]->nnext);
2135 		u = ((int)getfval(x)) | ((int)getfval(y));
2136 		tempfree(y);
2137 		nextarg = nextarg->nnext;
2138 		break;
2139 	case FXOR:
2140 		if (nextarg == 0) {
2141 			WARNING("xor requires two arguments; returning 0");
2142 			u = 0;
2143 			break;
2144 		}
2145 		y = execute(a[1]->nnext);
2146 		u = ((int)getfval(x)) ^ ((int)getfval(y));
2147 		tempfree(y);
2148 		nextarg = nextarg->nnext;
2149 		break;
2150 	case FLSHIFT:
2151 		if (nextarg == 0) {
2152 			WARNING("lshift requires two arguments; returning 0");
2153 			u = 0;
2154 			break;
2155 		}
2156 		y = execute(a[1]->nnext);
2157 		u = ((int)getfval(x)) << ((int)getfval(y));
2158 		tempfree(y);
2159 		nextarg = nextarg->nnext;
2160 		break;
2161 	case FRSHIFT:
2162 		if (nextarg == 0) {
2163 			WARNING("rshift requires two arguments; returning 0");
2164 			u = 0;
2165 			break;
2166 		}
2167 		y = execute(a[1]->nnext);
2168 		u = ((int)getfval(x)) >> ((int)getfval(y));
2169 		tempfree(y);
2170 		nextarg = nextarg->nnext;
2171 		break;
2172 	case FSYSTEM:
2173 		fflush(stdout);		/* in case something is buffered already */
2174 		estatus = status = system(getsval(x));
2175 		if (status != -1) {
2176 			if (WIFEXITED(status)) {
2177 				estatus = WEXITSTATUS(status);
2178 			} else if (WIFSIGNALED(status)) {
2179 				estatus = WTERMSIG(status) + 256;
2180 #ifdef WCOREDUMP
2181 				if (WCOREDUMP(status))
2182 					estatus += 256;
2183 #endif
2184 			} else	/* something else?!? */
2185 				estatus = 0;
2186 		}
2187 		/* else estatus was set to -1 */
2188 		u = estatus;
2189 		break;
2190 	case FRAND:
2191 		/* random() returns numbers in [0..2^31-1]
2192 		 * in order to get a number in [0, 1), divide it by 2^31
2193 		 */
2194 		u = (Awkfloat) random() / (0x7fffffffL + 0x1UL);
2195 		break;
2196 	case FSRAND:
2197 		if (isrec(x))	/* no argument provided */
2198 			u = time((time_t *)0);
2199 		else
2200 			u = getfval(x);
2201 		tmp = u;
2202 		srandom((unsigned long) u);
2203 		u = srand_seed;
2204 		srand_seed = tmp;
2205 		break;
2206 	case FTOUPPER:
2207 	case FTOLOWER:
2208 		if (t == FTOUPPER)
2209 			buf = nawk_toupper(getsval(x));
2210 		else
2211 			buf = nawk_tolower(getsval(x));
2212 		tempfree(x);
2213 		x = gettemp();
2214 		setsval(x, buf);
2215 		free(buf);
2216 		return x;
2217 	case FFLUSH:
2218 		if (isrec(x) || strlen(getsval(x)) == 0) {
2219 			flush_all();	/* fflush() or fflush("") -> all */
2220 			u = 0;
2221 		} else if ((fp = openfile(FFLUSH, getsval(x), NULL)) == NULL)
2222 			u = EOF;
2223 		else
2224 			u = fflush(fp);
2225 		break;
2226 	case FSYSTIME:
2227 		u = time((time_t *) 0);
2228 		break;
2229 	case FSTRFTIME:
2230 		/* strftime([format [,timestamp]]) */
2231 		if (nextarg) {
2232 			y = execute(nextarg);
2233 			nextarg = nextarg->nnext;
2234 			tv = (time_t) getfval(y);
2235 			tempfree(y);
2236 		} else
2237 			tv = time((time_t *) 0);
2238 		tm = localtime(&tv);
2239 		if (tm == NULL)
2240 			FATAL("bad time %ld", (long)tv);
2241 
2242 		if (isrec(x)) {
2243 			/* format argument not provided, use default */
2244 			fmt = tostring("%a %b %d %H:%M:%S %Z %Y");
2245 		} else
2246 			fmt = tostring(getsval(x));
2247 
2248 		sz = 32;
2249 		buf = NULL;
2250 		do {
2251 			if ((buf = realloc(buf, (sz *= 2))) == NULL)
2252 				FATAL("out of memory in strftime");
2253 		} while (strftime(buf, sz, fmt, tm) == 0 && fmt[0] != '\0');
2254 
2255 		y = gettemp();
2256 		setsval(y, buf);
2257 		free(fmt);
2258 		free(buf);
2259 
2260 		return y;
2261 	default:	/* can't happen */
2262 		FATAL("illegal function type %d", t);
2263 		break;
2264 	}
2265 	tempfree(x);
2266 	x = gettemp();
2267 	setfval(x, u);
2268 	if (nextarg != NULL) {
2269 		WARNING("warning: function has too many arguments");
2270 		for ( ; nextarg; nextarg = nextarg->nnext) {
2271 			y = execute(nextarg);
2272 			tempfree(y);
2273 		}
2274 	}
2275 	return(x);
2276 }
2277 
printstat(Node ** a,int n)2278 Cell *printstat(Node **a, int n)	/* print a[0] */
2279 {
2280 	Node *x;
2281 	Cell *y;
2282 	FILE *fp;
2283 
2284 	if (a[1] == NULL)	/* a[1] is redirection operator, a[2] is file */
2285 		fp = stdout;
2286 	else
2287 		fp = redirect(ptoi(a[1]), a[2]);
2288 	for (x = a[0]; x != NULL; x = x->nnext) {
2289 		y = execute(x);
2290 		fputs(getpssval(y), fp);
2291 		tempfree(y);
2292 		if (x->nnext == NULL)
2293 			fputs(getsval(orsloc), fp);
2294 		else
2295 			fputs(getsval(ofsloc), fp);
2296 	}
2297 	if (a[1] != NULL)
2298 		fflush(fp);
2299 	if (ferror(fp))
2300 		FATAL("write error on %s", filename(fp));
2301 	return(True);
2302 }
2303 
nullproc(Node ** a,int n)2304 Cell *nullproc(Node **a, int n)
2305 {
2306 	return 0;
2307 }
2308 
2309 
/*
 * redirect - resolve the target of an i/o redirection.
 *
 * Evaluates b to obtain the file or command name, then asks openfile()
 * for a stream of redirection kind a.  Failure to obtain a stream is
 * fatal, so the return value is never NULL.
 */
FILE *redirect(int a, Node *b)	/* set up all i/o redirections */
{
	Cell *target = execute(b);
	char *path = getsval(target);
	FILE *stream = openfile(a, path, NULL);

	if (stream == NULL)
		FATAL("can't open file %s", path);
	/* release the name cell only after its string is no longer needed */
	tempfree(target);
	return stream;
}
2324 
/*
 * One slot in the table of streams opened by redirections.
 * A slot whose fp is NULL is free for reuse.
 */
struct files {
	FILE	*fp;		/* the open stream, NULL when the slot is unused */
	const char	*fname;	/* name the stream was opened under */
	int	mode;		/* '|', 'a', 'w' => LE/LT, GT */
};

/* the table itself; allocated by stdinit() and grown by openfile() */
struct files *files;

/* number of slots currently allocated in files[] */
size_t nfiles;
2332 
stdinit(void)2333 static void stdinit(void)	/* in case stdin, etc., are not constants */
2334 {
2335 	nfiles = FOPEN_MAX;
2336 	files = (struct files *) calloc(nfiles, sizeof(*files));
2337 	if (files == NULL)
2338 		FATAL("can't allocate file memory for %zu files", nfiles);
2339         files[0].fp = stdin;
2340 	files[0].fname = tostring("/dev/stdin");
2341 	files[0].mode = LT;
2342         files[1].fp = stdout;
2343 	files[1].fname = tostring("/dev/stdout");
2344 	files[1].mode = GT;
2345         files[2].fp = stderr;
2346 	files[2].fname = tostring("/dev/stderr");
2347 	files[2].mode = GT;
2348 }
2349 
openfile(int a,const char * us,bool * pnewflag)2350 FILE *openfile(int a, const char *us, bool *pnewflag)
2351 {
2352 	const char *s = us;
2353 	size_t i;
2354 	int m;
2355 	FILE *fp = NULL;
2356 
2357 	if (*s == '\0')
2358 		FATAL("null file name in print or getline");
2359 	for (i = 0; i < nfiles; i++)
2360 		if (files[i].fname && strcmp(s, files[i].fname) == 0 &&
2361 		    (a == files[i].mode || (a==APPEND && files[i].mode==GT) ||
2362 		     a == FFLUSH)) {
2363 			if (pnewflag)
2364 				*pnewflag = false;
2365 			return files[i].fp;
2366 		}
2367 	if (a == FFLUSH)	/* didn't find it, so don't create it! */
2368 		return NULL;
2369 
2370 	for (i = 0; i < nfiles; i++)
2371 		if (files[i].fp == NULL)
2372 			break;
2373 	if (i >= nfiles) {
2374 		struct files *nf;
2375 		size_t nnf = nfiles + FOPEN_MAX;
2376 		nf = (struct files *) realloc(files, nnf * sizeof(*nf));
2377 		if (nf == NULL)
2378 			FATAL("cannot grow files for %s and %zu files", s, nnf);
2379 		memset(&nf[nfiles], 0, FOPEN_MAX * sizeof(*nf));
2380 		nfiles = nnf;
2381 		files = nf;
2382 	}
2383 	fflush(stdout);	/* force a semblance of order */
2384 	m = a;
2385 	if (a == GT) {
2386 		fp = fopen(s, "w");
2387 	} else if (a == APPEND) {
2388 		fp = fopen(s, "a");
2389 		m = GT;	/* so can mix > and >> */
2390 	} else if (a == '|') {	/* output pipe */
2391 		fp = popen(s, "w");
2392 	} else if (a == LE) {	/* input pipe */
2393 		fp = popen(s, "r");
2394 	} else if (a == LT) {	/* getline <file */
2395 		fp = strcmp(s, "-") == 0 ? stdin : fopen(s, "r");	/* "-" is stdin */
2396 	} else	/* can't happen */
2397 		FATAL("illegal redirection %d", a);
2398 	if (fp != NULL) {
2399 		files[i].fname = tostring(s);
2400 		files[i].fp = fp;
2401 		files[i].mode = m;
2402 		if (pnewflag)
2403 			*pnewflag = true;
2404 		if (fp != stdin && fp != stdout && fp != stderr)
2405 			(void) fcntl(fileno(fp), F_SETFD, FD_CLOEXEC);
2406 	}
2407 	return fp;
2408 }
2409 
filename(FILE * fp)2410 const char *filename(FILE *fp)
2411 {
2412 	size_t i;
2413 
2414 	for (i = 0; i < nfiles; i++)
2415 		if (fp == files[i].fp)
2416 			return files[i].fname;
2417 	return "???";
2418 }
2419 
closefile(Node ** a,int n)2420 Cell *closefile(Node **a, int n)
2421 {
2422  	Cell *x;
2423 	size_t i;
2424 	bool stat;
2425 
2426  	x = execute(a[0]);
2427  	getsval(x);
2428 	stat = true;
2429  	for (i = 0; i < nfiles; i++) {
2430 		if (!files[i].fname || strcmp(x->sval, files[i].fname) != 0)
2431 			continue;
2432 		if (files[i].mode == GT || files[i].mode == '|')
2433 			fflush(files[i].fp);
2434 		if (ferror(files[i].fp)) {
2435 			if ((files[i].mode == GT && files[i].fp != stderr)
2436 			  || files[i].mode == '|')
2437 				FATAL("write error on %s", files[i].fname);
2438 			else
2439 				WARNING("i/o error occurred on %s", files[i].fname);
2440 		}
2441 		if (files[i].fp == stdin || files[i].fp == stdout ||
2442 		    files[i].fp == stderr)
2443 			stat = freopen("/dev/null", "r+", files[i].fp) == NULL;
2444 		else if (files[i].mode == '|' || files[i].mode == LE)
2445 			stat = pclose(files[i].fp) == -1;
2446 		else
2447 			stat = fclose(files[i].fp) == EOF;
2448 		if (stat)
2449 			WARNING("i/o error occurred closing %s", files[i].fname);
2450 		xfree(files[i].fname);
2451 		files[i].fname = NULL;	/* watch out for ref thru this */
2452 		files[i].fp = NULL;
2453 		break;
2454  	}
2455  	tempfree(x);
2456  	x = gettemp();
2457 	setfval(x, (Awkfloat) (stat ? -1 : 0));
2458  	return(x);
2459 }
2460 
closeall(void)2461 void closeall(void)
2462 {
2463 	size_t i;
2464 	bool stat = false;
2465 
2466 	for (i = 0; i < nfiles; i++) {
2467 		if (! files[i].fp)
2468 			continue;
2469 		if (files[i].mode == GT || files[i].mode == '|')
2470 			fflush(files[i].fp);
2471 		if (ferror(files[i].fp)) {
2472 			if ((files[i].mode == GT && files[i].fp != stderr)
2473 			  || files[i].mode == '|')
2474 				FATAL("write error on %s", files[i].fname);
2475 			else
2476 				WARNING("i/o error occurred on %s", files[i].fname);
2477 		}
2478 		if (files[i].fp == stdin || files[i].fp == stdout ||
2479 		    files[i].fp == stderr)
2480 			continue;
2481 		if (files[i].mode == '|' || files[i].mode == LE)
2482 			stat = pclose(files[i].fp) == -1;
2483 		else
2484 			stat = fclose(files[i].fp) == EOF;
2485 		if (stat)
2486 			WARNING("i/o error occurred while closing %s", files[i].fname);
2487 	}
2488 }
2489 
flush_all(void)2490 static void flush_all(void)
2491 {
2492 	size_t i;
2493 
2494 	for (i = 0; i < nfiles; i++)
2495 		if (files[i].fp)
2496 			fflush(files[i].fp);
2497 }
2498 
2499 void backsub(char **pb_ptr, const char **sptr_ptr);
2500 
dosub(Node ** a,int subop)2501 Cell *dosub(Node **a, int subop)        /* sub and gsub */
2502 {
2503 	fa *pfa;
2504 	int tempstat = 0;
2505 	char *repl;
2506 	Cell *x;
2507 
2508 	char *buf = NULL;
2509 	char *pb = NULL;
2510 	int bufsz = recsize;
2511 
2512 	const char *r, *s;
2513 	const char *start;
2514 	const char *noempty = NULL;      /* empty match disallowed here */
2515 	size_t m = 0;                    /* match count */
2516 	size_t whichm;                   /* which match to select, 0 = global */
2517 	int mtype;                       /* match type */
2518 
2519 	if (a[0] == NULL) {	/* 0 => a[1] is already-compiled regexpr */
2520 		pfa = (fa *) a[1];
2521 	} else {
2522 		x = execute(a[1]);
2523 		pfa = makedfa(getsval(x), 1);
2524 		tempfree(x);
2525 	}
2526 
2527 	x = execute(a[2]);	/* replacement string */
2528 	repl = tostring(getsval(x));
2529 	tempfree(x);
2530 
2531 	switch (subop) {
2532 	case SUB:
2533 		whichm = 1;
2534 		x = execute(a[3]);    /* source string */
2535 		break;
2536 	case GSUB:
2537 		whichm = 0;
2538 		x = execute(a[3]);    /* source string */
2539 		break;
2540 	default:
2541 		FATAL("dosub: unrecognized subop: %d", subop);
2542 	}
2543 
2544 	start = getsval(x);
2545 	while (pmatch(pfa, start)) {
2546 		if (buf == NULL) {
2547 			if ((pb = buf = (char *) malloc(bufsz)) == NULL)
2548 				FATAL("out of memory in dosub");
2549 			tempstat = pfa->initstat;
2550 			pfa->initstat = 2;
2551 		}
2552 
2553 		/* match types */
2554 		#define	MT_IGNORE  0  /* unselected or invalid */
2555 		#define MT_INSERT  1  /* selected, empty */
2556 		#define MT_REPLACE 2  /* selected, not empty */
2557 
2558 		/* an empty match just after replacement is invalid */
2559 
2560 		if (patbeg == noempty && patlen == 0) {
2561 			mtype = MT_IGNORE;    /* invalid, not counted */
2562 		} else if (whichm == ++m || whichm == 0) {
2563 			mtype = patlen ? MT_REPLACE : MT_INSERT;
2564 		} else {
2565 			mtype = MT_IGNORE;    /* unselected, but counted */
2566 		}
2567 
2568 		/* leading text: */
2569 		if (patbeg > start) {
2570 			adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - start),
2571 				recsize, &pb, "dosub");
2572 			s = start;
2573 			while (s < patbeg)
2574 				*pb++ = *s++;
2575 		}
2576 
2577 		if (mtype == MT_IGNORE)
2578 			goto matching_text;  /* skip replacement text */
2579 
2580 		r = repl;
2581 		while (*r != 0) {
2582 			adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "dosub");
2583 			if (*r == '\\') {
2584 				backsub(&pb, &r);
2585 			} else if (*r == '&') {
2586 				r++;
2587 				adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize,
2588 					&pb, "dosub");
2589 				for (s = patbeg; s < patbeg+patlen; )
2590 					*pb++ = *s++;
2591 			} else {
2592 				*pb++ = *r++;
2593 			}
2594 		}
2595 
2596 matching_text:
2597 		if (mtype == MT_REPLACE || *patbeg == '\0')
2598 			goto next_search;  /* skip matching text */
2599 
2600 		if (patlen == 0)
2601 			patlen = u8_nextlen(patbeg);
2602 		adjbuf(&buf, &bufsz, (pb-buf) + patlen, recsize, &pb, "dosub");
2603 		s = patbeg;
2604 		while (s < patbeg + patlen)
2605 			*pb++ = *s++;
2606 
2607 next_search:
2608 		start = patbeg + patlen;
2609 		if (m == whichm || *patbeg == '\0')
2610 			break;
2611 		if (mtype == MT_REPLACE)
2612 			noempty = start;
2613 
2614 		#undef MT_IGNORE
2615 		#undef MT_INSERT
2616 		#undef MT_REPLACE
2617 	}
2618 
2619 	xfree(repl);
2620 
2621 	if (buf != NULL) {
2622 		pfa->initstat = tempstat;
2623 
2624 		/* trailing text */
2625 		adjbuf(&buf, &bufsz, 1+strlen(start)+pb-buf, 0, &pb, "dosub");
2626 		while ((*pb++ = *start++) != '\0')
2627 			;
2628 
2629 		setsval(x, buf);
2630 		free(buf);
2631 	}
2632 
2633 	tempfree(x);
2634 	x = gettemp();
2635 	x->tval = NUM;
2636 	x->fval = m;
2637 	return x;
2638 }
2639 
gensub(Node ** a,int nnn)2640 Cell *gensub(Node **a, int nnn)	/* global selective substitute */
2641 	/* XXX incomplete - doesn't support backreferences \0 ... \9 */
2642 {
2643 	Cell *x, *y, *res, *h;
2644 	char *rptr;
2645 	const char *sptr;
2646 	char *buf, *pb;
2647 	const char *t, *q;
2648 	fa *pfa;
2649 	int mflag, tempstat, num, whichm;
2650 	int bufsz = recsize;
2651 
2652 	if ((buf = malloc(bufsz)) == NULL)
2653 		FATAL("out of memory in gensub");
2654 	mflag = 0;	/* if mflag == 0, can replace empty string */
2655 	num = 0;
2656 	x = execute(a[4]);	/* source string */
2657 	t = getsval(x);
2658 	res = copycell(x);	/* target string - initially copy of source */
2659 	res->csub = CTEMP;	/* result values are temporary */
2660 	if (a[0] == 0)		/* 0 => a[1] is already-compiled regexpr */
2661 		pfa = (fa *) a[1];	/* regular expression */
2662 	else {
2663 		y = execute(a[1]);
2664 		pfa = makedfa(getsval(y), 1);
2665 		tempfree(y);
2666 	}
2667 	y = execute(a[2]);	/* replacement string */
2668 	h = execute(a[3]);	/* which matches should be replaced */
2669 	sptr = getsval(h);
2670 	if (sptr[0] == 'g' || sptr[0] == 'G')
2671 		whichm = -1;
2672 	else {
2673 		/*
2674 		 * The specified number is index of replacement, starting
2675 		 * from 1. GNU awk treats index lower than 0 same as
2676 		 * 1, we do same for compatibility.
2677 		 */
2678 		whichm = (int) getfval(h) - 1;
2679 		if (whichm < 0)
2680 			whichm = 0;
2681 	}
2682 	tempfree(h);
2683 
2684 	if (pmatch(pfa, t)) {
2685 		char *sl;
2686 
2687 		tempstat = pfa->initstat;
2688 		pfa->initstat = 2;
2689 		pb = buf;
2690 		rptr = getsval(y);
2691 		/*
2692 		 * XXX if there are any backreferences in subst string,
2693 		 * complain now.
2694 		 */
2695 		for (sl = rptr; (sl = strchr(sl, '\\')) && sl[1]; sl++) {
2696 			if (strchr("0123456789", sl[1])) {
2697 				FATAL("gensub doesn't support backreferences (subst \"%s\")", rptr);
2698 			}
2699 		}
2700 
2701 		do {
2702 			if (whichm >= 0 && whichm != num) {
2703 				num++;
2704 				adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - t) + patlen, recsize, &pb, "gensub");
2705 
2706 				/* copy the part of string up to and including
2707 				 * match to output buffer */
2708 				while (t < patbeg + patlen)
2709 					*pb++ = *t++;
2710 				continue;
2711 			}
2712 
2713 			if (patlen == 0 && *patbeg != 0) {	/* matched empty string */
2714 				if (mflag == 0) {	/* can replace empty */
2715 					num++;
2716 					sptr = rptr;
2717 					while (*sptr != 0) {
2718 						adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
2719 						if (*sptr == '\\') {
2720 							backsub(&pb, &sptr);
2721 						} else if (*sptr == '&') {
2722 							sptr++;
2723 							adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
2724 							for (q = patbeg; q < patbeg+patlen; )
2725 								*pb++ = *q++;
2726 						} else
2727 							*pb++ = *sptr++;
2728 					}
2729 				}
2730 				if (*t == 0)	/* at end */
2731 					goto done;
2732 				adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gensub");
2733 				*pb++ = *t++;
2734 				if (pb > buf + bufsz)	/* BUG: not sure of this test */
2735 					FATAL("gensub result0 %.30s too big; can't happen", buf);
2736 				mflag = 0;
2737 			}
2738 			else {	/* matched nonempty string */
2739 				num++;
2740 				sptr = t;
2741 				adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gensub");
2742 				while (sptr < patbeg)
2743 					*pb++ = *sptr++;
2744 				sptr = rptr;
2745 				while (*sptr != 0) {
2746 					adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
2747 					if (*sptr == '\\') {
2748 						backsub(&pb, &sptr);
2749 					} else if (*sptr == '&') {
2750 						sptr++;
2751 						adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
2752 						for (q = patbeg; q < patbeg+patlen; )
2753 							*pb++ = *q++;
2754 					} else
2755 						*pb++ = *sptr++;
2756 				}
2757 				t = patbeg + patlen;
2758 				if (patlen == 0 || *t == 0 || *(t-1) == 0)
2759 					goto done;
2760 				if (pb > buf + bufsz)
2761 					FATAL("gensub result1 %.30s too big; can't happen", buf);
2762 				mflag = 1;
2763 			}
2764 		} while (pmatch(pfa,t));
2765 		sptr = t;
2766 		adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gensub");
2767 		while ((*pb++ = *sptr++) != 0)
2768 			;
2769 	done:	if (pb > buf + bufsz)
2770 			FATAL("gensub result2 %.30s too big; can't happen", buf);
2771 		*pb = '\0';
2772 		setsval(res, buf);
2773 		pfa->initstat = tempstat;
2774 	}
2775 	tempfree(x);
2776 	tempfree(y);
2777 	free(buf);
2778 	return(res);
2779 }
2780 
/*
 * backsub - translate one backslash sequence of a replacement string.
 *
 * On entry (*sptr_ptr)[0] is a backslash.  The translated bytes (at
 * most two) are stored at *pb_ptr, and both pointers are advanced past
 * what was written and consumed:
 *
 *	\\\&  ->  \&   (four characters consumed)
 *	\\&   ->  \    (two consumed; the '&' is left for the caller)
 *	\\x   ->  \    when POSIXLY_CORRECT is set in the environment,
 *	          \\   otherwise (two consumed, x left for the caller)
 *	\&    ->  &    (literal ampersand)
 *	\x    ->  \    (one consumed, x left for the caller)
 *
 * The environment is consulted only on the first call.
 */
void backsub(char **pb_ptr, const char **sptr_ptr)	/* handle \\& variations */
{						/* sptr[0] == '\\' */
	static bool env_checked = false;
	static bool posix_mode = false;
	char *out = *pb_ptr;
	const char *in = *sptr_ptr;

	if (!env_checked) {
		env_checked = true;
		posix_mode = (getenv("POSIXLY_CORRECT") != NULL);
	}

	switch (in[1]) {
	case '&':	/* literal & */
		*out++ = in[1];
		in += 2;
		break;
	case '\\':
		if (in[2] == '\\' && in[3] == '&') {	/* \\\& -> \& */
			*out++ = '\\';
			*out++ = '&';
			in += 4;
		} else if (in[2] == '&') {	/* \\& -> \ + matched */
			*out++ = '\\';
			in += 2;
		} else if (posix_mode) {	/* \\x -> \x */
			*out++ = in[1];
			in += 2;
		} else {			/* \\x -> \\x */
			*out++ = in[0];
			*out++ = in[1];
			in += 2;
		}
		break;
	default:	/* literal \ */
		*out++ = *in++;
		break;
	}

	*pb_ptr = out;
	*sptr_ptr = in;
}
2817 
/*
 * wide_char_to_byte_str - encode a code point as a UTF-8 byte string.
 *
 * rune    code point to encode; values below 0 or above 0x10FFFF are
 *         rejected
 * outlen  receives the number of encoded bytes (1..4), not counting
 *         the terminating NUL; not written when the rune is rejected
 *
 * Returns a pointer to a NUL-terminated static buffer that is
 * overwritten by the next call, or NULL if rune is out of range.
 */
static char *wide_char_to_byte_str(int rune, size_t *outlen)
{
	static char buf[5];
	unsigned int lead;	/* marker bits of the first byte */
	int nbytes;
	int k;

	if (rune < 0 || rune > 0x10FFFF)
		return NULL;

	if (rune <= 0x0000007F) {
		nbytes = 1;		/* 0xxxxxxx */
		lead = 0x00;
	} else if (rune <= 0x000007FF) {
		nbytes = 2;		/* 110xxxxx 10xxxxxx */
		lead = 0xC0;
	} else if (rune <= 0x0000FFFF) {
		nbytes = 3;		/* 1110xxxx 10xxxxxx 10xxxxxx */
		lead = 0xE0;
	} else {
		nbytes = 4;		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		lead = 0xF0;
	}

	/* clearing the whole buffer also provides the terminating NUL */
	memset(buf, 0, sizeof(buf));

	/* continuation bytes carry six bits each, filled last to first */
	for (k = nbytes - 1; k > 0; k--) {
		buf[k] = (char) (0x80 | (rune & 0x3F));
		rune >>= 6;
	}
	/* whatever bits remain go into the lead byte */
	buf[0] = (char) (lead | (unsigned int) rune);

	*outlen = nbytes;
	return buf;
}
2855