xref: /dragonfly/contrib/awk/run.c (revision 35e996c9)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #define DEBUG
26 #include <stdio.h>
27 #include <ctype.h>
28 #include <errno.h>
29 #include <wctype.h>
30 #include <fcntl.h>
31 #include <setjmp.h>
32 #include <limits.h>
33 #include <math.h>
34 #include <string.h>
35 #include <stdlib.h>
36 #include <time.h>
37 #include <sys/types.h>
38 #include <sys/wait.h>
39 #include "awk.h"
40 #include "awkgram.tab.h"
41 
42 
/* Forward declarations of file-local (static) helpers. */
static void stdinit(void);
static void flush_all(void);
static char *wide_char_to_byte_str(int rune, size_t *outlen);

/*
 * tempfree(x): hand x to tfree() only when istemp(x) holds; otherwise do
 * nothing.  The macro form is the one that is compiled (#if 1).  The
 * function form in the #else branch does the same but first warns when an
 * OCELL has a csub outside the range CUNK..CFREE.
 */
#if 1
#define tempfree(x)	do { if (istemp(x)) tfree(x); } while (/*CONSTCOND*/0)
#else
void tempfree(Cell *p) {
	if (p->ctype == OCELL && (p->csub < CUNK || p->csub > CFREE)) {
		WARNING("bad csub %d in Cell %d %s",
			p->csub, p->ctype, p->sval);
	}
	if (istemp(p))
		tfree(p);
}
#endif
59 
60 /* do we really need these? */
61 /* #ifdef _NFILE */
62 /* #ifndef FOPEN_MAX */
63 /* #define FOPEN_MAX _NFILE */
64 /* #endif */
65 /* #endif */
66 /*  */
67 /* #ifndef	FOPEN_MAX */
68 /* #define	FOPEN_MAX	40 */	/* max number of open files */
69 /* #endif */
70 /*  */
71 /* #ifndef RAND_MAX */
72 /* #define RAND_MAX	32767 */	/* all that ansi guarantees */
73 /* #endif */
74 
jmp_buf env;		/* set by program(); jump() longjmps here on EXIT */
extern	int	pairstack[];
extern	Awkfloat	srand_seed;

Node	*winner = NULL;	/* root of parse tree */
Cell	*tmps;		/* free temporary cells for execution */

/*
 * Statically allocated singleton cells.  Routines in this file return
 * pointers to these (True, False, jbreak, ...) rather than allocating a
 * result cell.
 */
static Cell	truecell	={ OBOOL, BTRUE, 0, 0, 1.0, NUM, NULL, NULL };
Cell	*True	= &truecell;
static Cell	falsecell	={ OBOOL, BFALSE, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*False	= &falsecell;
static Cell	breakcell	={ OJUMP, JBREAK, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jbreak	= &breakcell;
static Cell	contcell	={ OJUMP, JCONT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jcont	= &contcell;
static Cell	nextcell	={ OJUMP, JNEXT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jnext	= &nextcell;
static Cell	nextfilecell	={ OJUMP, JNEXTFILE, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jnextfile	= &nextfilecell;
static Cell	exitcell	={ OJUMP, JEXIT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jexit	= &exitcell;
static Cell	retcell		={ OJUMP, JRET, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jret	= &retcell;
/* template that gettemp() copies into every temporary cell it hands out */
static Cell	tempcell	={ OCELL, CTEMP, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };

Node	*curnode = NULL;	/* the node being executed, for debugging */
101 
102 /* buffer memory management */
int adjbuf(char **pbuf, int *psiz, int minlen, int quantum, char **pbptr,
	const char *whatrtn)
/* Grow the realloc-managed buffer *pbuf so it holds at least minlen bytes.
 * Nothing happens when *psiz is already >= minlen.
 *
 * pbuf:    address of pointer to buffer being managed
 * psiz:    address of buffer size variable
 * minlen:  minimum length of buffer needed
 * quantum: buffer size quantum (0 means no rounding)
 * pbptr:   address of movable pointer into buffer, or 0 if none
 * whatrtn: name of the calling routine if failure should cause fatal error,
 *          or NULL to have failure reported through the return value
 *
 * return   0 for realloc failure, !=0 for success
 */
{
	char *tbuf;
	int rminlen, boff;

	if (minlen <= *psiz)
		return 1;	/* already big enough */

	rminlen = quantum ? minlen % quantum : 0;
	/* remember the cursor as an offset: the buffer may move */
	boff = pbptr ? *pbptr - *pbuf : 0;
	/* round up to next multiple of quantum, unless that would overflow int */
	if (rminlen != 0 && quantum - rminlen <= INT_MAX - minlen)
		minlen += quantum - rminlen;
	tbuf = (char *) realloc(*pbuf, minlen);
	/* whatrtn may be NULL by contract; never hand NULL to %s */
	DPRINTF("adjbuf %s: %d %d (pbuf=%p, tbuf=%p)\n", NN(whatrtn), *psiz, minlen, (void*)*pbuf, (void*)tbuf);
	if (tbuf == NULL) {
		if (whatrtn)
			FATAL("out of memory in %s", whatrtn);
		return 0;	/* *pbuf is untouched and still valid */
	}
	*pbuf = tbuf;
	*psiz = minlen;
	if (pbptr)
		*pbptr = tbuf + boff;
	return 1;
}
136 
137 void run(Node *a)	/* execution of parse tree starts here */
138 {
139 
140 	stdinit();
141 	execute(a);
142 	closeall();
143 }
144 
145 Cell *execute(Node *u)	/* execute a node of the parse tree */
146 {
147 	Cell *(*proc)(Node **, int);
148 	Cell *x;
149 	Node *a;
150 
151 	if (u == NULL)
152 		return(True);
153 	for (a = u; ; a = a->nnext) {
154 		curnode = a;
155 		if (isvalue(a)) {
156 			x = (Cell *) (a->narg[0]);
157 			if (isfld(x) && !donefld)
158 				fldbld();
159 			else if (isrec(x) && !donerec)
160 				recbld();
161 			return(x);
162 		}
163 		if (notlegal(a->nobj))	/* probably a Cell* but too risky to print */
164 			FATAL("illegal statement");
165 		proc = proctab[a->nobj-FIRSTTOKEN];
166 		x = (*proc)(a->narg, a->nobj);
167 		if (isfld(x) && !donefld)
168 			fldbld();
169 		else if (isrec(x) && !donerec)
170 			recbld();
171 		if (isexpr(a))
172 			return(x);
173 		if (isjump(x))
174 			return(x);
175 		if (a->nnext == NULL)
176 			return(x);
177 		tempfree(x);
178 	}
179 }
180 
181 
182 Cell *program(Node **a, int n)	/* execute an awk program */
183 {				/* a[0] = BEGIN, a[1] = body, a[2] = END */
184 	Cell *x;
185 
186 	if (setjmp(env) != 0)
187 		goto ex;
188 	if (a[0]) {		/* BEGIN */
189 		x = execute(a[0]);
190 		if (isexit(x))
191 			return(True);
192 		if (isjump(x))
193 			FATAL("illegal break, continue, next or nextfile from BEGIN");
194 		tempfree(x);
195 	}
196 	if (a[1] || a[2])
197 		while (getrec(&record, &recsize, true) > 0) {
198 			x = execute(a[1]);
199 			if (isexit(x))
200 				break;
201 			tempfree(x);
202 		}
203   ex:
204 	if (setjmp(env) != 0)	/* handles exit within END */
205 		goto ex1;
206 	if (a[2]) {		/* END */
207 		x = execute(a[2]);
208 		if (isbreak(x) || isnext(x) || iscont(x))
209 			FATAL("illegal break, continue, next or nextfile from END");
210 		tempfree(x);
211 	}
212   ex1:
213 	return(True);
214 }
215 
struct Frame {	/* stack frame for awk function calls */
	int nargs;	/* number of arguments in this call; call() stores the defined parameter count */
	Cell *fcncell;	/* pointer to Cell for function */
	Cell **args;	/* pointer to array of arguments after execute */
	Cell *retval;	/* return value; filled in by jump() on RETURN */
};

#define	NARGS	50	/* max args in a call; call() rejects ncall+ndef above this */

/* The frame stack is allocated lazily by call() and grown 100 entries at a time. */
struct Frame *frame = NULL;	/* base of stack frames; dynamically allocated */
int	nframe = 0;		/* number of frames allocated */
struct Frame *frp = NULL;	/* frame pointer. bottom level unused */
228 
/*
 * Function call.  a[0] evaluates to the function's Cell; a[1] is the list
 * of argument expressions, linked through nnext.
 *
 * Arguments are evaluated in the caller's frame.  Arrays are passed by
 * reference, everything else as a copycell() copy, and parameters that
 * were not supplied get fresh empty cells.  A new frame is then pushed,
 * the body is executed, the argument cells are released, and the frame's
 * retval cell is returned.
 */
Cell *call(Node **a, int n)	/* function call.  very kludgy and fragile */
{
	/* template for parameters the caller did not supply */
	static const Cell newcopycell = { OCELL, CCOPY, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };
	int i, ncall, ndef;
	int freed = 0; /* handles potential double freeing when fcn & param share a tempcell */
	Node *x;
	Cell *args[NARGS], *oargs[NARGS];	/* BUG: fixed size arrays */
	Cell *y, *z, *fcn;
	char *s;

	fcn = execute(a[0]);	/* the function itself */
	s = fcn->nval;
	if (!isfcn(fcn))
		FATAL("calling undefined function %s", s);
	if (frame == NULL) {
		/* first call ever: allocate the initial block of frames */
		frp = frame = (struct Frame *) calloc(nframe += 100, sizeof(*frame));
		if (frame == NULL)
			FATAL("out of space for stack frames calling %s", s);
	}
	for (ncall = 0, x = a[1]; x != NULL; x = x->nnext)	/* args in call */
		ncall++;
	ndef = (int) fcn->fval;			/* args in defn */
	DPRINTF("calling %s, %d args (%d in defn), frp=%d\n", s, ncall, ndef, (int) (frp-frame));
	if (ncall > ndef)
		WARNING("function %s called with %d args, uses only %d",
			s, ncall, ndef);
	if (ncall + ndef > NARGS)
		FATAL("function %s has %d arguments, limit %d", s, ncall+ndef, NARGS);
	for (i = 0, x = a[1]; x != NULL; i++, x = x->nnext) {	/* get call args */
		DPRINTF("evaluate args[%d], frp=%d:\n", i, (int) (frp-frame));
		y = execute(x);
		oargs[i] = y;	/* remember the caller's cell for the array write-back below */
		DPRINTF("args[%d]: %s %f <%s>, t=%o\n",
			i, NN(y->nval), y->fval, isarr(y) ? "(array)" : NN(y->sval), y->tval);
		if (isfcn(y))
			FATAL("can't use function %s as argument in %s", y->nval, s);
		if (isarr(y))
			args[i] = y;	/* arrays by ref */
		else
			args[i] = copycell(y);
		tempfree(y);
	}
	for ( ; i < ndef; i++) {	/* add null args for ones not provided */
		args[i] = gettemp();
		*args[i] = newcopycell;
	}
	/* the frame is pushed only after the arguments were evaluated in the old frame */
	frp++;	/* now ok to up frame */
	if (frp >= frame + nframe) {
		int dfp = frp - frame;	/* old index */
		frame = (struct Frame *) realloc(frame, (nframe += 100) * sizeof(*frame));
		if (frame == NULL)
			FATAL("out of space for stack frames in %s", s);
		frp = frame + dfp;	/* re-derive frp: the stack may have moved */
	}
	frp->fcncell = fcn;
	frp->args = args;
	frp->nargs = ndef;	/* number defined with (excess are locals) */
	frp->retval = gettemp();

	DPRINTF("start exec of %s, frp=%d\n", s, (int) (frp-frame));
	y = execute((Node *)(fcn->sval));	/* execute body */
	DPRINTF("finished exec of %s, frp=%d\n", s, (int) (frp-frame));

	/* release the argument cells */
	for (i = 0; i < ndef; i++) {
		Cell *t = frp->args[i];
		if (isarr(t)) {
			if (t->csub == CCOPY) {
				/* a copied scalar that the body turned into an array */
				if (i >= ncall) {
					/* local only: discard the table and the cell */
					freesymtab(t);
					t->csub = CTEMP;
					tempfree(t);
				} else {
					/* caller supplied it: move the table into the caller's cell */
					oargs[i]->tval = t->tval;
					oargs[i]->tval &= ~(STR|NUM|DONTFREE);
					oargs[i]->sval = t->sval;
					tempfree(t);
				}
			}
		} else if (t != y) {	/* kludge to prevent freeing twice */
			t->csub = CTEMP;
			tempfree(t);
		} else if (t == y && t->csub == CCOPY) {
			/* the body's result is this argument cell; free it once here */
			t->csub = CTEMP;
			tempfree(t);
			freed = 1;
		}
	}
	tempfree(fcn);
	/* NOTE(review): this early return leaves frp un-popped — confirm this is intended */
	if (isexit(y) || isnext(y))
		return y;
	if (freed == 0) {
		tempfree(y);	/* don't free twice! */
	}
	z = frp->retval;			/* return value */
	DPRINTF("%s returns %g |%s| %o\n", s, getfval(z), getsval(z), z->tval);
	frp--;
	return(z);
}
327 
328 Cell *copycell(Cell *x)	/* make a copy of a cell in a temp */
329 {
330 	Cell *y;
331 
332 	/* copy is not constant or field */
333 
334 	y = gettemp();
335 	y->tval = x->tval & ~(CON|FLD|REC);
336 	y->csub = CCOPY;	/* prevents freeing until call is over */
337 	y->nval = x->nval;	/* BUG? */
338 	if (isstr(x) /* || x->ctype == OCELL */) {
339 		y->sval = tostring(x->sval);
340 		y->tval &= ~DONTFREE;
341 	} else
342 		y->tval |= DONTFREE;
343 	y->fval = x->fval;
344 	return y;
345 }
346 
347 Cell *arg(Node **a, int n)	/* nth argument of a function */
348 {
349 
350 	n = ptoi(a[0]);	/* argument number, counting from 0 */
351 	DPRINTF("arg(%d), frp->nargs=%d\n", n, frp->nargs);
352 	if (n+1 > frp->nargs)
353 		FATAL("argument #%d of function %s was not supplied",
354 			n+1, frp->fcncell->nval);
355 	return frp->args[n];
356 }
357 
358 Cell *jump(Node **a, int n)	/* break, continue, next, nextfile, return */
359 {
360 	Cell *y;
361 
362 	switch (n) {
363 	case EXIT:
364 		if (a[0] != NULL) {
365 			y = execute(a[0]);
366 			errorflag = (int) getfval(y);
367 			tempfree(y);
368 		}
369 		longjmp(env, 1);
370 	case RETURN:
371 		if (a[0] != NULL) {
372 			y = execute(a[0]);
373 			if ((y->tval & (STR|NUM)) == (STR|NUM)) {
374 				setsval(frp->retval, getsval(y));
375 				frp->retval->fval = getfval(y);
376 				frp->retval->tval |= NUM;
377 			}
378 			else if (y->tval & STR)
379 				setsval(frp->retval, getsval(y));
380 			else if (y->tval & NUM)
381 				setfval(frp->retval, getfval(y));
382 			else		/* can't happen */
383 				FATAL("bad type variable %d", y->tval);
384 			tempfree(y);
385 		}
386 		return(jret);
387 	case NEXT:
388 		return(jnext);
389 	case NEXTFILE:
390 		nextfile();
391 		return(jnextfile);
392 	case BREAK:
393 		return(jbreak);
394 	case CONTINUE:
395 		return(jcont);
396 	default:	/* can't happen */
397 		FATAL("illegal jump type %d", n);
398 	}
399 	return 0;	/* not reached */
400 }
401 
402 Cell *awkgetline(Node **a, int n)	/* get next line from specific input */
403 {		/* a[0] is variable, a[1] is operator, a[2] is filename */
404 	Cell *r, *x;
405 	extern Cell **fldtab;
406 	FILE *fp;
407 	char *buf;
408 	int bufsize = recsize;
409 	int mode;
410 	bool newflag;
411 	double result;
412 
413 	if ((buf = (char *) malloc(bufsize)) == NULL)
414 		FATAL("out of memory in getline");
415 
416 	fflush(stdout);	/* in case someone is waiting for a prompt */
417 	r = gettemp();
418 	if (a[1] != NULL) {		/* getline < file */
419 		x = execute(a[2]);		/* filename */
420 		mode = ptoi(a[1]);
421 		if (mode == '|')		/* input pipe */
422 			mode = LE;	/* arbitrary flag */
423 		fp = openfile(mode, getsval(x), &newflag);
424 		tempfree(x);
425 		if (fp == NULL)
426 			n = -1;
427 		else
428 			n = readrec(&buf, &bufsize, fp, newflag);
429 		if (n <= 0) {
430 			;
431 		} else if (a[0] != NULL) {	/* getline var <file */
432 			x = execute(a[0]);
433 			setsval(x, buf);
434 			if (is_number(x->sval, & result)) {
435 				x->fval = result;
436 				x->tval |= NUM;
437 			}
438 			tempfree(x);
439 		} else {			/* getline <file */
440 			setsval(fldtab[0], buf);
441 			if (is_number(fldtab[0]->sval, & result)) {
442 				fldtab[0]->fval = result;
443 				fldtab[0]->tval |= NUM;
444 			}
445 		}
446 	} else {			/* bare getline; use current input */
447 		if (a[0] == NULL)	/* getline */
448 			n = getrec(&record, &recsize, true);
449 		else {			/* getline var */
450 			n = getrec(&buf, &bufsize, false);
451 			if (n > 0) {
452 				x = execute(a[0]);
453 				setsval(x, buf);
454 				if (is_number(x->sval, & result)) {
455 					x->fval = result;
456 					x->tval |= NUM;
457 				}
458 				tempfree(x);
459 			}
460 		}
461 	}
462 	setfval(r, (Awkfloat) n);
463 	free(buf);
464 	return r;
465 }
466 
467 Cell *getnf(Node **a, int n)	/* get NF */
468 {
469 	if (!donefld)
470 		fldbld();
471 	return (Cell *) a[0];
472 }
473 
474 static char *
475 makearraystring(Node *p, const char *func)
476 {
477 	char *buf;
478 	int bufsz = recsize;
479 	size_t blen;
480 
481 	if ((buf = (char *) malloc(bufsz)) == NULL) {
482 		FATAL("%s: out of memory", func);
483 	}
484 
485 	blen = 0;
486 	buf[blen] = '\0';
487 
488 	for (; p; p = p->nnext) {
489 		Cell *x = execute(p);	/* expr */
490 		char *s = getsval(x);
491 		size_t seplen = strlen(getsval(subseploc));
492 		size_t nsub = p->nnext ? seplen : 0;
493 		size_t slen = strlen(s);
494 		size_t tlen = blen + slen + nsub;
495 
496 		if (!adjbuf(&buf, &bufsz, tlen + 1, recsize, 0, func)) {
497 			FATAL("%s: out of memory %s[%s...]",
498 			    func, x->nval, buf);
499 		}
500 		memcpy(buf + blen, s, slen);
501 		if (nsub) {
502 			memcpy(buf + blen + slen, *SUBSEP, nsub);
503 		}
504 		buf[tlen] = '\0';
505 		blen = tlen;
506 		tempfree(x);
507 	}
508 	return buf;
509 }
510 
511 Cell *array(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
512 {
513 	Cell *x, *z;
514 	char *buf;
515 
516 	x = execute(a[0]);	/* Cell* for symbol table */
517 	buf = makearraystring(a[1], __func__);
518 	if (!isarr(x)) {
519 		DPRINTF("making %s into an array\n", NN(x->nval));
520 		if (freeable(x))
521 			xfree(x->sval);
522 		x->tval &= ~(STR|NUM|DONTFREE);
523 		x->tval |= ARR;
524 		x->sval = (char *) makesymtab(NSYMTAB);
525 	}
526 	z = setsymtab(buf, "", 0.0, STR|NUM, (Array *) x->sval);
527 	z->ctype = OCELL;
528 	z->csub = CVAR;
529 	tempfree(x);
530 	free(buf);
531 	return(z);
532 }
533 
534 Cell *awkdelete(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
535 {
536 	Cell *x;
537 
538 	x = execute(a[0]);	/* Cell* for symbol table */
539 	if (x == symtabloc) {
540 		FATAL("cannot delete SYMTAB or its elements");
541 	}
542 	if (!isarr(x))
543 		return True;
544 	if (a[1] == NULL) {	/* delete the elements, not the table */
545 		freesymtab(x);
546 		x->tval &= ~STR;
547 		x->tval |= ARR;
548 		x->sval = (char *) makesymtab(NSYMTAB);
549 	} else {
550 		char *buf = makearraystring(a[1], __func__);
551 		freeelem(x, buf);
552 		free(buf);
553 	}
554 	tempfree(x);
555 	return True;
556 }
557 
558 Cell *intest(Node **a, int n)	/* a[0] is index (list), a[1] is symtab */
559 {
560 	Cell *ap, *k;
561 	char *buf;
562 
563 	ap = execute(a[1]);	/* array name */
564 	if (!isarr(ap)) {
565 		DPRINTF("making %s into an array\n", ap->nval);
566 		if (freeable(ap))
567 			xfree(ap->sval);
568 		ap->tval &= ~(STR|NUM|DONTFREE);
569 		ap->tval |= ARR;
570 		ap->sval = (char *) makesymtab(NSYMTAB);
571 	}
572 	buf = makearraystring(a[0], __func__);
573 	k = lookup(buf, (Array *) ap->sval);
574 	tempfree(ap);
575 	free(buf);
576 	if (k == NULL)
577 		return(False);
578 	else
579 		return(True);
580 }
581 
582 
583 /* ======== utf-8 code ========== */
584 
585 /*
586  * Awk strings can contain ascii, random 8-bit items (eg Latin-1),
587  * or utf-8.  u8_isutf tests whether a string starts with a valid
588  * utf-8 sequence, and returns 0 if not (e.g., high bit set).
589  * u8_nextlen returns length of next valid sequence, which is
590  * 1 for ascii, 2..4 for utf-8, or 1 for high bit non-utf.
591  * u8_strlen returns length of string in valid utf-8 sequences
592  * and/or high-bit bytes.  Conversion functions go between byte
593  * number and character number.
594  *
595  * In theory, this behaves the same as before for non-utf8 bytes.
596  *
597  * Limited checking! This is a potential security hole.
598  */
599 
600 /* is s the beginning of a valid utf-8 string? */
601 /* return length 1..4 if yes, 0 if no */
602 int u8_isutf(const char *s)
603 {
604 	int n, ret;
605 	unsigned char c;
606 
607 	c = s[0];
608 	if (c < 128 || awk_mb_cur_max == 1)
609 		return 1; /* what if it's 0? */
610 
611 	n = strlen(s);
612 	if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
613 		ret = 2; /* 110xxxxx 10xxxxxx */
614 	} else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
615 			 && (s[2] & 0xC0) == 0x80) {
616 		ret = 3; /* 1110xxxx 10xxxxxx 10xxxxxx */
617 	} else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
618 			 && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
619 		ret = 4; /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
620 	} else {
621 		ret = 0;
622 	}
623 	return ret;
624 }
625 
626 /* Convert (prefix of) utf8 string to utf-32 rune. */
627 /* Sets *rune to the value, returns the length. */
628 /* No error checking: watch out. */
629 int u8_rune(int *rune, const char *s)
630 {
631 	int n, ret;
632 	unsigned char c;
633 
634 	c = s[0];
635 	if (c < 128 || awk_mb_cur_max == 1) {
636 		*rune = c;
637 		return 1;
638 	}
639 
640 	n = strlen(s);
641 	if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
642 		*rune = ((c & 0x1F) << 6) | (s[1] & 0x3F); /* 110xxxxx 10xxxxxx */
643 		ret = 2;
644 	} else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
645 			  && (s[2] & 0xC0) == 0x80) {
646 		*rune = ((c & 0xF) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
647 			/* 1110xxxx 10xxxxxx 10xxxxxx */
648 		ret = 3;
649 	} else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
650 			  && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
651 		*rune = ((c & 0x7) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
652 			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
653 		ret = 4;
654 	} else {
655 		*rune = c;
656 		ret = 1;
657 	}
658 	return ret; /* returns one byte if sequence doesn't look like utf */
659 }
660 
661 /* return length of next sequence: 1 for ascii or random, 2..4 for valid utf8 */
/*
 * Length in bytes of the next sequence in s: whatever u8_isutf() reports,
 * except that its "not utf-8" answer of 0 is turned into a step of 1.
 */
int u8_nextlen(const char *s)
{
	const int seqlen = u8_isutf(s);

	return seqlen != 0 ? seqlen : 1;
}
671 
672 /* return number of utf characters or single non-utf bytes */
/*
 * Return the number of utf characters or single non-utf bytes in s.
 *
 * u8_nextlen() already answers 1 for ascii bytes and for single-byte
 * locales, so no separate fast path is needed here.  The sanity check
 * runs before the cursor advances, so a step that would overshoot the end
 * of the string is actually detected.
 */
int u8_strlen(const char *s)
{
	int i, len, n, totlen;

	n = strlen(s);
	totlen = 0;
	for (i = 0; i < n; i += len) {
		len = u8_nextlen(&s[i]);
		if (len > n - i)
			FATAL("bad utf count [%s] n=%d i=%d\n", s, n, i);
		totlen++;
	}
	return totlen;
}
693 
694 /* convert utf-8 char number in a string to its byte offset */
/*
 * Convert utf-8 char number in a string to its byte offset.
 *
 * Stops at the terminating NUL: when charnum exceeds the number of
 * characters in s, the result is the byte length of s instead of an
 * offset computed by reading past the end of the string.
 */
int u8_char2byte(const char *s, int charnum)
{
	int n;
	int bytenum = 0;

	while (charnum > 0 && *s != '\0') {
		n = u8_nextlen(s);
		s += n;
		bytenum += n;
		charnum--;
	}
	return bytenum;
}
708 
709 /* convert byte offset in s to utf-8 char number that starts there */
/*
 * Convert byte offset in s to the utf-8 char number that starts there.
 * Character numbers are 1-origin; a negative bytenum gives 0, which means
 * no match.  Returns -1 when bytenum lies beyond the end of s.
 */
int u8_byte2char(const char *s, int bytenum)
{
	int pos = 0;
	int count = 0;
	int total = strlen(s);

	if (bytenum > total)
		return -1; /* ??? */

	while (pos <= bytenum) {
		pos += u8_nextlen(s + pos);
		count++;
	}
	return count;
}
726 
/* runetochar() adapted from rune.c in the Plan 9 distribution */
728 
/* Bit layout constants for utf-8 encoding. */
enum
{
	Runeerror = 128, /* from somewhere else */
	Runemax = 0x10FFFF,

	/* number of payload bits in the lead byte of an N-byte sequence */
	Bit1    = 7,
	Bitx    = 6,	/* payload bits in a continuation byte */
	Bit2    = 5,
	Bit3    = 4,
	Bit4    = 3,
	Bit5    = 2,

	/* tag bits of lead (T1..T5) and continuation (Tx) bytes */
	T1      = ((1<<(Bit1+1))-1) ^ 0xFF,     /* 0000 0000 */
	Tx      = ((1<<(Bitx+1))-1) ^ 0xFF,     /* 1000 0000 */
	T2      = ((1<<(Bit2+1))-1) ^ 0xFF,     /* 1100 0000 */
	T3      = ((1<<(Bit3+1))-1) ^ 0xFF,     /* 1110 0000 */
	T4      = ((1<<(Bit4+1))-1) ^ 0xFF,     /* 1111 0000 */
	T5      = ((1<<(Bit5+1))-1) ^ 0xFF,     /* 1111 1000 */

	/* largest rune that fits in an N-byte sequence */
	Rune1   = (1<<(Bit1+0*Bitx))-1,	 	/* 0000 0000 0000 0000 0111 1111 */
	Rune2   = (1<<(Bit2+1*Bitx))-1,	 	/* 0000 0000 0000 0111 1111 1111 */
	Rune3   = (1<<(Bit3+2*Bitx))-1,	 	/* 0000 0000 1111 1111 1111 1111 */
	Rune4   = (1<<(Bit4+3*Bitx))-1,	 	/* 0001 1111 1111 1111 1111 1111 */

	Maskx   = (1<<Bitx)-1,		  	/* 0011 1111 */
	Testx   = Maskx ^ 0xFF,		 	/* 1100 0000 */

};

/*
 * Encode rune c as utf-8 into str (room for 4 bytes is needed; no NUL is
 * appended) and return the number of bytes written.
 *
 *   c <= 0x7F (including negative c)  1 byte, stored as is
 *   0x80    .. 0x7FF                  2 bytes  T2 Tx
 *   0x800   .. 0xFFFF                 3 bytes  T3 Tx Tx
 *   0x10000 .. Runemax                4 bytes  T4 Tx Tx Tx
 *   c > Runemax                       replaced by Runeerror, which is then
 *                                     written in the 3-byte form
 */
int runetochar(char *str, int c)
{
	int lead, nbytes, i;

	if (c <= Rune1) {
		str[0] = c;
		return 1;
	}

	if (c <= Rune2) {
		lead = T2;
		nbytes = 2;
	} else {
		if (c > Runemax)
			c = Runeerror;
		if (c <= Rune3) {
			lead = T3;
			nbytes = 3;
		} else {
			lead = T4;
			nbytes = 4;
		}
	}

	/* lead byte carries the top bits; each later byte the next 6 bits down */
	str[0] = lead | (c >> (nbytes - 1)*Bitx);
	for (i = 1; i < nbytes; i++)
		str[i] = Tx | ((c >> (nbytes - 1 - i)*Bitx) & Maskx);
	return nbytes;
}
790 
791 
792 /* ========== end of utf8 code =========== */
793 
794 
795 
796 Cell *matchop(Node **a, int n)	/* ~ and match() */
797 {
798 	Cell *x, *y, *z;
799 	char *s, *t;
800 	int i;
801 	int cstart, cpatlen, len;
802 	fa *pfa;
803 	int (*mf)(fa *, const char *) = match, mode = 0;
804 
805 	if (n == MATCHFCN) {
806 		mf = pmatch;
807 		mode = 1;
808 	}
809 	x = execute(a[1]);	/* a[1] = target text */
810 	s = getsval(x);
811 	if (a[0] == NULL)	/* a[1] == 0: already-compiled reg expr */
812 		i = (*mf)((fa *) a[2], s);
813 	else {
814 		y = execute(a[2]);	/* a[2] = regular expr */
815 		t = getsval(y);
816 		pfa = makedfa(t, mode);
817 		i = (*mf)(pfa, s);
818 		tempfree(y);
819 	}
820 	z = x;
821 	if (n == MATCHFCN) {
822 		int start = patbeg - s + 1; /* origin 1 */
823 		if (patlen < 0) {
824 			start = 0; /* not found */
825 		} else {
826 			cstart = u8_byte2char(s, start-1);
827 			cpatlen = 0;
828 			for (i = 0; i < patlen; i += len) {
829 				len = u8_nextlen(patbeg+i);
830 				cpatlen++;
831 			}
832 
833 			start = cstart;
834 			patlen = cpatlen;
835 		}
836 
837 		setfval(rstartloc, (Awkfloat) start);
838 		setfval(rlengthloc, (Awkfloat) patlen);
839 		x = gettemp();
840 		x->tval = NUM;
841 		x->fval = start;
842 	} else if ((n == MATCH && i == 1) || (n == NOTMATCH && i == 0))
843 		x = True;
844 	else
845 		x = False;
846 
847 	tempfree(z);
848 	return x;
849 }
850 
851 
852 Cell *boolop(Node **a, int n)	/* a[0] || a[1], a[0] && a[1], !a[0] */
853 {
854 	Cell *x, *y;
855 	int i;
856 
857 	x = execute(a[0]);
858 	i = istrue(x);
859 	tempfree(x);
860 	switch (n) {
861 	case BOR:
862 		if (i) return(True);
863 		y = execute(a[1]);
864 		i = istrue(y);
865 		tempfree(y);
866 		if (i) return(True);
867 		else return(False);
868 	case AND:
869 		if ( !i ) return(False);
870 		y = execute(a[1]);
871 		i = istrue(y);
872 		tempfree(y);
873 		if (i) return(True);
874 		else return(False);
875 	case NOT:
876 		if (i) return(False);
877 		else return(True);
878 	default:	/* can't happen */
879 		FATAL("unknown boolean operator %d", n);
880 	}
881 	return 0;	/*NOTREACHED*/
882 }
883 
884 Cell *relop(Node **a, int n)	/* a[0 < a[1], etc. */
885 {
886 	int i;
887 	Cell *x, *y;
888 	Awkfloat j;
889 	bool x_is_nan, y_is_nan;
890 
891 	x = execute(a[0]);
892 	y = execute(a[1]);
893 	x_is_nan = isnan(x->fval);
894 	y_is_nan = isnan(y->fval);
895 	if (x->tval&NUM && y->tval&NUM) {
896 		if ((x_is_nan || y_is_nan) && n != NE)
897 			return(False);
898 		j = x->fval - y->fval;
899 		i = j<0? -1: (j>0? 1: 0);
900 	} else {
901 		i = strcmp(getsval(x), getsval(y));
902 	}
903 	tempfree(x);
904 	tempfree(y);
905 	switch (n) {
906 	case LT:	if (i<0) return(True);
907 			else return(False);
908 	case LE:	if (i<=0) return(True);
909 			else return(False);
910 	case NE:	if (x_is_nan && y_is_nan) return(True);
911 			else if (i!=0) return(True);
912 			else return(False);
913 	case EQ:	if (i == 0) return(True);
914 			else return(False);
915 	case GE:	if (i>=0) return(True);
916 			else return(False);
917 	case GT:	if (i>0) return(True);
918 			else return(False);
919 	default:	/* can't happen */
920 		FATAL("unknown relational operator %d", n);
921 	}
922 	return 0;	/*NOTREACHED*/
923 }
924 
925 void tfree(Cell *a)	/* free a tempcell */
926 {
927 	if (freeable(a)) {
928 		DPRINTF("freeing %s %s %o\n", NN(a->nval), NN(a->sval), a->tval);
929 		xfree(a->sval);
930 	}
931 	if (a == tmps)
932 		FATAL("tempcell list is curdled");
933 	a->cnext = tmps;
934 	tmps = a;
935 }
936 
937 Cell *gettemp(void)	/* get a tempcell */
938 {	int i;
939 	Cell *x;
940 
941 	if (!tmps) {
942 		tmps = (Cell *) calloc(100, sizeof(*tmps));
943 		if (!tmps)
944 			FATAL("out of space for temporaries");
945 		for (i = 1; i < 100; i++)
946 			tmps[i-1].cnext = &tmps[i];
947 		tmps[i-1].cnext = NULL;
948 	}
949 	x = tmps;
950 	tmps = x->cnext;
951 	*x = tempcell;
952 	return(x);
953 }
954 
955 Cell *indirect(Node **a, int n)	/* $( a[0] ) */
956 {
957 	Awkfloat val;
958 	Cell *x;
959 	int m;
960 	char *s;
961 
962 	x = execute(a[0]);
963 	val = getfval(x);	/* freebsd: defend against super large field numbers */
964 	if ((Awkfloat)INT_MAX < val)
965 		FATAL("trying to access out of range field %s", x->nval);
966 	m = (int) val;
967 	if (m == 0 && !is_number(s = getsval(x), NULL))	/* suspicion! */
968 		FATAL("illegal field $(%s), name \"%s\"", s, x->nval);
969 		/* BUG: can x->nval ever be null??? */
970 	tempfree(x);
971 	x = fieldadr(m);
972 	x->ctype = OCELL;	/* BUG?  why are these needed? */
973 	x->csub = CFLD;
974 	return(x);
975 }
976 
977 Cell *substr(Node **a, int nnn)		/* substr(a[0], a[1], a[2]) */
978 {
979 	int k, m, n;
980 	int mb, nb;
981 	char *s;
982 	int temp;
983 	Cell *x, *y, *z = NULL;
984 
985 	x = execute(a[0]);
986 	y = execute(a[1]);
987 	if (a[2] != NULL)
988 		z = execute(a[2]);
989 	s = getsval(x);
990 	k = u8_strlen(s) + 1;
991 	if (k <= 1) {
992 		tempfree(x);
993 		tempfree(y);
994 		if (a[2] != NULL) {
995 			tempfree(z);
996 		}
997 		x = gettemp();
998 		setsval(x, "");
999 		return(x);
1000 	}
1001 	m = (int) getfval(y);
1002 	if (m <= 0)
1003 		m = 1;
1004 	else if (m > k)
1005 		m = k;
1006 	tempfree(y);
1007 	if (a[2] != NULL) {
1008 		n = (int) getfval(z);
1009 		tempfree(z);
1010 	} else
1011 		n = k - 1;
1012 	if (n < 0)
1013 		n = 0;
1014 	else if (n > k - m)
1015 		n = k - m;
1016 	/* m is start, n is length from there */
1017 	DPRINTF("substr: m=%d, n=%d, s=%s\n", m, n, s);
1018 	y = gettemp();
1019 	mb = u8_char2byte(s, m-1); /* byte offset of start char in s */
1020 	nb = u8_char2byte(s, m-1+n);  /* byte offset of end+1 char in s */
1021 
1022 	temp = s[nb];	/* with thanks to John Linderman */
1023 	s[nb] = '\0';
1024 	setsval(y, s + mb);
1025 	s[nb] = temp;
1026 	tempfree(x);
1027 	return(y);
1028 }
1029 
1030 Cell *sindex(Node **a, int nnn)		/* index(a[0], a[1]) */
1031 {
1032 	Cell *x, *y, *z;
1033 	char *s1, *s2, *p1, *p2, *q;
1034 	Awkfloat v = 0.0;
1035 
1036 	x = execute(a[0]);
1037 	s1 = getsval(x);
1038 	y = execute(a[1]);
1039 	s2 = getsval(y);
1040 
1041 	z = gettemp();
1042 	for (p1 = s1; *p1 != '\0'; p1++) {
1043 		for (q = p1, p2 = s2; *p2 != '\0' && *q == *p2; q++, p2++)
1044 			continue;
1045 		if (*p2 == '\0') {
1046 			/* v = (Awkfloat) (p1 - s1 + 1);	 origin 1 */
1047 
1048 		   /* should be a function: used in match() as well */
1049 			int i, len;
1050 			v = 0;
1051 			for (i = 0; i < p1-s1+1; i += len) {
1052 				len = u8_nextlen(s1+i);
1053 				v++;
1054 			}
1055 			break;
1056 		}
1057 	}
1058 	tempfree(x);
1059 	tempfree(y);
1060 	setfval(z, v);
1061 	return(z);
1062 }
1063 
/*
 * Return 1 if s contains any utf-8 (2 bytes or more) character, else 0.
 */
int has_utf8(char *s)
{
	while (*s != '\0') {
		int step = u8_nextlen(s);

		if (step > 1)
			return 1;
		s += step;
	}
	return 0;
}
1075 
#define	MAXNUMSIZE	50	/* least headroom, in bytes, that format() reserves in its buffers per conversion */
1077 
1078 int format(char **pbuf, int *pbufsize, const char *s, Node *a)	/* printf-like conversions */
1079 {
1080 	char *fmt;
1081 	char *p, *t;
1082 	const char *os;
1083 	Cell *x;
1084 	int flag = 0, n;
1085 	int fmtwd; /* format width */
1086 	int fmtsz = recsize;
1087 	char *buf = *pbuf;
1088 	int bufsize = *pbufsize;
1089 #define FMTSZ(a)   (fmtsz - ((a) - fmt))
1090 #define BUFSZ(a)   (bufsize - ((a) - buf))
1091 
1092 	static bool first = true;
1093 	static bool have_a_format = false;
1094 
1095 	if (first) {
1096 		char xbuf[100];
1097 
1098 		snprintf(xbuf, sizeof(xbuf), "%a", 42.0);
1099 		have_a_format = (strcmp(xbuf, "0x1.5p+5") == 0);
1100 		first = false;
1101 	}
1102 
1103 	os = s;
1104 	p = buf;
1105 	if ((fmt = (char *) malloc(fmtsz)) == NULL)
1106 		FATAL("out of memory in format()");
1107 	while (*s) {
1108 		adjbuf(&buf, &bufsize, MAXNUMSIZE+1+p-buf, recsize, &p, "format1");
1109 		if (*s != '%') {
1110 			*p++ = *s++;
1111 			continue;
1112 		}
1113 		if (*(s+1) == '%') {
1114 			*p++ = '%';
1115 			s += 2;
1116 			continue;
1117 		}
1118 		fmtwd = atoi(s+1);
1119 		if (fmtwd < 0)
1120 			fmtwd = -fmtwd;
1121 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format2");
1122 		for (t = fmt; (*t++ = *s) != '\0'; s++) {
1123 			if (!adjbuf(&fmt, &fmtsz, MAXNUMSIZE+1+t-fmt, recsize, &t, "format3"))
1124 				FATAL("format item %.30s... ran format() out of memory", os);
1125 			/* Ignore size specifiers */
1126 			if (strchr("hjLlqtz", *s) != NULL) {	/* the ansi panoply */
1127 				t--;
1128 				continue;
1129 			}
1130 			if (isalpha((uschar)*s))
1131 				break;
1132 			if (*s == '$') {
1133 				FATAL("'$' not permitted in awk formats");
1134 			}
1135 			if (*s == '*') {
1136 				if (a == NULL) {
1137 					FATAL("not enough args in printf(%s)", os);
1138 				}
1139 				x = execute(a);
1140 				a = a->nnext;
1141 				snprintf(t - 1, FMTSZ(t - 1),
1142 				    "%d", fmtwd=(int) getfval(x));
1143 				if (fmtwd < 0)
1144 					fmtwd = -fmtwd;
1145 				adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format");
1146 				t = fmt + strlen(fmt);
1147 				tempfree(x);
1148 			}
1149 		}
1150 		*t = '\0';
1151 		if (fmtwd < 0)
1152 			fmtwd = -fmtwd;
1153 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format4");
1154 		switch (*s) {
1155 		case 'a': case 'A':
1156 			if (have_a_format)
1157 				flag = *s;
1158 			else
1159 				flag = 'f';
1160 			break;
1161 		case 'f': case 'e': case 'g': case 'E': case 'G':
1162 			flag = 'f';
1163 			break;
1164 		case 'd': case 'i': case 'o': case 'x': case 'X': case 'u':
1165 			flag = (*s == 'd' || *s == 'i') ? 'd' : 'u';
1166 			*(t-1) = 'j';
1167 			*t = *s;
1168 			*++t = '\0';
1169 			break;
1170 		case 's':
1171 			flag = 's';
1172 			break;
1173 		case 'c':
1174 			flag = 'c';
1175 			break;
1176 		default:
1177 			WARNING("weird printf conversion %s", fmt);
1178 			flag = '?';
1179 			break;
1180 		}
1181 		if (a == NULL)
1182 			FATAL("not enough args in printf(%s)", os);
1183 		x = execute(a);
1184 		a = a->nnext;
1185 		n = MAXNUMSIZE;
1186 		if (fmtwd > n)
1187 			n = fmtwd;
1188 		adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format5");
1189 		switch (flag) {
1190 		case '?':
1191 			snprintf(p, BUFSZ(p), "%s", fmt);	/* unknown, so dump it too */
1192 			t = getsval(x);
1193 			n = strlen(t);
1194 			if (fmtwd > n)
1195 				n = fmtwd;
1196 			adjbuf(&buf, &bufsize, 1+strlen(p)+n+p-buf, recsize, &p, "format6");
1197 			p += strlen(p);
1198 			snprintf(p, BUFSZ(p), "%s", t);
1199 			break;
1200 		case 'a':
1201 		case 'A':
1202 		case 'f':	snprintf(p, BUFSZ(p), fmt, getfval(x)); break;
1203 		case 'd':	snprintf(p, BUFSZ(p), fmt, (intmax_t) getfval(x)); break;
1204 		case 'u':	snprintf(p, BUFSZ(p), fmt, (uintmax_t) getfval(x)); break;
1205 
1206 		case 's': {
1207 			t = getsval(x);
1208 			n = strlen(t);
1209 			/* if simple format or no utf-8 in the string, sprintf works */
1210 			if (!has_utf8(t) || strcmp(fmt,"%s") == 0) {
1211 				if (fmtwd > n)
1212 					n = fmtwd;
1213 				if (!adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format7"))
1214 					FATAL("huge string/format (%d chars) in printf %.30s..." \
1215 						" ran format() out of memory", n, t);
1216 				snprintf(p, BUFSZ(p), fmt, t);
1217 				break;
1218 			}
1219 
1220 			/* get here if string has utf-8 chars and fmt is not plain %s */
1221 			/* "%-w.ps", where -, w and .p are all optional */
1222 			/* '0' before the w is a flag character */
1223 			/* fmt points at % */
1224 			int ljust = 0, wid = 0, prec = n, pad = 0;
1225 			char *f = fmt+1;
1226 			if (f[0] == '-') {
1227 				ljust = 1;
1228 				f++;
1229 			}
1230 			// flags '0' and '+' are recognized but skipped
1231 			if (f[0] == '0') {
1232 				f++;
1233 				if (f[0] == '+')
1234 					f++;
1235 			}
1236 			if (f[0] == '+') {
1237 				f++;
1238 				if (f[0] == '0')
1239 					f++;
1240 			}
1241 			if (isdigit(f[0])) { /* there is a wid */
1242 				wid = strtol(f, &f, 10);
1243 			}
1244 			if (f[0] == '.') { /* there is a .prec */
1245 				prec = strtol(++f, &f, 10);
1246 			}
1247 			if (prec > u8_strlen(t))
1248 				prec = u8_strlen(t);
1249 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1250 			int i, k, n;
1251 
1252 			if (ljust) { // print prec chars from t, then pad blanks
1253 				n = u8_char2byte(t, prec);
1254 				for (k = 0; k < n; k++) {
1255 					//putchar(t[k]);
1256 					*p++ = t[k];
1257 				}
1258 				for (i = 0; i < pad; i++) {
1259 					//printf(" ");
1260 					*p++ = ' ';
1261 				}
1262 			} else { // print pad blanks, then prec chars from t
1263 				for (i = 0; i < pad; i++) {
1264 					//printf(" ");
1265 					*p++ = ' ';
1266 				}
1267 				n = u8_char2byte(t, prec);
1268 				for (k = 0; k < n; k++) {
1269 					//putchar(t[k]);
1270 					*p++ = t[k];
1271 				}
1272 			}
1273 			*p = 0;
1274 			break;
1275 		}
1276 
1277                case 'c': {
1278 			/*
1279 			 * If a numeric value is given, awk should just turn
1280 			 * it into a character and print it:
1281 			 *      BEGIN { printf("%c\n", 65) }
1282 			 * prints "A".
1283 			 *
1284 			 * But what if the numeric value is > 128 and
1285 			 * represents a valid Unicode code point?!? We do
1286 			 * our best to convert it back into UTF-8. If we
1287 			 * can't, we output the encoding of the Unicode
1288 			 * "invalid character", 0xFFFD.
1289 			 */
1290 			if (isnum(x)) {
1291 				int charval = (int) getfval(x);
1292 
1293 				if (charval != 0) {
1294 					if (charval < 128 || awk_mb_cur_max == 1)
1295 						snprintf(p, BUFSZ(p), fmt, charval);
1296 					else {
1297 						// possible unicode character
1298 						size_t count;
1299 						char *bs = wide_char_to_byte_str(charval, &count);
1300 
1301 						if (bs == NULL)	{ // invalid character
1302 							// use unicode invalid character, 0xFFFD
1303 							static char invalid_char[] = "\357\277\275";
1304 							bs = invalid_char;
1305 							count = 3;
1306 						}
1307 						t = bs;
1308 						n = count;
1309 						goto format_percent_c;
1310 					}
1311 				} else {
1312 					*p++ = '\0'; /* explicit null byte */
1313 					*p = '\0';   /* next output will start here */
1314 				}
1315 				break;
1316 			}
1317 			t = getsval(x);
1318 			n = u8_nextlen(t);
1319 		format_percent_c:
1320 			if (n < 2) { /* not utf8 */
1321 				snprintf(p, BUFSZ(p), fmt, getsval(x)[0]);
1322 				break;
1323 			}
1324 
1325 			// utf8 character, almost same song and dance as for %s
1326 			int ljust = 0, wid = 0, prec = n, pad = 0;
1327 			char *f = fmt+1;
1328 			if (f[0] == '-') {
1329 				ljust = 1;
1330 				f++;
1331 			}
1332 			// flags '0' and '+' are recognized but skipped
1333 			if (f[0] == '0') {
1334 				f++;
1335 				if (f[0] == '+')
1336 					f++;
1337 			}
1338 			if (f[0] == '+') {
1339 				f++;
1340 				if (f[0] == '0')
1341 					f++;
1342 			}
1343 			if (isdigit(f[0])) { /* there is a wid */
1344 				wid = strtol(f, &f, 10);
1345 			}
1346 			if (f[0] == '.') { /* there is a .prec */
1347 				prec = strtol(++f, &f, 10);
1348 			}
1349 			if (prec > 1)           // %c --> only one character
1350 				prec = 1;
1351 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1352 			int i;
1353 
1354 			if (ljust) { // print one char from t, then pad blanks
1355 				for (i = 0; i < n; i++)
1356 					*p++ = t[i];
1357 				for (i = 0; i < pad; i++) {
1358 					//printf(" ");
1359 					*p++ = ' ';
1360 				}
1361 			} else { // print pad blanks, then prec chars from t
1362 				for (i = 0; i < pad; i++) {
1363 					//printf(" ");
1364 					*p++ = ' ';
1365 				}
1366 				for (i = 0; i < n; i++)
1367 					*p++ = t[i];
1368 			}
1369 			*p = 0;
1370 			break;
1371 		}
1372 		default:
1373 			FATAL("can't happen: bad conversion %c in format()", flag);
1374 		}
1375 
1376 		tempfree(x);
1377 		p += strlen(p);
1378 		s++;
1379 	}
1380 	*p = '\0';
1381 	free(fmt);
1382 	for ( ; a; a = a->nnext) {		/* evaluate any remaining args */
1383 		x = execute(a);
1384 		tempfree(x);
1385 	}
1386 	*pbuf = buf;
1387 	*pbufsize = bufsize;
1388 	return p - buf;
1389 }
1390 
1391 Cell *awksprintf(Node **a, int n)		/* sprintf(a[0]) */
1392 {
1393 	Cell *x;
1394 	Node *y;
1395 	char *buf;
1396 	int bufsz=3*recsize;
1397 
1398 	if ((buf = (char *) malloc(bufsz)) == NULL)
1399 		FATAL("out of memory in awksprintf");
1400 	y = a[0]->nnext;
1401 	x = execute(a[0]);
1402 	if (format(&buf, &bufsz, getsval(x), y) == -1)
1403 		FATAL("sprintf string %.30s... too long.  can't happen.", buf);
1404 	tempfree(x);
1405 	x = gettemp();
1406 	x->sval = buf;
1407 	x->tval = STR;
1408 	return(x);
1409 }
1410 
1411 Cell *awkprintf(Node **a, int n)		/* printf */
1412 {	/* a[0] is list of args, starting with format string */
1413 	/* a[1] is redirection operator, a[2] is redirection file */
1414 	FILE *fp;
1415 	Cell *x;
1416 	Node *y;
1417 	char *buf;
1418 	int len;
1419 	int bufsz=3*recsize;
1420 
1421 	if ((buf = (char *) malloc(bufsz)) == NULL)
1422 		FATAL("out of memory in awkprintf");
1423 	y = a[0]->nnext;
1424 	x = execute(a[0]);
1425 	if ((len = format(&buf, &bufsz, getsval(x), y)) == -1)
1426 		FATAL("printf string %.30s... too long.  can't happen.", buf);
1427 	tempfree(x);
1428 	if (a[1] == NULL) {
1429 		/* fputs(buf, stdout); */
1430 		fwrite(buf, len, 1, stdout);
1431 		if (ferror(stdout))
1432 			FATAL("write error on stdout");
1433 	} else {
1434 		fp = redirect(ptoi(a[1]), a[2]);
1435 		/* fputs(buf, fp); */
1436 		fwrite(buf, len, 1, fp);
1437 		fflush(fp);
1438 		if (ferror(fp))
1439 			FATAL("write error on %s", filename(fp));
1440 	}
1441 	free(buf);
1442 	return(True);
1443 }
1444 
1445 Cell *arith(Node **a, int n)	/* a[0] + a[1], etc.  also -a[0] */
1446 {
1447 	Awkfloat i, j = 0;
1448 	double v;
1449 	Cell *x, *y, *z;
1450 
1451 	x = execute(a[0]);
1452 	i = getfval(x);
1453 	tempfree(x);
1454 	if (n != UMINUS && n != UPLUS) {
1455 		y = execute(a[1]);
1456 		j = getfval(y);
1457 		tempfree(y);
1458 	}
1459 	z = gettemp();
1460 	switch (n) {
1461 	case ADD:
1462 		i += j;
1463 		break;
1464 	case MINUS:
1465 		i -= j;
1466 		break;
1467 	case MULT:
1468 		i *= j;
1469 		break;
1470 	case DIVIDE:
1471 		if (j == 0)
1472 			FATAL("division by zero");
1473 		i /= j;
1474 		break;
1475 	case MOD:
1476 		if (j == 0)
1477 			FATAL("division by zero in mod");
1478 		modf(i/j, &v);
1479 		i = i - j * v;
1480 		break;
1481 	case UMINUS:
1482 		i = -i;
1483 		break;
1484 	case UPLUS: /* handled by getfval(), above */
1485 		break;
1486 	case POWER:
1487 		if (j >= 0 && modf(j, &v) == 0.0)	/* pos integer exponent */
1488 			i = ipow(i, (int) j);
1489                else {
1490 			errno = 0;
1491 			i = errcheck(pow(i, j), "pow");
1492                }
1493 		break;
1494 	default:	/* can't happen */
1495 		FATAL("illegal arithmetic operator %d", n);
1496 	}
1497 	setfval(z, i);
1498 	return(z);
1499 }
1500 
/*
 * ipow: x**n by repeated squaring.  ought to be done by pow, but isn't
 * always.  Any n <= 0 yields 1.
 */
double ipow(double x, int n)
{
	double half;

	if (n <= 0)
		return 1;
	half = ipow(x, n/2);
	/* odd exponents need one extra factor of x */
	return (n % 2 == 0) ? half * half : x * half * half;
}
1513 
1514 Cell *incrdecr(Node **a, int n)		/* a[0]++, etc. */
1515 {
1516 	Cell *x, *z;
1517 	int k;
1518 	Awkfloat xf;
1519 
1520 	x = execute(a[0]);
1521 	xf = getfval(x);
1522 	k = (n == PREINCR || n == POSTINCR) ? 1 : -1;
1523 	if (n == PREINCR || n == PREDECR) {
1524 		setfval(x, xf + k);
1525 		return(x);
1526 	}
1527 	z = gettemp();
1528 	setfval(z, xf);
1529 	setfval(x, xf + k);
1530 	tempfree(x);
1531 	return(z);
1532 }
1533 
1534 Cell *assign(Node **a, int n)	/* a[0] = a[1], a[0] += a[1], etc. */
1535 {		/* this is subtle; don't muck with it. */
1536 	Cell *x, *y;
1537 	Awkfloat xf, yf;
1538 	double v;
1539 
1540 	y = execute(a[1]);
1541 	x = execute(a[0]);
1542 	if (n == ASSIGN) {	/* ordinary assignment */
1543 		if (x == y && !(x->tval & (FLD|REC)) && x != nfloc)
1544 			;	/* self-assignment: leave alone unless it's a field or NF */
1545 		else if ((y->tval & (STR|NUM)) == (STR|NUM)) {
1546 			yf = getfval(y);
1547 			setsval(x, getsval(y));
1548 			x->fval = yf;
1549 			x->tval |= NUM;
1550 		}
1551 		else if (isstr(y))
1552 			setsval(x, getsval(y));
1553 		else if (isnum(y))
1554 			setfval(x, getfval(y));
1555 		else
1556 			funnyvar(y, "read value of");
1557 		tempfree(y);
1558 		return(x);
1559 	}
1560 	xf = getfval(x);
1561 	yf = getfval(y);
1562 	switch (n) {
1563 	case ADDEQ:
1564 		xf += yf;
1565 		break;
1566 	case SUBEQ:
1567 		xf -= yf;
1568 		break;
1569 	case MULTEQ:
1570 		xf *= yf;
1571 		break;
1572 	case DIVEQ:
1573 		if (yf == 0)
1574 			FATAL("division by zero in /=");
1575 		xf /= yf;
1576 		break;
1577 	case MODEQ:
1578 		if (yf == 0)
1579 			FATAL("division by zero in %%=");
1580 		modf(xf/yf, &v);
1581 		xf = xf - yf * v;
1582 		break;
1583 	case POWEQ:
1584 		if (yf >= 0 && modf(yf, &v) == 0.0)	/* pos integer exponent */
1585 			xf = ipow(xf, (int) yf);
1586                else {
1587 			errno = 0;
1588 			xf = errcheck(pow(xf, yf), "pow");
1589                }
1590 		break;
1591 	default:
1592 		FATAL("illegal assignment operator %d", n);
1593 		break;
1594 	}
1595 	tempfree(y);
1596 	setfval(x, xf);
1597 	return(x);
1598 }
1599 
1600 Cell *cat(Node **a, int q)	/* a[0] cat a[1] */
1601 {
1602 	Cell *x, *y, *z;
1603 	int n1, n2;
1604 	char *s = NULL;
1605 	int ssz = 0;
1606 
1607 	x = execute(a[0]);
1608 	n1 = strlen(getsval(x));
1609 	adjbuf(&s, &ssz, n1 + 1, recsize, 0, "cat1");
1610 	memcpy(s, x->sval, n1);
1611 
1612 	tempfree(x);
1613 
1614 	y = execute(a[1]);
1615 	n2 = strlen(getsval(y));
1616 	adjbuf(&s, &ssz, n1 + n2 + 1, recsize, 0, "cat2");
1617 	memcpy(s + n1, y->sval, n2);
1618 	s[n1 + n2] = '\0';
1619 
1620 	tempfree(y);
1621 
1622 	z = gettemp();
1623 	z->sval = s;
1624 	z->tval = STR;
1625 
1626 	return(z);
1627 }
1628 
1629 Cell *pastat(Node **a, int n)	/* a[0] { a[1] } */
1630 {
1631 	Cell *x;
1632 
1633 	if (a[0] == NULL)
1634 		x = execute(a[1]);
1635 	else {
1636 		x = execute(a[0]);
1637 		if (istrue(x)) {
1638 			tempfree(x);
1639 			x = execute(a[1]);
1640 		}
1641 	}
1642 	return x;
1643 }
1644 
1645 Cell *dopa2(Node **a, int n)	/* a[0], a[1] { a[2] } */
1646 {
1647 	Cell *x;
1648 	int pair;
1649 
1650 	pair = ptoi(a[3]);
1651 	if (pairstack[pair] == 0) {
1652 		x = execute(a[0]);
1653 		if (istrue(x))
1654 			pairstack[pair] = 1;
1655 		tempfree(x);
1656 	}
1657 	if (pairstack[pair] == 1) {
1658 		x = execute(a[1]);
1659 		if (istrue(x))
1660 			pairstack[pair] = 0;
1661 		tempfree(x);
1662 		x = execute(a[2]);
1663 		return(x);
1664 	}
1665 	return(False);
1666 }
1667 
1668 Cell *split(Node **a, int nnn)	/* split(a[0], a[1], a[2]); a[3] is type */
1669 {
1670 	Cell *x = NULL, *y, *ap;
1671 	const char *s, *origs, *t;
1672 	const char *fs = NULL;
1673 	char *origfs = NULL;
1674 	int sep;
1675 	char temp, num[50];
1676 	int n, tempstat, arg3type;
1677 	int j;
1678 	double result;
1679 
1680 	y = execute(a[0]);	/* source string */
1681 	origs = s = strdup(getsval(y));
1682 	tempfree(y);
1683 	arg3type = ptoi(a[3]);
1684 	if (a[2] == NULL) {		/* BUG: CSV should override implicit fs but not explicit */
1685 		fs = getsval(fsloc);
1686 	} else if (arg3type == STRING) {	/* split(str,arr,"string") */
1687 		x = execute(a[2]);
1688 		fs = origfs = strdup(getsval(x));
1689 		tempfree(x);
1690 	} else if (arg3type == REGEXPR) {
1691 		fs = "(regexpr)";	/* split(str,arr,/regexpr/) */
1692 	} else {
1693 		FATAL("illegal type of split");
1694 	}
1695 	sep = *fs;
1696 	ap = execute(a[1]);	/* array name */
1697 /* BUG 7/26/22: this appears not to reset array: see C1/asplit */
1698 	freesymtab(ap);
1699 	DPRINTF("split: s=|%s|, a=%s, sep=|%s|\n", s, NN(ap->nval), fs);
1700 	ap->tval &= ~STR;
1701 	ap->tval |= ARR;
1702 	ap->sval = (char *) makesymtab(NSYMTAB);
1703 
1704 	n = 0;
1705         if (arg3type == REGEXPR && strlen((char*)((fa*)a[2])->restr) == 0) {
1706 		/* split(s, a, //); have to arrange that it looks like empty sep */
1707 		arg3type = 0;
1708 		fs = "";
1709 		sep = 0;
1710 	}
1711 	if (*s != '\0' && (strlen(fs) > 1 || arg3type == REGEXPR)) {	/* reg expr */
1712 		fa *pfa;
1713 		if (arg3type == REGEXPR) {	/* it's ready already */
1714 			pfa = (fa *) a[2];
1715 		} else {
1716 			pfa = makedfa(fs, 1);
1717 		}
1718 		if (nematch(pfa,s)) {
1719 			tempstat = pfa->initstat;
1720 			pfa->initstat = 2;
1721 			do {
1722 				n++;
1723 				snprintf(num, sizeof(num), "%d", n);
1724 				temp = *patbeg;
1725 				setptr(patbeg, '\0');
1726 				if (is_number(s, & result))
1727 					setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1728 				else
1729 					setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1730 				setptr(patbeg, temp);
1731 				s = patbeg + patlen;
1732 				if (*(patbeg+patlen-1) == '\0' || *s == '\0') {
1733 					n++;
1734 					snprintf(num, sizeof(num), "%d", n);
1735 					setsymtab(num, "", 0.0, STR, (Array *) ap->sval);
1736 					pfa->initstat = tempstat;
1737 					goto spdone;
1738 				}
1739 			} while (nematch(pfa,s));
1740 			pfa->initstat = tempstat; 	/* bwk: has to be here to reset */
1741 							/* cf gsub and refldbld */
1742 		}
1743 		n++;
1744 		snprintf(num, sizeof(num), "%d", n);
1745 		if (is_number(s, & result))
1746 			setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1747 		else
1748 			setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1749   spdone:
1750 		pfa = NULL;
1751 
1752 	} else if (a[2] == NULL && CSV) {	/* CSV only if no explicit separator */
1753 		char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */
1754 		for (;;) {
1755 			char *fr = newt;
1756 			n++;
1757 			if (*s == '"' ) { /* start of "..." */
1758 				for (s++ ; *s != '\0'; ) {
1759 					if (*s == '"' && s[1] != '\0' && s[1] == '"') {
1760 						s += 2; /* doubled quote */
1761 						*fr++ = '"';
1762 					} else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) {
1763 						s++; /* skip over closing quote */
1764 						break;
1765 					} else {
1766 						*fr++ = *s++;
1767 					}
1768 				}
1769 				*fr++ = 0;
1770 			} else {	/* unquoted field */
1771 				while (*s != ',' && *s != '\0')
1772 					*fr++ = *s++;
1773 				*fr++ = 0;
1774 			}
1775 			snprintf(num, sizeof(num), "%d", n);
1776 			if (is_number(newt, &result))
1777 				setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval);
1778 			else
1779 				setsymtab(num, newt, 0.0, STR, (Array *) ap->sval);
1780 			if (*s++ == '\0')
1781 				break;
1782 		}
1783 		free(newt);
1784 
1785 	} else if (!CSV && sep == ' ') { /* usual case: split on white space */
1786 		for (n = 0; ; ) {
1787 #define ISWS(c)	((c) == ' ' || (c) == '\t' || (c) == '\n')
1788 			while (ISWS(*s))
1789 				s++;
1790 			if (*s == '\0')
1791 				break;
1792 			n++;
1793 			t = s;
1794 			do
1795 				s++;
1796 			while (*s != '\0' && !ISWS(*s));
1797 			temp = *s;
1798 			setptr(s, '\0');
1799 			snprintf(num, sizeof(num), "%d", n);
1800 			if (is_number(t, & result))
1801 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1802 			else
1803 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1804 			setptr(s, temp);
1805 			if (*s != '\0')
1806 				s++;
1807 		}
1808 
1809 	} else if (sep == 0) {	/* new: split(s, a, "") => 1 char/elem */
1810 		for (n = 0; *s != '\0'; s += u8_nextlen(s)) {
1811 			char buf[10];
1812 			n++;
1813 			snprintf(num, sizeof(num), "%d", n);
1814 
1815 			for (j = 0; j < u8_nextlen(s); j++) {
1816 				buf[j] = s[j];
1817 			}
1818 			buf[j] = '\0';
1819 
1820 			if (isdigit((uschar)buf[0]))
1821 				setsymtab(num, buf, atof(buf), STR|NUM, (Array *) ap->sval);
1822 			else
1823 				setsymtab(num, buf, 0.0, STR, (Array *) ap->sval);
1824 		}
1825 
1826 	} else if (*s != '\0') {  /* some random single character */
1827 		for (;;) {
1828 			n++;
1829 			t = s;
1830 			while (*s != sep && *s != '\n' && *s != '\0')
1831 				s++;
1832 			temp = *s;
1833 			setptr(s, '\0');
1834 			snprintf(num, sizeof(num), "%d", n);
1835 			if (is_number(t, & result))
1836 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1837 			else
1838 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1839 			setptr(s, temp);
1840 			if (*s++ == '\0')
1841 				break;
1842 		}
1843 	}
1844 	tempfree(ap);
1845 	xfree(origs);
1846 	xfree(origfs);
1847 	x = gettemp();
1848 	x->tval = NUM;
1849 	x->fval = n;
1850 	return(x);
1851 }
1852 
1853 Cell *condexpr(Node **a, int n)	/* a[0] ? a[1] : a[2] */
1854 {
1855 	Cell *x;
1856 
1857 	x = execute(a[0]);
1858 	if (istrue(x)) {
1859 		tempfree(x);
1860 		x = execute(a[1]);
1861 	} else {
1862 		tempfree(x);
1863 		x = execute(a[2]);
1864 	}
1865 	return(x);
1866 }
1867 
1868 Cell *ifstat(Node **a, int n)	/* if (a[0]) a[1]; else a[2] */
1869 {
1870 	Cell *x;
1871 
1872 	x = execute(a[0]);
1873 	if (istrue(x)) {
1874 		tempfree(x);
1875 		x = execute(a[1]);
1876 	} else if (a[2] != NULL) {
1877 		tempfree(x);
1878 		x = execute(a[2]);
1879 	}
1880 	return(x);
1881 }
1882 
1883 Cell *whilestat(Node **a, int n)	/* while (a[0]) a[1] */
1884 {
1885 	Cell *x;
1886 
1887 	for (;;) {
1888 		x = execute(a[0]);
1889 		if (!istrue(x))
1890 			return(x);
1891 		tempfree(x);
1892 		x = execute(a[1]);
1893 		if (isbreak(x)) {
1894 			x = True;
1895 			return(x);
1896 		}
1897 		if (isnext(x) || isexit(x) || isret(x))
1898 			return(x);
1899 		tempfree(x);
1900 	}
1901 }
1902 
1903 Cell *dostat(Node **a, int n)	/* do a[0]; while(a[1]) */
1904 {
1905 	Cell *x;
1906 
1907 	for (;;) {
1908 		x = execute(a[0]);
1909 		if (isbreak(x))
1910 			return True;
1911 		if (isnext(x) || isexit(x) || isret(x))
1912 			return(x);
1913 		tempfree(x);
1914 		x = execute(a[1]);
1915 		if (!istrue(x))
1916 			return(x);
1917 		tempfree(x);
1918 	}
1919 }
1920 
1921 Cell *forstat(Node **a, int n)	/* for (a[0]; a[1]; a[2]) a[3] */
1922 {
1923 	Cell *x;
1924 
1925 	x = execute(a[0]);
1926 	tempfree(x);
1927 	for (;;) {
1928 		if (a[1]!=NULL) {
1929 			x = execute(a[1]);
1930 			if (!istrue(x)) return(x);
1931 			else tempfree(x);
1932 		}
1933 		x = execute(a[3]);
1934 		if (isbreak(x))		/* turn off break */
1935 			return True;
1936 		if (isnext(x) || isexit(x) || isret(x))
1937 			return(x);
1938 		tempfree(x);
1939 		x = execute(a[2]);
1940 		tempfree(x);
1941 	}
1942 }
1943 
1944 Cell *instat(Node **a, int n)	/* for (a[0] in a[1]) a[2] */
1945 {
1946 	Cell *x, *vp, *arrayp, *cp, *ncp;
1947 	Array *tp;
1948 	int i;
1949 
1950 	vp = execute(a[0]);
1951 	arrayp = execute(a[1]);
1952 	if (!isarr(arrayp)) {
1953 		return True;
1954 	}
1955 	tp = (Array *) arrayp->sval;
1956 	tempfree(arrayp);
1957 	for (i = 0; i < tp->size; i++) {	/* this routine knows too much */
1958 		for (cp = tp->tab[i]; cp != NULL; cp = ncp) {
1959 			setsval(vp, cp->nval);
1960 			ncp = cp->cnext;
1961 			x = execute(a[2]);
1962 			if (isbreak(x)) {
1963 				tempfree(vp);
1964 				return True;
1965 			}
1966 			if (isnext(x) || isexit(x) || isret(x)) {
1967 				tempfree(vp);
1968 				return(x);
1969 			}
1970 			tempfree(x);
1971 		}
1972 	}
1973 	return True;
1974 }
1975 
1976 static char *nawk_convert(const char *s, int (*fun_c)(int),
1977     wint_t (*fun_wc)(wint_t))
1978 {
1979 	char *buf      = NULL;
1980 	char *pbuf     = NULL;
1981 	const char *ps = NULL;
1982 	size_t n       = 0;
1983 	wchar_t wc;
1984 	const size_t sz = awk_mb_cur_max;
1985 	int unused;
1986 
1987 	if (sz == 1) {
1988 		buf = tostring(s);
1989 
1990 		for (pbuf = buf; *pbuf; pbuf++)
1991 			*pbuf = fun_c((uschar)*pbuf);
1992 
1993 		return buf;
1994 	} else {
1995 		/* upper/lower character may be shorter/longer */
1996 		buf = tostringN(s, strlen(s) * sz + 1);
1997 
1998 		(void) mbtowc(NULL, NULL, 0);	/* reset internal state */
1999 		/*
2000 		 * Reset internal state here too.
2001 		 * Assign result to avoid a compiler warning. (Casting to void
2002 		 * doesn't work.)
2003 		 * Increment said variable to avoid a different warning.
2004 		 */
2005 		unused = wctomb(NULL, L'\0');
2006 		unused++;
2007 
2008 		ps   = s;
2009 		pbuf = buf;
2010 		while (n = mbtowc(&wc, ps, sz),
2011 		       n > 0 && n != (size_t)-1 && n != (size_t)-2)
2012 		{
2013 			ps += n;
2014 
2015 			n = wctomb(pbuf, fun_wc(wc));
2016 			if (n == (size_t)-1)
2017 				FATAL("illegal wide character %s", s);
2018 
2019 			pbuf += n;
2020 		}
2021 
2022 		*pbuf = '\0';
2023 
2024 		if (n)
2025 			FATAL("illegal byte sequence %s", s);
2026 
2027 		return buf;
2028 	}
2029 }
2030 
2031 #ifdef __DJGPP__
/*
 * towupper: DJGPP stand-in; values 0..255 are upper-cased with toupper(),
 * everything else is returned unchanged.
 */
static wint_t towupper(wint_t wc)
{
	if (!(wc >= 0 && wc < 256))
		return wc;

	return toupper(wc & 0xFF);
}
2039 
/*
 * towlower: DJGPP stand-in; values 0..255 are lower-cased with tolower(),
 * everything else is returned unchanged.
 */
static wint_t towlower(wint_t wc)
{
	if (!(wc >= 0 && wc < 256))
		return wc;

	return tolower(wc & 0xFF);
}
2047 #endif
2048 
/* nawk_toupper: newly allocated upper-case copy of s, via nawk_convert() */
static char *nawk_toupper(const char *s)
{
	char *upper = nawk_convert(s, toupper, towupper);

	return upper;
}
2053 
/* nawk_tolower: newly allocated lower-case copy of s, via nawk_convert() */
static char *nawk_tolower(const char *s)
{
	char *lower = nawk_convert(s, tolower, towlower);

	return lower;
}
2058 
2059 
2060 
2061 Cell *bltin(Node **a, int n)	/* builtin functions. a[0] is type, a[1] is arg list */
2062 {
2063 	Cell *x, *y;
2064 	Awkfloat u;
2065 	int t;
2066 	Awkfloat tmp;
2067 	char *buf;
2068 	Node *nextarg;
2069 	FILE *fp;
2070 	int status = 0;
2071 	int estatus = 0;
2072 
2073 	t = ptoi(a[0]);
2074 	x = execute(a[1]);
2075 	nextarg = a[1]->nnext;
2076 	switch (t) {
2077 	case FLENGTH:
2078 		if (isarr(x))
2079 			u = ((Array *) x->sval)->nelem;	/* GROT.  should be function*/
2080 		else
2081 			u = u8_strlen(getsval(x));
2082 		break;
2083 	case FLOG:
2084 		errno = 0;
2085 		u = errcheck(log(getfval(x)), "log");
2086 		break;
2087 	case FINT:
2088 		modf(getfval(x), &u); break;
2089 	case FEXP:
2090 		errno = 0;
2091 		u = errcheck(exp(getfval(x)), "exp");
2092 		break;
2093 	case FSQRT:
2094 		errno = 0;
2095 		u = errcheck(sqrt(getfval(x)), "sqrt");
2096 		break;
2097 	case FSIN:
2098 		u = sin(getfval(x)); break;
2099 	case FCOS:
2100 		u = cos(getfval(x)); break;
2101 	case FATAN:
2102 		if (nextarg == NULL) {
2103 			WARNING("atan2 requires two arguments; returning 1.0");
2104 			u = 1.0;
2105 		} else {
2106 			y = execute(a[1]->nnext);
2107 			u = atan2(getfval(x), getfval(y));
2108 			tempfree(y);
2109 			nextarg = nextarg->nnext;
2110 		}
2111 		break;
2112 	case FSYSTEM:
2113 		fflush(stdout);		/* in case something is buffered already */
2114 		estatus = status = system(getsval(x));
2115 		if (status != -1) {
2116 			if (WIFEXITED(status)) {
2117 				estatus = WEXITSTATUS(status);
2118 			} else if (WIFSIGNALED(status)) {
2119 				estatus = WTERMSIG(status) + 256;
2120 #ifdef WCOREDUMP
2121 				if (WCOREDUMP(status))
2122 					estatus += 256;
2123 #endif
2124 			} else	/* something else?!? */
2125 				estatus = 0;
2126 		}
2127 		/* else estatus was set to -1 */
2128 		u = estatus;
2129 		break;
2130 	case FRAND:
2131 		/* random() returns numbers in [0..2^31-1]
2132 		 * in order to get a number in [0, 1), divide it by 2^31
2133 		 */
2134 		u = (Awkfloat) random() / (0x7fffffffL + 0x1UL);
2135 		break;
2136 	case FSRAND:
2137 		if (isrec(x))	/* no argument provided */
2138 			u = time((time_t *)0);
2139 		else
2140 			u = getfval(x);
2141 		tmp = u;
2142 		srandom((unsigned long) u);
2143 		u = srand_seed;
2144 		srand_seed = tmp;
2145 		break;
2146 	case FTOUPPER:
2147 	case FTOLOWER:
2148 		if (t == FTOUPPER)
2149 			buf = nawk_toupper(getsval(x));
2150 		else
2151 			buf = nawk_tolower(getsval(x));
2152 		tempfree(x);
2153 		x = gettemp();
2154 		setsval(x, buf);
2155 		free(buf);
2156 		return x;
2157 	case FFLUSH:
2158 		if (isrec(x) || strlen(getsval(x)) == 0) {
2159 			flush_all();	/* fflush() or fflush("") -> all */
2160 			u = 0;
2161 		} else if ((fp = openfile(FFLUSH, getsval(x), NULL)) == NULL)
2162 			u = EOF;
2163 		else
2164 			u = fflush(fp);
2165 		break;
2166 	default:	/* can't happen */
2167 		FATAL("illegal function type %d", t);
2168 		break;
2169 	}
2170 	tempfree(x);
2171 	x = gettemp();
2172 	setfval(x, u);
2173 	if (nextarg != NULL) {
2174 		WARNING("warning: function has too many arguments");
2175 		for ( ; nextarg; nextarg = nextarg->nnext) {
2176 			y = execute(nextarg);
2177 			tempfree(y);
2178 		}
2179 	}
2180 	return(x);
2181 }
2182 
2183 Cell *printstat(Node **a, int n)	/* print a[0] */
2184 {
2185 	Node *x;
2186 	Cell *y;
2187 	FILE *fp;
2188 
2189 	if (a[1] == NULL)	/* a[1] is redirection operator, a[2] is file */
2190 		fp = stdout;
2191 	else
2192 		fp = redirect(ptoi(a[1]), a[2]);
2193 	for (x = a[0]; x != NULL; x = x->nnext) {
2194 		y = execute(x);
2195 		fputs(getpssval(y), fp);
2196 		tempfree(y);
2197 		if (x->nnext == NULL)
2198 			fputs(getsval(orsloc), fp);
2199 		else
2200 			fputs(getsval(ofsloc), fp);
2201 	}
2202 	if (a[1] != NULL)
2203 		fflush(fp);
2204 	if (ferror(fp))
2205 		FATAL("write error on %s", filename(fp));
2206 	return(True);
2207 }
2208 
2209 Cell *nullproc(Node **a, int n)
2210 {
2211 	return 0;
2212 }
2213 
2214 
2215 FILE *redirect(int a, Node *b)	/* set up all i/o redirections */
2216 {
2217 	FILE *fp;
2218 	Cell *x;
2219 	char *fname;
2220 
2221 	x = execute(b);
2222 	fname = getsval(x);
2223 	fp = openfile(a, fname, NULL);
2224 	if (fp == NULL)
2225 		FATAL("can't open file %s", fname);
2226 	tempfree(x);
2227 	return fp;
2228 }
2229 
/*
 * Table of streams known to the interpreter.  It is allocated by stdinit(),
 * searched and grown by openfile(), and entries are released by closefile().
 * A slot with fp == NULL is free for reuse.
 */
struct files {
	FILE	*fp;		/* the open stream, or NULL if the slot is unused */
	const char	*fname;	/* name the stream was opened under; compared by openfile()/closefile() */
	int	mode;	/* '|', 'a', 'w' => LE/LT, GT */
} *files;

size_t nfiles;	/* number of slots currently allocated in files[] */
2237 
2238 static void stdinit(void)	/* in case stdin, etc., are not constants */
2239 {
2240 	nfiles = FOPEN_MAX;
2241 	files = (struct files *) calloc(nfiles, sizeof(*files));
2242 	if (files == NULL)
2243 		FATAL("can't allocate file memory for %zu files", nfiles);
2244         files[0].fp = stdin;
2245 	files[0].fname = tostring("/dev/stdin");
2246 	files[0].mode = LT;
2247         files[1].fp = stdout;
2248 	files[1].fname = tostring("/dev/stdout");
2249 	files[1].mode = GT;
2250         files[2].fp = stderr;
2251 	files[2].fname = tostring("/dev/stderr");
2252 	files[2].mode = GT;
2253 }
2254 
2255 FILE *openfile(int a, const char *us, bool *pnewflag)
2256 {
2257 	const char *s = us;
2258 	size_t i;
2259 	int m;
2260 	FILE *fp = NULL;
2261 
2262 	if (*s == '\0')
2263 		FATAL("null file name in print or getline");
2264 	for (i = 0; i < nfiles; i++)
2265 		if (files[i].fname && strcmp(s, files[i].fname) == 0 &&
2266 		    (a == files[i].mode || (a==APPEND && files[i].mode==GT) ||
2267 		     a == FFLUSH)) {
2268 			if (pnewflag)
2269 				*pnewflag = false;
2270 			return files[i].fp;
2271 		}
2272 	if (a == FFLUSH)	/* didn't find it, so don't create it! */
2273 		return NULL;
2274 
2275 	for (i = 0; i < nfiles; i++)
2276 		if (files[i].fp == NULL)
2277 			break;
2278 	if (i >= nfiles) {
2279 		struct files *nf;
2280 		size_t nnf = nfiles + FOPEN_MAX;
2281 		nf = (struct files *) realloc(files, nnf * sizeof(*nf));
2282 		if (nf == NULL)
2283 			FATAL("cannot grow files for %s and %zu files", s, nnf);
2284 		memset(&nf[nfiles], 0, FOPEN_MAX * sizeof(*nf));
2285 		nfiles = nnf;
2286 		files = nf;
2287 	}
2288 	fflush(stdout);	/* force a semblance of order */
2289 	m = a;
2290 	if (a == GT) {
2291 		fp = fopen(s, "w");
2292 	} else if (a == APPEND) {
2293 		fp = fopen(s, "a");
2294 		m = GT;	/* so can mix > and >> */
2295 	} else if (a == '|') {	/* output pipe */
2296 		fp = popen(s, "w");
2297 	} else if (a == LE) {	/* input pipe */
2298 		fp = popen(s, "r");
2299 	} else if (a == LT) {	/* getline <file */
2300 		fp = strcmp(s, "-") == 0 ? stdin : fopen(s, "r");	/* "-" is stdin */
2301 	} else	/* can't happen */
2302 		FATAL("illegal redirection %d", a);
2303 	if (fp != NULL) {
2304 		files[i].fname = tostring(s);
2305 		files[i].fp = fp;
2306 		files[i].mode = m;
2307 		if (pnewflag)
2308 			*pnewflag = true;
2309 		if (fp != stdin && fp != stdout && fp != stderr)
2310 			(void) fcntl(fileno(fp), F_SETFD, FD_CLOEXEC);
2311 	}
2312 	return fp;
2313 }
2314 
2315 const char *filename(FILE *fp)
2316 {
2317 	size_t i;
2318 
2319 	for (i = 0; i < nfiles; i++)
2320 		if (fp == files[i].fp)
2321 			return files[i].fname;
2322 	return "???";
2323 }
2324 
2325 Cell *closefile(Node **a, int n)
2326 {
2327  	Cell *x;
2328 	size_t i;
2329 	bool stat;
2330 
2331  	x = execute(a[0]);
2332  	getsval(x);
2333 	stat = true;
2334  	for (i = 0; i < nfiles; i++) {
2335 		if (!files[i].fname || strcmp(x->sval, files[i].fname) != 0)
2336 			continue;
2337 		if (files[i].mode == GT || files[i].mode == '|')
2338 			fflush(files[i].fp);
2339 		if (ferror(files[i].fp)) {
2340 			if ((files[i].mode == GT && files[i].fp != stderr)
2341 			  || files[i].mode == '|')
2342 				FATAL("write error on %s", files[i].fname);
2343 			else
2344 				WARNING("i/o error occurred on %s", files[i].fname);
2345 		}
2346 		if (files[i].fp == stdin || files[i].fp == stdout ||
2347 		    files[i].fp == stderr)
2348 			stat = freopen("/dev/null", "r+", files[i].fp) == NULL;
2349 		else if (files[i].mode == '|' || files[i].mode == LE)
2350 			stat = pclose(files[i].fp) == -1;
2351 		else
2352 			stat = fclose(files[i].fp) == EOF;
2353 		if (stat)
2354 			WARNING("i/o error occurred closing %s", files[i].fname);
2355 		xfree(files[i].fname);
2356 		files[i].fname = NULL;	/* watch out for ref thru this */
2357 		files[i].fp = NULL;
2358 		break;
2359  	}
2360  	tempfree(x);
2361  	x = gettemp();
2362 	setfval(x, (Awkfloat) (stat ? -1 : 0));
2363  	return(x);
2364 }
2365 
2366 void closeall(void)
2367 {
2368 	size_t i;
2369 	bool stat = false;
2370 
2371 	for (i = 0; i < nfiles; i++) {
2372 		if (! files[i].fp)
2373 			continue;
2374 		if (files[i].mode == GT || files[i].mode == '|')
2375 			fflush(files[i].fp);
2376 		if (ferror(files[i].fp)) {
2377 			if ((files[i].mode == GT && files[i].fp != stderr)
2378 			  || files[i].mode == '|')
2379 				FATAL("write error on %s", files[i].fname);
2380 			else
2381 				WARNING("i/o error occurred on %s", files[i].fname);
2382 		}
2383 		if (files[i].fp == stdin || files[i].fp == stdout ||
2384 		    files[i].fp == stderr)
2385 			continue;
2386 		if (files[i].mode == '|' || files[i].mode == LE)
2387 			stat = pclose(files[i].fp) == -1;
2388 		else
2389 			stat = fclose(files[i].fp) == EOF;
2390 		if (stat)
2391 			WARNING("i/o error occurred while closing %s", files[i].fname);
2392 	}
2393 }
2394 
2395 static void flush_all(void)
2396 {
2397 	size_t i;
2398 
2399 	for (i = 0; i < nfiles; i++)
2400 		if (files[i].fp)
2401 			fflush(files[i].fp);
2402 }
2403 
2404 void backsub(char **pb_ptr, const char **sptr_ptr);
2405 
2406 Cell *dosub(Node **a, int subop)        /* sub and gsub */
2407 {
2408 	fa *pfa;
2409 	int tempstat;
2410 	char *repl;
2411 	Cell *x;
2412 
2413 	char *buf = NULL;
2414 	char *pb = NULL;
2415 	int bufsz = recsize;
2416 
2417 	const char *r, *s;
2418 	const char *start;
2419 	const char *noempty = NULL;      /* empty match disallowed here */
2420 	size_t m = 0;                    /* match count */
2421 	size_t whichm;                   /* which match to select, 0 = global */
2422 	int mtype;                       /* match type */
2423 
2424 	if (a[0] == NULL) {	/* 0 => a[1] is already-compiled regexpr */
2425 		pfa = (fa *) a[1];
2426 	} else {
2427 		x = execute(a[1]);
2428 		pfa = makedfa(getsval(x), 1);
2429 		tempfree(x);
2430 	}
2431 
2432 	x = execute(a[2]);	/* replacement string */
2433 	repl = tostring(getsval(x));
2434 	tempfree(x);
2435 
2436 	switch (subop) {
2437 	case SUB:
2438 		whichm = 1;
2439 		x = execute(a[3]);    /* source string */
2440 		break;
2441 	case GSUB:
2442 		whichm = 0;
2443 		x = execute(a[3]);    /* source string */
2444 		break;
2445 	default:
2446 		FATAL("dosub: unrecognized subop: %d", subop);
2447 	}
2448 
2449 	start = getsval(x);
2450 	while (pmatch(pfa, start)) {
2451 		if (buf == NULL) {
2452 			if ((pb = buf = (char *) malloc(bufsz)) == NULL)
2453 				FATAL("out of memory in dosub");
2454 			tempstat = pfa->initstat;
2455 			pfa->initstat = 2;
2456 		}
2457 
2458 		/* match types */
2459 		#define	MT_IGNORE  0  /* unselected or invalid */
2460 		#define MT_INSERT  1  /* selected, empty */
2461 		#define MT_REPLACE 2  /* selected, not empty */
2462 
2463 		/* an empty match just after replacement is invalid */
2464 
2465 		if (patbeg == noempty && patlen == 0) {
2466 			mtype = MT_IGNORE;    /* invalid, not counted */
2467 		} else if (whichm == ++m || whichm == 0) {
2468 			mtype = patlen ? MT_REPLACE : MT_INSERT;
2469 		} else {
2470 			mtype = MT_IGNORE;    /* unselected, but counted */
2471 		}
2472 
2473 		/* leading text: */
2474 		if (patbeg > start) {
2475 			adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - start),
2476 				recsize, &pb, "dosub");
2477 			s = start;
2478 			while (s < patbeg)
2479 				*pb++ = *s++;
2480 		}
2481 
2482 		if (mtype == MT_IGNORE)
2483 			goto matching_text;  /* skip replacement text */
2484 
2485 		r = repl;
2486 		while (*r != 0) {
2487 			adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "dosub");
2488 			if (*r == '\\') {
2489 				backsub(&pb, &r);
2490 			} else if (*r == '&') {
2491 				r++;
2492 				adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize,
2493 					&pb, "dosub");
2494 				for (s = patbeg; s < patbeg+patlen; )
2495 					*pb++ = *s++;
2496 			} else {
2497 				*pb++ = *r++;
2498 			}
2499 		}
2500 
2501 matching_text:
2502 		if (mtype == MT_REPLACE || *patbeg == '\0')
2503 			goto next_search;  /* skip matching text */
2504 
2505 		if (patlen == 0)
2506 			patlen = u8_nextlen(patbeg);
2507 		adjbuf(&buf, &bufsz, (pb-buf) + patlen, recsize, &pb, "dosub");
2508 		s = patbeg;
2509 		while (s < patbeg + patlen)
2510 			*pb++ = *s++;
2511 
2512 next_search:
2513 		start = patbeg + patlen;
2514 		if (m == whichm || *patbeg == '\0')
2515 			break;
2516 		if (mtype == MT_REPLACE)
2517 			noempty = start;
2518 
2519 		#undef MT_IGNORE
2520 		#undef MT_INSERT
2521 		#undef MT_REPLACE
2522 	}
2523 
2524 	xfree(repl);
2525 
2526 	if (buf != NULL) {
2527 		pfa->initstat = tempstat;
2528 
2529 		/* trailing text */
2530 		adjbuf(&buf, &bufsz, 1+strlen(start)+pb-buf, 0, &pb, "dosub");
2531 		while ((*pb++ = *start++) != '\0')
2532 			;
2533 
2534 		setsval(x, buf);
2535 		free(buf);
2536 	}
2537 
2538 	tempfree(x);
2539 	x = gettemp();
2540 	x->tval = NUM;
2541 	x->fval = m;
2542 	return x;
2543 }
2544 
/*
 * backsub: translate one backslash sequence of a sub/gsub replacement.
 *
 * On entry *sptr_ptr points at a backslash in the replacement text and
 * *pb_ptr at the next free byte of the output.  Both pointers are moved
 * past what was consumed and produced (at most two output bytes):
 *
 *	\\\&	emits \&                        consumes 4
 *	\\&	emits \   (the & is left for the caller)  consumes 2
 *	\\x	emits \\, or a single \ when POSIXLY_CORRECT is set
 *		in the environment              consumes 2
 *	\&	emits a literal &               consumes 2
 *	\x	emits the backslash only        consumes 1
 *
 * The environment is consulted once, on the first call.
 */
void backsub(char **pb_ptr, const char **sptr_ptr)	/* handle \\& variations */
{						/* sptr[0] == '\\' */
	static int posix_mode = -1;	/* -1 until the environment is checked */
	char *out = *pb_ptr;
	const char *in = *sptr_ptr;

	if (posix_mode < 0)
		posix_mode = (getenv("POSIXLY_CORRECT") != NULL);

	switch (in[1]) {
	case '\\':
		if (in[2] == '\\' && in[3] == '&') {	/* \\\& -> \& */
			out[0] = '\\';
			out[1] = '&';
			out += 2;
			in += 4;
		} else if (in[2] == '&') {	/* \\& -> \ + matched */
			*out++ = '\\';
			in += 2;
		} else if (posix_mode) {	/* \\x -> \x */
			*out++ = in[1];
			in += 2;
		} else {			/* \\x -> \\x */
			out[0] = in[0];
			out[1] = in[1];
			out += 2;
			in += 2;
		}
		break;
	case '&':				/* literal & */
		*out++ = in[1];
		in += 2;
		break;
	default:				/* literal \ */
		*out++ = in[0];
		in += 1;
		break;
	}

	*pb_ptr = out;
	*sptr_ptr = in;
}
2581 
/*
 * wide_char_to_byte_str: encode a code point as a UTF-8 byte sequence.
 *
 * rune must lie in 0 .. 0x10FFFF; anything else yields NULL and leaves
 * *outlen untouched.  On success the number of encoded bytes (1 to 4) is
 * stored in *outlen and a pointer to a NUL-terminated static buffer is
 * returned; the buffer is overwritten by the next call.
 */
static char *wide_char_to_byte_str(int rune, size_t *outlen)
{
	static char buf[5];
	int nbytes;
	int bits;
	int k;

	if (rune < 0 || rune > 0x10FFFF)
		return NULL;

	memset(buf, 0, sizeof(buf));

	/* the lead byte depends on how many bytes the value needs */
	if (rune <= 0x0000007F) {
		nbytes = 1;
		buf[0] = rune;
	} else if (rune <= 0x000007FF) {
		nbytes = 2;		/* 110xxxxx 10xxxxxx */
		buf[0] = 0xC0 | (rune >> 6);
	} else if (rune <= 0x0000FFFF) {
		nbytes = 3;		/* 1110xxxx 10xxxxxx 10xxxxxx */
		buf[0] = 0xE0 | (rune >> 12);
	} else {
		nbytes = 4;		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		buf[0] = 0xF0 | (rune >> 18);
	}

	/* continuation bytes carry six bits each, filled last to first */
	bits = rune;
	for (k = nbytes - 1; k > 0; k--) {
		buf[k] = 0x80 | (bits & 0x3F);
		bits >>= 6;
	}

	*outlen = nbytes;
	buf[nbytes] = '\0';

	return buf;
}
2619