xref: /openbsd/usr.bin/awk/run.c (revision 9ea232b5)
1 /*	$OpenBSD: run.c,v 1.84 2024/01/25 16:40:51 millert Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 #define DEBUG
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <errno.h>
30 #include <wctype.h>
31 #include <fcntl.h>
32 #include <setjmp.h>
33 #include <limits.h>
34 #include <math.h>
35 #include <string.h>
36 #include <stdlib.h>
37 #include <time.h>
38 #include <sys/types.h>
39 #include <sys/wait.h>
40 #include "awk.h"
41 #include "awkgram.tab.h"
42 
43 
44 static void stdinit(void);
45 static void flush_all(void);
46 static char *wide_char_to_byte_str(int rune, size_t *outlen);
47 
/*
 * tempfree(x): hand cell x to tfree() only when istemp(x) holds.
 * The macro form is the one compiled (#if 1).  The function form in the
 * #else branch does the same but first warns when an OCELL carries a csub
 * outside the CUNK..CFREE range.
 */
48 #if 1
49 #define tempfree(x)	do { if (istemp(x)) tfree(x); } while (/*CONSTCOND*/0)
50 #else
51 void tempfree(Cell *p) {
52 	if (p->ctype == OCELL && (p->csub < CUNK || p->csub > CFREE)) {
53 		WARNING("bad csub %d in Cell %d %s",
54 			p->csub, p->ctype, p->sval);
55 	}
56 	if (istemp(p))
57 		tfree(p);
58 }
59 #endif
60 
61 /* do we really need these? */
62 /* #ifdef _NFILE */
63 /* #ifndef FOPEN_MAX */
64 /* #define FOPEN_MAX _NFILE */
65 /* #endif */
66 /* #endif */
67 /*  */
68 /* #ifndef	FOPEN_MAX */
69 /* #define	FOPEN_MAX	40 */	/* max number of open files */
70 /* #endif */
71 /*  */
72 /* #ifndef RAND_MAX */
73 /* #define RAND_MAX	32767 */	/* all that ansi guarantees */
74 /* #endif */
75 
/* env: jump target armed by setjmp() in program() and taken by longjmp() in jump() for EXIT */
76 jmp_buf env;
77 extern	int	pairstack[];
78 extern	Awkfloat	srand_seed;
79 
80 Node	*winner = NULL;	/* root of parse tree */
81 Cell	*tmps;		/* free temporary cells for execution */
82 
/*
 * Shared constant cells.  The interpreter functions return pointers to
 * these instead of allocating: True/False for boolean results and the
 * j* cells to signal break, continue, next, nextfile, exit and return.
 */
83 static Cell	truecell	={ OBOOL, BTRUE, 0, 0, 1.0, NUM, NULL, NULL };
84 Cell	*True	= &truecell;
85 static Cell	falsecell	={ OBOOL, BFALSE, 0, 0, 0.0, NUM, NULL, NULL };
86 Cell	*False	= &falsecell;
87 static Cell	breakcell	={ OJUMP, JBREAK, 0, 0, 0.0, NUM, NULL, NULL };
88 Cell	*jbreak	= &breakcell;
89 static Cell	contcell	={ OJUMP, JCONT, 0, 0, 0.0, NUM, NULL, NULL };
90 Cell	*jcont	= &contcell;
91 static Cell	nextcell	={ OJUMP, JNEXT, 0, 0, 0.0, NUM, NULL, NULL };
92 Cell	*jnext	= &nextcell;
93 static Cell	nextfilecell	={ OJUMP, JNEXTFILE, 0, 0, 0.0, NUM, NULL, NULL };
94 Cell	*jnextfile	= &nextfilecell;
95 static Cell	exitcell	={ OJUMP, JEXIT, 0, 0, 0.0, NUM, NULL, NULL };
96 Cell	*jexit	= &exitcell;
97 static Cell	retcell		={ OJUMP, JRET, 0, 0, 0.0, NUM, NULL, NULL };
98 Cell	*jret	= &retcell;
/* tempcell: template that gettemp() copies into every cell it hands out */
99 static Cell	tempcell	={ OCELL, CTEMP, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };
100 
101 Node	*curnode = NULL;	/* the node being executed, for debugging */
102 
103 /* buffer memory management */
/*
 * adjbuf: make sure the heap buffer *pbuf can hold at least minlen bytes.
 *
 * pbuf:    address of pointer to buffer being managed
 * psiz:    address of buffer size variable
 * minlen:  minimum length of buffer needed
 * quantum: buffer size quantum (0 means no rounding)
 * pbptr:   address of movable pointer into buffer, or 0 if none
 * whatrtn: name of the calling routine if failure should cause fatal error
 *
 * Returns 0 for realloc failure (only when whatrtn is NULL), !=0 for
 * success.  On success *pbuf, *psiz and, if given, *pbptr are updated.
 */
int adjbuf(char **pbuf, int *psiz, int minlen, int quantum, char **pbptr,
	const char *whatrtn)
{
	char *grown;
	int slack, offset;

	if (minlen <= *psiz)
		return 1;	/* already large enough */

	slack = (quantum != 0) ? minlen % quantum : 0;
	/* remember where the movable pointer sits before the buffer moves */
	offset = (pbptr != NULL) ? *pbptr - *pbuf : 0;
	/* round up to next multiple of quantum */
	if (slack != 0)
		minlen += quantum - slack;

	grown = (char *) realloc(*pbuf, minlen);
	DPRINTF("adjbuf %s: %d %d (pbuf=%p, tbuf=%p)\n", whatrtn, *psiz, minlen, (void*)*pbuf, (void*)grown);
	if (grown == NULL) {
		if (whatrtn != NULL)
			FATAL("out of memory in %s", whatrtn);
		return 0;
	}

	*pbuf = grown;
	*psiz = minlen;
	if (pbptr != NULL)
		*pbptr = grown + offset;
	return 1;
}
137 
138 void run(Node *a)	/* execution of parse tree starts here */
139 {
140 
141 	stdinit();
142 	execute(a);
143 	closeall();
144 }
145 
146 Cell *execute(Node *u)	/* execute a node of the parse tree */
147 {
148 	Cell *(*proc)(Node **, int);
149 	Cell *x;
150 	Node *a;
151 
152 	if (u == NULL)
153 		return(True);
154 	for (a = u; ; a = a->nnext) {
155 		curnode = a;
156 		if (isvalue(a)) {
157 			x = (Cell *) (a->narg[0]);
158 			if (isfld(x) && !donefld)
159 				fldbld();
160 			else if (isrec(x) && !donerec)
161 				recbld();
162 			return(x);
163 		}
164 		if (notlegal(a->nobj))	/* probably a Cell* but too risky to print */
165 			FATAL("illegal statement");
166 		proc = proctab[a->nobj-FIRSTTOKEN];
167 		x = (*proc)(a->narg, a->nobj);
168 		if (isfld(x) && !donefld)
169 			fldbld();
170 		else if (isrec(x) && !donerec)
171 			recbld();
172 		if (isexpr(a))
173 			return(x);
174 		if (isjump(x))
175 			return(x);
176 		if (a->nnext == NULL)
177 			return(x);
178 		tempfree(x);
179 	}
180 }
181 
182 
/*
 * program: execute an awk program.
 * a[0] is the BEGIN tree, a[1] the main body, a[2] the END tree; any of
 * them may be NULL.  Always returns True.
 *
 * env is armed twice with setjmp(): a longjmp(env, 1) taken while BEGIN
 * or the main loop is running lands at "ex" and goes on to END; one
 * taken while END is running lands at "ex1" and finishes.
 */
183 Cell *program(Node **a, int n)	/* execute an awk program */
184 {				/* a[0] = BEGIN, a[1] = body, a[2] = END */
185 	Cell *x;
186 
187 	if (setjmp(env) != 0)
188 		goto ex;
189 	if (a[0]) {		/* BEGIN */
190 		x = execute(a[0]);
191 		if (isexit(x))
192 			return(True);
193 		if (isjump(x))
194 			FATAL("illegal break, continue, next or nextfile from BEGIN");
195 		tempfree(x);
196 	}
	/* records are read only when there is a body or an END action to run */
197 	if (a[1] || a[2])
198 		while (getrec(&record, &recsize, true) > 0) {
199 			x = execute(a[1]);
200 			if (isexit(x))
201 				break;
202 			tempfree(x);
203 		}
204   ex:
205 	if (setjmp(env) != 0)	/* handles exit within END */
206 		goto ex1;
207 	if (a[2]) {		/* END */
208 		x = execute(a[2]);
209 		if (isbreak(x) || isnext(x) || iscont(x))
210 			FATAL("illegal break, continue, next or nextfile from END");
211 		tempfree(x);
212 	}
213   ex1:
214 	return(True);
215 }
216 
/*
 * One Frame is pushed by call() for every active awk function call and
 * read by arg() and by jump() (for RETURN) through frp.
 */
217 struct Frame {	/* stack frame for awk function calls */
218 	int nargs;	/* number of arguments in this call */
219 	Cell *fcncell;	/* pointer to Cell for function */
220 	Cell **args;	/* pointer to array of arguments after execute */
221 	Cell *retval;	/* return value */
222 };
223 
224 #define	NARGS	50	/* max args in a call */
225 
226 struct Frame *frame = NULL;	/* base of stack frames; dynamically allocated */
227 int	nframe = 0;		/* number of frames allocated; call() grows it in steps of 100 */
228 struct Frame *frp = NULL;	/* frame pointer. bottom level unused */
229 
/*
 * call: invoke a user-defined awk function.
 * a[0] evaluates to the function's Cell; a[1] is the list of actual
 * argument expressions, chained through nnext.
 *
 * The cell's fval holds the number of parameters in the definition and
 * its sval is cast to the Node * of the body.  Array arguments are passed
 * by reference, everything else as a copycell() copy; parameters the
 * caller did not supply get fresh cells initialised from newcopycell.
 * Returns the frame's retval cell, or the body's result unchanged when
 * that result is an exit or next jump.
 */
230 Cell *call(Node **a, int n)	/* function call.  very kludgy and fragile */
231 {
232 	static const Cell newcopycell = { OCELL, CCOPY, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };
233 	int i, ncall, ndef;
234 	int freed = 0; /* handles potential double freeing when fcn & param share a tempcell */
235 	Node *x;
236 	Cell *args[NARGS], *oargs[NARGS];	/* BUG: fixed size arrays */
237 	Cell *y, *z, *fcn;
238 	char *s;
239 
240 	fcn = execute(a[0]);	/* the function itself */
241 	s = fcn->nval;
242 	if (!isfcn(fcn))
243 		FATAL("calling undefined function %s", s);
	/* first call ever: allocate the initial block of 100 frames */
244 	if (frame == NULL) {
245 		frp = frame = (struct Frame *) calloc(nframe += 100, sizeof(*frame));
246 		if (frame == NULL)
247 			FATAL("out of space for stack frames calling %s", s);
248 	}
249 	for (ncall = 0, x = a[1]; x != NULL; x = x->nnext)	/* args in call */
250 		ncall++;
251 	ndef = (int) fcn->fval;			/* args in defn */
252 	DPRINTF("calling %s, %d args (%d in defn), frp=%d\n", s, ncall, ndef, (int) (frp-frame));
253 	if (ncall > ndef)
254 		WARNING("function %s called with %d args, uses only %d",
255 			s, ncall, ndef);
	/* args[] and oargs[] hold NARGS entries each; refuse anything larger */
256 	if (ncall + ndef > NARGS)
257 		FATAL("function %s has %d arguments, limit %d", s, ncall+ndef, NARGS);
258 	for (i = 0, x = a[1]; x != NULL; i++, x = x->nnext) {	/* get call args */
259 		DPRINTF("evaluate args[%d], frp=%d:\n", i, (int) (frp-frame));
260 		y = execute(x);
261 		oargs[i] = y;
262 		DPRINTF("args[%d]: %s %f <%s>, t=%o\n",
263 			i, NN(y->nval), y->fval, isarr(y) ? "(array)" : NN(y->sval), y->tval);
264 		if (isfcn(y))
265 			FATAL("can't use function %s as argument in %s", y->nval, s);
266 		if (isarr(y))
267 			args[i] = y;	/* arrays by ref */
268 		else
269 			args[i] = copycell(y);
270 		tempfree(y);
271 	}
272 	for ( ; i < ndef; i++) {	/* add null args for ones not provided */
273 		args[i] = gettemp();
274 		*args[i] = newcopycell;
275 	}
276 	frp++;	/* now ok to up frame */
	/* frame array exhausted: grow it by 100 and re-derive frp from its index */
277 	if (frp >= frame + nframe) {
278 		int dfp = frp - frame;	/* old index */
279 		frame = (struct Frame *) reallocarray(frame, (nframe += 100), sizeof(*frame));
280 		if (frame == NULL)
281 			FATAL("out of space for stack frames in %s", s);
282 		frp = frame + dfp;
283 	}
284 	frp->fcncell = fcn;
285 	frp->args = args;
286 	frp->nargs = ndef;	/* number defined with (excess are locals) */
287 	frp->retval = gettemp();
288 
289 	DPRINTF("start exec of %s, frp=%d\n", s, (int) (frp-frame));
290 	y = execute((Node *)(fcn->sval));	/* execute body */
291 	DPRINTF("finished exec of %s, frp=%d\n", s, (int) (frp-frame));
292 
	/* release the parameter cells, taking care with arrays and with the body's own result y */
293 	for (i = 0; i < ndef; i++) {
294 		Cell *t = frp->args[i];
295 		if (isarr(t)) {
296 			if (t->csub == CCOPY) {
				/* a copied cell became an array inside the function */
297 				if (i >= ncall) {
					/* it was a local: discard its table and the cell */
298 					freesymtab(t);
299 					t->csub = CTEMP;
300 					tempfree(t);
301 				} else {
					/* it was a caller argument: hand the new table to the caller's cell */
302 					oargs[i]->tval = t->tval;
303 					oargs[i]->tval &= ~(STR|NUM|DONTFREE);
304 					oargs[i]->sval = t->sval;
305 					tempfree(t);
306 				}
307 			}
308 		} else if (t != y) {	/* kludge to prevent freeing twice */
309 			t->csub = CTEMP;
310 			tempfree(t);
311 		} else if (t == y && t->csub == CCOPY) {
312 			t->csub = CTEMP;
313 			tempfree(t);
314 			freed = 1;
315 		}
316 	}
317 	tempfree(fcn);
	/* NOTE(review): this path returns without the frp-- done below - confirm that is intended */
318 	if (isexit(y) || isnext(y))
319 		return y;
320 	if (freed == 0) {
321 		tempfree(y);	/* don't free twice! */
322 	}
323 	z = frp->retval;			/* return value */
324 	DPRINTF("%s returns %g |%s| %o\n", s, getfval(z), getsval(z), z->tval);
325 	frp--;
326 	return(z);
327 }
328 
329 Cell *copycell(Cell *x)	/* make a copy of a cell in a temp */
330 {
331 	Cell *y;
332 
333 	/* copy is not constant or field */
334 
335 	y = gettemp();
336 	y->tval = x->tval & ~(CON|FLD|REC);
337 	y->csub = CCOPY;	/* prevents freeing until call is over */
338 	y->nval = x->nval;	/* BUG? */
339 	if (isstr(x) /* || x->ctype == OCELL */) {
340 		y->sval = tostring(x->sval);
341 		y->tval &= ~DONTFREE;
342 	} else
343 		y->tval |= DONTFREE;
344 	y->fval = x->fval;
345 	return y;
346 }
347 
348 Cell *arg(Node **a, int n)	/* nth argument of a function */
349 {
350 
351 	n = ptoi(a[0]);	/* argument number, counting from 0 */
352 	DPRINTF("arg(%d), frp->nargs=%d\n", n, frp->nargs);
353 	if (n+1 > frp->nargs)
354 		FATAL("argument #%d of function %s was not supplied",
355 			n+1, frp->fcncell->nval);
356 	return frp->args[n];
357 }
358 
359 Cell *jump(Node **a, int n)	/* break, continue, next, nextfile, return */
360 {
361 	Cell *y;
362 
363 	switch (n) {
364 	case EXIT:
365 		if (a[0] != NULL) {
366 			y = execute(a[0]);
367 			errorflag = (int) getfval(y);
368 			tempfree(y);
369 		}
370 		longjmp(env, 1);
371 	case RETURN:
372 		if (a[0] != NULL) {
373 			y = execute(a[0]);
374 			if ((y->tval & (STR|NUM)) == (STR|NUM)) {
375 				setsval(frp->retval, getsval(y));
376 				frp->retval->fval = getfval(y);
377 				frp->retval->tval |= NUM;
378 			}
379 			else if (y->tval & STR)
380 				setsval(frp->retval, getsval(y));
381 			else if (y->tval & NUM)
382 				setfval(frp->retval, getfval(y));
383 			else		/* can't happen */
384 				FATAL("bad type variable %d", y->tval);
385 			tempfree(y);
386 		}
387 		return(jret);
388 	case NEXT:
389 		return(jnext);
390 	case NEXTFILE:
391 		nextfile();
392 		return(jnextfile);
393 	case BREAK:
394 		return(jbreak);
395 	case CONTINUE:
396 		return(jcont);
397 	default:	/* can't happen */
398 		FATAL("illegal jump type %d", n);
399 	}
400 	return 0;	/* not reached */
401 }
402 
403 Cell *awkgetline(Node **a, int n)	/* get next line from specific input */
404 {		/* a[0] is variable, a[1] is operator, a[2] is filename */
405 	Cell *r, *x;
406 	extern Cell **fldtab;
407 	FILE *fp;
408 	char *buf;
409 	int bufsize = recsize;
410 	int mode;
411 	bool newflag;
412 	double result;
413 
414 	if ((buf = (char *) malloc(bufsize)) == NULL)
415 		FATAL("out of memory in getline");
416 
417 	fflush(stdout);	/* in case someone is waiting for a prompt */
418 	r = gettemp();
419 	if (a[1] != NULL) {		/* getline < file */
420 		x = execute(a[2]);		/* filename */
421 		mode = ptoi(a[1]);
422 		if (mode == '|')		/* input pipe */
423 			mode = LE;	/* arbitrary flag */
424 		fp = openfile(mode, getsval(x), &newflag);
425 		tempfree(x);
426 		if (fp == NULL)
427 			n = -1;
428 		else
429 			n = readrec(&buf, &bufsize, fp, newflag);
430 		if (n <= 0) {
431 			;
432 		} else if (a[0] != NULL) {	/* getline var <file */
433 			x = execute(a[0]);
434 			setsval(x, buf);
435 			if (is_number(x->sval, & result)) {
436 				x->fval = result;
437 				x->tval |= NUM;
438 			}
439 			tempfree(x);
440 		} else {			/* getline <file */
441 			setsval(fldtab[0], buf);
442 			if (is_number(fldtab[0]->sval, & result)) {
443 				fldtab[0]->fval = result;
444 				fldtab[0]->tval |= NUM;
445 			}
446 		}
447 	} else {			/* bare getline; use current input */
448 		if (a[0] == NULL)	/* getline */
449 			n = getrec(&record, &recsize, true);
450 		else {			/* getline var */
451 			n = getrec(&buf, &bufsize, false);
452 			if (n > 0) {
453 				x = execute(a[0]);
454 				setsval(x, buf);
455 				if (is_number(x->sval, & result)) {
456 					x->fval = result;
457 					x->tval |= NUM;
458 				}
459 				tempfree(x);
460 			}
461 		}
462 	}
463 	setfval(r, (Awkfloat) n);
464 	free(buf);
465 	return r;
466 }
467 
468 Cell *getnf(Node **a, int n)	/* get NF */
469 {
470 	if (!donefld)
471 		fldbld();
472 	return (Cell *) a[0];
473 }
474 
475 static char *
476 makearraystring(Node *p, const char *func)
477 {
478 	char *buf;
479 	int bufsz = recsize;
480 	size_t blen;
481 
482 	if ((buf = (char *) malloc(bufsz)) == NULL) {
483 		FATAL("%s: out of memory", func);
484 	}
485 
486 	blen = 0;
487 	buf[blen] = '\0';
488 
489 	for (; p; p = p->nnext) {
490 		Cell *x = execute(p);	/* expr */
491 		char *s = getsval(x);
492 		size_t seplen = strlen(getsval(subseploc));
493 		size_t nsub = p->nnext ? seplen : 0;
494 		size_t slen = strlen(s);
495 		size_t tlen = blen + slen + nsub;
496 
497 		if (!adjbuf(&buf, &bufsz, tlen + 1, recsize, 0, func)) {
498 			FATAL("%s: out of memory %s[%s...]",
499 			    func, x->nval, buf);
500 		}
501 		memcpy(buf + blen, s, slen);
502 		if (nsub) {
503 			memcpy(buf + blen + slen, *SUBSEP, nsub);
504 		}
505 		buf[tlen] = '\0';
506 		blen = tlen;
507 		tempfree(x);
508 	}
509 	return buf;
510 }
511 
512 Cell *array(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
513 {
514 	Cell *x, *z;
515 	char *buf;
516 
517 	x = execute(a[0]);	/* Cell* for symbol table */
518 	buf = makearraystring(a[1], __func__);
519 	if (!isarr(x)) {
520 		DPRINTF("making %s into an array\n", NN(x->nval));
521 		if (freeable(x))
522 			xfree(x->sval);
523 		x->tval &= ~(STR|NUM|DONTFREE);
524 		x->tval |= ARR;
525 		x->sval = (char *) makesymtab(NSYMTAB);
526 	}
527 	z = setsymtab(buf, "", 0.0, STR|NUM, (Array *) x->sval);
528 	z->ctype = OCELL;
529 	z->csub = CVAR;
530 	tempfree(x);
531 	free(buf);
532 	return(z);
533 }
534 
535 Cell *awkdelete(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
536 {
537 	Cell *x;
538 
539 	x = execute(a[0]);	/* Cell* for symbol table */
540 	if (x == symtabloc) {
541 		FATAL("cannot delete SYMTAB or its elements");
542 	}
543 	if (!isarr(x))
544 		return True;
545 	if (a[1] == NULL) {	/* delete the elements, not the table */
546 		freesymtab(x);
547 		x->tval &= ~STR;
548 		x->tval |= ARR;
549 		x->sval = (char *) makesymtab(NSYMTAB);
550 	} else {
551 		char *buf = makearraystring(a[1], __func__);
552 		freeelem(x, buf);
553 		free(buf);
554 	}
555 	tempfree(x);
556 	return True;
557 }
558 
559 Cell *intest(Node **a, int n)	/* a[0] is index (list), a[1] is symtab */
560 {
561 	Cell *ap, *k;
562 	char *buf;
563 
564 	ap = execute(a[1]);	/* array name */
565 	if (!isarr(ap)) {
566 		DPRINTF("making %s into an array\n", ap->nval);
567 		if (freeable(ap))
568 			xfree(ap->sval);
569 		ap->tval &= ~(STR|NUM|DONTFREE);
570 		ap->tval |= ARR;
571 		ap->sval = (char *) makesymtab(NSYMTAB);
572 	}
573 	buf = makearraystring(a[0], __func__);
574 	k = lookup(buf, (Array *) ap->sval);
575 	tempfree(ap);
576 	free(buf);
577 	if (k == NULL)
578 		return(False);
579 	else
580 		return(True);
581 }
582 
583 
584 /* ======== utf-8 code ========== */
585 
586 /*
587  * Awk strings can contain ascii, random 8-bit items (eg Latin-1),
588  * or utf-8.  u8_isutf tests whether a string starts with a valid
589  * utf-8 sequence, and returns 0 if not (e.g., high bit set).
590  * u8_nextlen returns length of next valid sequence, which is
591  * 1 for ascii, 2..4 for utf-8, or 1 for high bit non-utf.
592  * u8_strlen returns length of string in valid utf-8 sequences
593  * and/or high-bit bytes.  Conversion functions go between byte
594  * number and character number.
595  *
596  * In theory, this behaves the same as before for non-utf8 bytes.
597  *
598  * Limited checking! This is a potential security hole.
599  */
600 
601 /* is s the beginning of a valid utf-8 string? */
602 /* return length 1..4 if yes, 0 if no */
603 int u8_isutf(const char *s)
604 {
605 	int n, ret;
606 	unsigned char c;
607 
608 	c = s[0];
609 	if (c < 128 || awk_mb_cur_max == 1)
610 		return 1; /* what if it's 0? */
611 
612 	n = strlen(s);
613 	if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
614 		ret = 2; /* 110xxxxx 10xxxxxx */
615 	} else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
616 			 && (s[2] & 0xC0) == 0x80) {
617 		ret = 3; /* 1110xxxx 10xxxxxx 10xxxxxx */
618 	} else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
619 			 && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
620 		ret = 4; /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
621 	} else {
622 		ret = 0;
623 	}
624 	return ret;
625 }
626 
627 /* Convert (prefix of) utf8 string to utf-32 rune. */
628 /* Sets *rune to the value, returns the length. */
629 /* No error checking: watch out. */
630 int u8_rune(int *rune, const char *s)
631 {
632 	int n, ret;
633 	unsigned char c;
634 
635 	c = s[0];
636 	if (c < 128 || awk_mb_cur_max == 1) {
637 		*rune = c;
638 		return 1;
639 	}
640 
641 	n = strlen(s);
642 	if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
643 		*rune = ((c & 0x1F) << 6) | (s[1] & 0x3F); /* 110xxxxx 10xxxxxx */
644 		ret = 2;
645 	} else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
646 			  && (s[2] & 0xC0) == 0x80) {
647 		*rune = ((c & 0xF) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
648 			/* 1110xxxx 10xxxxxx 10xxxxxx */
649 		ret = 3;
650 	} else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
651 			  && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
652 		*rune = ((c & 0x7) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
653 			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
654 		ret = 4;
655 	} else {
656 		*rune = c;
657 		ret = 1;
658 	}
659 	return ret; /* returns one byte if sequence doesn't look like utf */
660 }
661 
/*
 * u8_nextlen: length in bytes of the next sequence in s:
 * 1 for ascii or a stray high-bit byte, 2..4 for valid utf8.
 */
int u8_nextlen(const char *s)
{
	int seqlen = u8_isutf(s);

	/* u8_isutf reports 0 for a malformed sequence; step over one byte */
	return (seqlen != 0) ? seqlen : 1;
}
672 
673 /* return number of utf characters or single non-utf bytes */
674 int u8_strlen(const char *s)
675 {
676 	int i, len, n, totlen;
677 	unsigned char c;
678 
679 	n = strlen(s);
680 	totlen = 0;
681 	for (i = 0; i < n; i += len) {
682 		c = s[i];
683 		if (c < 128 || awk_mb_cur_max == 1) {
684 			len = 1;
685 		} else {
686 			len = u8_nextlen(&s[i]);
687 		}
688 		totlen++;
689 		if (i > n)
690 			FATAL("bad utf count [%s] n=%d i=%d\n", s, n, i);
691 	}
692 	return totlen;
693 }
694 
/*
 * u8_char2byte: convert utf-8 char number in a string to its byte
 * offset.
 * charnum counts characters from 0.  The walk stops at the terminating
 * NUL, so a charnum beyond the end of s yields the byte length of s
 * instead of reading past the buffer.
 */
int u8_char2byte(const char *s, int charnum)
{
	int n;
	int bytenum = 0;

	while (charnum > 0 && *s != '\0') {
		n = u8_nextlen(s);
		s += n;
		bytenum += n;
		charnum--;
	}
	return bytenum;
}
709 
/*
 * u8_byte2char: convert byte offset in s to the utf-8 char number that
 * starts there.
 * The result is origin 1: offset 0 maps to character 1, so that 0 stays
 * free to mean "no match".  Returns -1 when bytenum is beyond the end
 * of s.
 */
int u8_byte2char(const char *s, int bytenum)
{
	int total = strlen(s);
	int pos = 0;
	int count = 0;

	if (bytenum > total) {
		return -1; /* ??? */
	}
	while (pos <= bytenum) {
		pos += u8_nextlen(s + pos);
		count++;
	}
	return count;
}
727 
728 /* runetochar() adapted from rune.c in the Plan 9 distribution */
729 
/* Constants for the UTF-8 encoder below; values are written out in full. */
enum
{
	Runeerror = 128, /* from somewhere else */
	Runemax = 0x10FFFF,

	/* payload bits carried by each kind of byte */
	Bit1    = 7,
	Bitx    = 6,
	Bit2    = 5,
	Bit3    = 4,
	Bit4    = 3,
	Bit5    = 2,

	/* tag bits: ((1 << (BitN+1)) - 1) ^ 0xFF */
	T1      = 0x00,		/* 0000 0000 */
	Tx      = 0x80,		/* 1000 0000 */
	T2      = 0xC0,		/* 1100 0000 */
	T3      = 0xE0,		/* 1110 0000 */
	T4      = 0xF0,		/* 1111 0000 */
	T5      = 0xF8,		/* 1111 1000 */

	/* largest rune that fits in 1, 2, 3, 4 bytes */
	Rune1   = 0x7F,		/* 0000 0000 0000 0000 0111 1111 */
	Rune2   = 0x7FF,	/* 0000 0000 0000 0111 1111 1111 */
	Rune3   = 0xFFFF,	/* 0000 0000 1111 1111 1111 1111 */
	Rune4   = 0x1FFFFF,	/* 0001 1111 1111 1111 1111 1111 */

	Maskx   = 0x3F,		/* 0011 1111 */
	Testx   = 0xC0,		/* 1100 0000 */

};

/*
 * runetochar: encode rune c as UTF-8 into str.
 * Returns the number of bytes written (1..4); str is not NUL-terminated.
 * Values above Runemax are replaced by Runeerror before encoding.
 */
int runetochar(char *str, int c)
{
	int nbytes, lead, i;

	/* one character sequence 00000-0007F => 00-7F */
	if (c <= Rune1) {
		str[0] = c;
		return 1;
	}

	if (c <= Rune2) {
		/* two character sequence 00080-007FF => T2 Tx */
		nbytes = 2;
		lead = T2;
	} else {
		/* out of range runes become the error rune, checked only here */
		if (c > Runemax)
			c = Runeerror;
		if (c <= Rune3) {
			/* three character sequence 00800-0FFFF => T3 Tx Tx */
			nbytes = 3;
			lead = T3;
		} else {
			/* four character sequence 010000-1FFFFF => T4 Tx Tx Tx */
			nbytes = 4;
			lead = T4;
		}
	}

	/* fill continuation bytes from the last one back, six bits each */
	for (i = nbytes - 1; i > 0; i--) {
		str[i] = Tx | (c & Maskx);
		c >>= Bitx;
	}
	str[0] = lead | c;
	return nbytes;
}
791 
792 
793 /* ========== end of utf8 code =========== */
794 
795 
796 
797 Cell *matchop(Node **a, int n)	/* ~ and match() */
798 {
799 	Cell *x, *y, *z;
800 	char *s, *t;
801 	int i;
802 	int cstart, cpatlen, len;
803 	fa *pfa;
804 	int (*mf)(fa *, const char *) = match, mode = 0;
805 
806 	if (n == MATCHFCN) {
807 		mf = pmatch;
808 		mode = 1;
809 	}
810 	x = execute(a[1]);	/* a[1] = target text */
811 	s = getsval(x);
812 	if (a[0] == NULL)	/* a[1] == 0: already-compiled reg expr */
813 		i = (*mf)((fa *) a[2], s);
814 	else {
815 		y = execute(a[2]);	/* a[2] = regular expr */
816 		t = getsval(y);
817 		pfa = makedfa(t, mode);
818 		i = (*mf)(pfa, s);
819 		tempfree(y);
820 	}
821 	z = x;
822 	if (n == MATCHFCN) {
823 		int start = patbeg - s + 1; /* origin 1 */
824 		if (patlen < 0) {
825 			start = 0; /* not found */
826 		} else {
827 			cstart = u8_byte2char(s, start-1);
828 			cpatlen = 0;
829 			for (i = 0; i < patlen; i += len) {
830 				len = u8_nextlen(patbeg+i);
831 				cpatlen++;
832 			}
833 
834 			start = cstart;
835 			patlen = cpatlen;
836 		}
837 
838 		setfval(rstartloc, (Awkfloat) start);
839 		setfval(rlengthloc, (Awkfloat) patlen);
840 		x = gettemp();
841 		x->tval = NUM;
842 		x->fval = start;
843 	} else if ((n == MATCH && i == 1) || (n == NOTMATCH && i == 0))
844 		x = True;
845 	else
846 		x = False;
847 
848 	tempfree(z);
849 	return x;
850 }
851 
852 
853 Cell *boolop(Node **a, int n)	/* a[0] || a[1], a[0] && a[1], !a[0] */
854 {
855 	Cell *x, *y;
856 	int i;
857 
858 	x = execute(a[0]);
859 	i = istrue(x);
860 	tempfree(x);
861 	switch (n) {
862 	case BOR:
863 		if (i) return(True);
864 		y = execute(a[1]);
865 		i = istrue(y);
866 		tempfree(y);
867 		if (i) return(True);
868 		else return(False);
869 	case AND:
870 		if ( !i ) return(False);
871 		y = execute(a[1]);
872 		i = istrue(y);
873 		tempfree(y);
874 		if (i) return(True);
875 		else return(False);
876 	case NOT:
877 		if (i) return(False);
878 		else return(True);
879 	default:	/* can't happen */
880 		FATAL("unknown boolean operator %d", n);
881 	}
882 	return 0;	/*NOTREACHED*/
883 }
884 
885 Cell *relop(Node **a, int n)	/* a[0 < a[1], etc. */
886 {
887 	int i;
888 	Cell *x, *y;
889 	Awkfloat j;
890 	bool x_is_nan, y_is_nan;
891 
892 	x = execute(a[0]);
893 	y = execute(a[1]);
894 	x_is_nan = isnan(x->fval);
895 	y_is_nan = isnan(y->fval);
896 	if (x->tval&NUM && y->tval&NUM) {
897 		if ((x_is_nan || y_is_nan) && n != NE)
898 			return(False);
899 		j = x->fval - y->fval;
900 		i = j<0? -1: (j>0? 1: 0);
901 	} else {
902 		i = strcmp(getsval(x), getsval(y));
903 	}
904 	tempfree(x);
905 	tempfree(y);
906 	switch (n) {
907 	case LT:	if (i<0) return(True);
908 			else return(False);
909 	case LE:	if (i<=0) return(True);
910 			else return(False);
911 	case NE:	if (x_is_nan && y_is_nan) return(True);
912 			else if (i!=0) return(True);
913 			else return(False);
914 	case EQ:	if (i == 0) return(True);
915 			else return(False);
916 	case GE:	if (i>=0) return(True);
917 			else return(False);
918 	case GT:	if (i>0) return(True);
919 			else return(False);
920 	default:	/* can't happen */
921 		FATAL("unknown relational operator %d", n);
922 	}
923 	return 0;	/*NOTREACHED*/
924 }
925 
926 void tfree(Cell *a)	/* free a tempcell */
927 {
928 	if (freeable(a)) {
929 		DPRINTF("freeing %s %s %o\n", NN(a->nval), NN(a->sval), a->tval);
930 		xfree(a->sval);
931 	}
932 	if (a == tmps)
933 		FATAL("tempcell list is curdled");
934 	a->cnext = tmps;
935 	tmps = a;
936 }
937 
938 Cell *gettemp(void)	/* get a tempcell */
939 {	int i;
940 	Cell *x;
941 
942 	if (!tmps) {
943 		tmps = (Cell *) calloc(100, sizeof(*tmps));
944 		if (!tmps)
945 			FATAL("out of space for temporaries");
946 		for (i = 1; i < 100; i++)
947 			tmps[i-1].cnext = &tmps[i];
948 		tmps[i-1].cnext = NULL;
949 	}
950 	x = tmps;
951 	tmps = x->cnext;
952 	*x = tempcell;
953 	return(x);
954 }
955 
956 Cell *indirect(Node **a, int n)	/* $( a[0] ) */
957 {
958 	Awkfloat val;
959 	Cell *x;
960 	int m;
961 	char *s;
962 
963 	x = execute(a[0]);
964 	val = getfval(x);	/* freebsd: defend against super large field numbers */
965 	if ((Awkfloat)INT_MAX < val)
966 		FATAL("trying to access out of range field %s", x->nval);
967 	m = (int) val;
968 	if (m == 0 && !is_number(s = getsval(x), NULL))	/* suspicion! */
969 		FATAL("illegal field $(%s), name \"%s\"", s, x->nval);
970 		/* BUG: can x->nval ever be null??? */
971 	tempfree(x);
972 	x = fieldadr(m);
973 	x->ctype = OCELL;	/* BUG?  why are these needed? */
974 	x->csub = CFLD;
975 	return(x);
976 }
977 
978 Cell *substr(Node **a, int nnn)		/* substr(a[0], a[1], a[2]) */
979 {
980 	int k, m, n;
981 	int mb, nb;
982 	char *s;
983 	int temp;
984 	Cell *x, *y, *z = NULL;
985 
986 	x = execute(a[0]);
987 	y = execute(a[1]);
988 	if (a[2] != NULL)
989 		z = execute(a[2]);
990 	s = getsval(x);
991 	k = u8_strlen(s) + 1;
992 	if (k <= 1) {
993 		tempfree(x);
994 		tempfree(y);
995 		if (a[2] != NULL) {
996 			tempfree(z);
997 		}
998 		x = gettemp();
999 		setsval(x, "");
1000 		return(x);
1001 	}
1002 	m = (int) getfval(y);
1003 	if (m <= 0)
1004 		m = 1;
1005 	else if (m > k)
1006 		m = k;
1007 	tempfree(y);
1008 	if (a[2] != NULL) {
1009 		n = (int) getfval(z);
1010 		tempfree(z);
1011 	} else
1012 		n = k - 1;
1013 	if (n < 0)
1014 		n = 0;
1015 	else if (n > k - m)
1016 		n = k - m;
1017 	/* m is start, n is length from there */
1018 	DPRINTF("substr: m=%d, n=%d, s=%s\n", m, n, s);
1019 	y = gettemp();
1020 	mb = u8_char2byte(s, m-1); /* byte offset of start char in s */
1021 	nb = u8_char2byte(s, m-1+n);  /* byte offset of end+1 char in s */
1022 
1023 	temp = s[nb];	/* with thanks to John Linderman */
1024 	s[nb] = '\0';
1025 	setsval(y, s + mb);
1026 	s[nb] = temp;
1027 	tempfree(x);
1028 	return(y);
1029 }
1030 
1031 Cell *sindex(Node **a, int nnn)		/* index(a[0], a[1]) */
1032 {
1033 	Cell *x, *y, *z;
1034 	char *s1, *s2, *p1, *p2, *q;
1035 	Awkfloat v = 0.0;
1036 
1037 	x = execute(a[0]);
1038 	s1 = getsval(x);
1039 	y = execute(a[1]);
1040 	s2 = getsval(y);
1041 
1042 	z = gettemp();
1043 	for (p1 = s1; *p1 != '\0'; p1++) {
1044 		for (q = p1, p2 = s2; *p2 != '\0' && *q == *p2; q++, p2++)
1045 			continue;
1046 		if (*p2 == '\0') {
1047 			/* v = (Awkfloat) (p1 - s1 + 1);	 origin 1 */
1048 
1049 		   /* should be a function: used in match() as well */
1050 			int i, len;
1051 			v = 0;
1052 			for (i = 0; i < p1-s1+1; i += len) {
1053 				len = u8_nextlen(s1+i);
1054 				v++;
1055 			}
1056 			break;
1057 		}
1058 	}
1059 	tempfree(x);
1060 	tempfree(y);
1061 	setfval(z, v);
1062 	return(z);
1063 }
1064 
/*
 * has_utf8: return 1 if s contains any utf-8 (2 bytes or more)
 * character, 0 otherwise.
 */
int has_utf8(char *s)
{
	int step;

	while (*s != 0) {
		step = u8_nextlen(s);
		if (step > 1)
			return 1;
		s += step;
	}
	return 0;
}
1076 
/* MAXNUMSIZE: bytes of headroom format() asks adjbuf() for before each conversion */
1077 #define	MAXNUMSIZE	50
1078 
1079 int format(char **pbuf, int *pbufsize, const char *s, Node *a)	/* printf-like conversions */
1080 {
1081 	char *fmt;
1082 	char *p, *t;
1083 	const char *os;
1084 	Cell *x;
1085 	int flag = 0, n;
1086 	int fmtwd; /* format width */
1087 	int fmtsz = recsize;
1088 	char *buf = *pbuf;
1089 	int bufsize = *pbufsize;
1090 #define FMTSZ(a)   (fmtsz - ((a) - fmt))
1091 #define BUFSZ(a)   (bufsize - ((a) - buf))
1092 
1093 	static bool first = true;
1094 	static bool have_a_format = false;
1095 
1096 	if (first) {
1097 		char xbuf[100];
1098 
1099 		snprintf(xbuf, sizeof(xbuf), "%a", 42.0);
1100 		have_a_format = (strcmp(xbuf, "0x1.5p+5") == 0);
1101 		first = false;
1102 	}
1103 
1104 	os = s;
1105 	p = buf;
1106 	if ((fmt = (char *) malloc(fmtsz)) == NULL)
1107 		FATAL("out of memory in format()");
1108 	while (*s) {
1109 		adjbuf(&buf, &bufsize, MAXNUMSIZE+1+p-buf, recsize, &p, "format1");
1110 		if (*s != '%') {
1111 			*p++ = *s++;
1112 			continue;
1113 		}
1114 		if (*(s+1) == '%') {
1115 			*p++ = '%';
1116 			s += 2;
1117 			continue;
1118 		}
1119 		fmtwd = atoi(s+1);
1120 		if (fmtwd < 0)
1121 			fmtwd = -fmtwd;
1122 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format2");
1123 		for (t = fmt; (*t++ = *s) != '\0'; s++) {
1124 			if (!adjbuf(&fmt, &fmtsz, MAXNUMSIZE+1+t-fmt, recsize, &t, "format3"))
1125 				FATAL("format item %.30s... ran format() out of memory", os);
1126 			/* Ignore size specifiers */
1127 			if (strchr("hjLlqtz", *s) != NULL) {	/* the ansi panoply */
1128 				t--;
1129 				continue;
1130 			}
1131 			if (isalpha((uschar)*s))
1132 				break;
1133 			if (*s == '$') {
1134 				FATAL("'$' not permitted in awk formats");
1135 			}
1136 			if (*s == '*') {
1137 				if (a == NULL) {
1138 					FATAL("not enough args in printf(%s)", os);
1139 				}
1140 				x = execute(a);
1141 				a = a->nnext;
1142 				snprintf(t - 1, FMTSZ(t - 1),
1143 				    "%d", fmtwd=(int) getfval(x));
1144 				if (fmtwd < 0)
1145 					fmtwd = -fmtwd;
1146 				adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format");
1147 				t = fmt + strlen(fmt);
1148 				tempfree(x);
1149 			}
1150 		}
1151 		*t = '\0';
1152 		if (fmtwd < 0)
1153 			fmtwd = -fmtwd;
1154 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format4");
1155 		switch (*s) {
1156 		case 'a': case 'A':
1157 			if (have_a_format)
1158 				flag = *s;
1159 			else
1160 				flag = 'f';
1161 			break;
1162 		case 'f': case 'e': case 'g': case 'E': case 'G':
1163 			flag = 'f';
1164 			break;
1165 		case 'd': case 'i': case 'o': case 'x': case 'X': case 'u':
1166 			flag = (*s == 'd' || *s == 'i') ? 'd' : 'u';
1167 			*(t-1) = 'j';
1168 			*t = *s;
1169 			*++t = '\0';
1170 			break;
1171 		case 's':
1172 			flag = 's';
1173 			break;
1174 		case 'c':
1175 			flag = 'c';
1176 			break;
1177 		default:
1178 			WARNING("weird printf conversion %s", fmt);
1179 			flag = '?';
1180 			break;
1181 		}
1182 		if (a == NULL)
1183 			FATAL("not enough args in printf(%s)", os);
1184 		x = execute(a);
1185 		a = a->nnext;
1186 		n = MAXNUMSIZE;
1187 		if (fmtwd > n)
1188 			n = fmtwd;
1189 		adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format5");
1190 		switch (flag) {
1191 		case '?':
1192 			snprintf(p, BUFSZ(p), "%s", fmt);	/* unknown, so dump it too */
1193 			t = getsval(x);
1194 			n = strlen(t);
1195 			if (fmtwd > n)
1196 				n = fmtwd;
1197 			adjbuf(&buf, &bufsize, 1+strlen(p)+n+p-buf, recsize, &p, "format6");
1198 			p += strlen(p);
1199 			snprintf(p, BUFSZ(p), "%s", t);
1200 			break;
1201 		case 'a':
1202 		case 'A':
1203 		case 'f':	snprintf(p, BUFSZ(p), fmt, getfval(x)); break;
1204 		case 'd':	snprintf(p, BUFSZ(p), fmt, (intmax_t) getfval(x)); break;
1205 		case 'u':	snprintf(p, BUFSZ(p), fmt, (uintmax_t) getfval(x)); break;
1206 
1207 		case 's': {
1208 			t = getsval(x);
1209 			n = strlen(t);
1210 			/* if simple format or no utf-8 in the string, sprintf works */
1211 			if (!has_utf8(t) || strcmp(fmt,"%s") == 0) {
1212 				if (fmtwd > n)
1213 					n = fmtwd;
1214 				if (!adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format7"))
1215 					FATAL("huge string/format (%d chars) in printf %.30s..." \
1216 						" ran format() out of memory", n, t);
1217 				snprintf(p, BUFSZ(p), fmt, t);
1218 				break;
1219 			}
1220 
1221 			/* get here if string has utf-8 chars and fmt is not plain %s */
1222 			/* "%-w.ps", where -, w and .p are all optional */
1223 			/* '0' before the w is a flag character */
1224 			/* fmt points at % */
1225 			int ljust = 0, wid = 0, prec = n, pad = 0;
1226 			char *f = fmt+1;
1227 			if (f[0] == '-') {
1228 				ljust = 1;
1229 				f++;
1230 			}
1231 			// flags '0' and '+' are recognized but skipped
1232 			if (f[0] == '0') {
1233 				f++;
1234 				if (f[0] == '+')
1235 					f++;
1236 			}
1237 			if (f[0] == '+') {
1238 				f++;
1239 				if (f[0] == '0')
1240 					f++;
1241 			}
1242 			if (isdigit((uschar)f[0])) { /* there is a wid */
1243 				wid = strtol(f, &f, 10);
1244 			}
1245 			if (f[0] == '.') { /* there is a .prec */
1246 				prec = strtol(++f, &f, 10);
1247 			}
1248 			if (prec > u8_strlen(t))
1249 				prec = u8_strlen(t);
1250 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1251 			int i, k, n;
1252 
1253 			if (ljust) { // print prec chars from t, then pad blanks
1254 				n = u8_char2byte(t, prec);
1255 				for (k = 0; k < n; k++) {
1256 					//putchar(t[k]);
1257 					*p++ = t[k];
1258 				}
1259 				for (i = 0; i < pad; i++) {
1260 					//printf(" ");
1261 					*p++ = ' ';
1262 				}
1263 			} else { // print pad blanks, then prec chars from t
1264 				for (i = 0; i < pad; i++) {
1265 					//printf(" ");
1266 					*p++ = ' ';
1267 				}
1268 				n = u8_char2byte(t, prec);
1269 				for (k = 0; k < n; k++) {
1270 					//putchar(t[k]);
1271 					*p++ = t[k];
1272 				}
1273 			}
1274 			*p = 0;
1275 			break;
1276 		}
1277 
1278                case 'c': {
1279 			/*
1280 			 * If a numeric value is given, awk should just turn
1281 			 * it into a character and print it:
1282 			 *      BEGIN { printf("%c\n", 65) }
1283 			 * prints "A".
1284 			 *
1285 			 * But what if the numeric value is > 128 and
1286 			 * represents a valid Unicode code point?!? We do
1287 			 * our best to convert it back into UTF-8. If we
1288 			 * can't, we output the encoding of the Unicode
1289 			 * "invalid character", 0xFFFD.
1290 			 */
1291 			if (isnum(x)) {
1292 				int charval = (int) getfval(x);
1293 
1294 				if (charval != 0) {
1295 					if (charval < 128 || awk_mb_cur_max == 1)
1296 						snprintf(p, BUFSZ(p), fmt, charval);
1297 					else {
1298 						// possible unicode character
1299 						size_t count;
1300 						char *bs = wide_char_to_byte_str(charval, &count);
1301 
1302 						if (bs == NULL)	{ // invalid character
1303 							// use unicode invalid character, 0xFFFD
1304 							static char invalid_char[] = "\357\277\275";
1305 							bs = invalid_char;
1306 							count = 3;
1307 						}
1308 						t = bs;
1309 						n = count;
1310 						goto format_percent_c;
1311 					}
1312 				} else {
1313 					*p++ = '\0'; /* explicit null byte */
1314 					*p = '\0';   /* next output will start here */
1315 				}
1316 				break;
1317 			}
1318 			t = getsval(x);
1319 			n = u8_nextlen(t);
1320 		format_percent_c:
1321 			if (n < 2) { /* not utf8 */
1322 				snprintf(p, BUFSZ(p), fmt, getsval(x)[0]);
1323 				break;
1324 			}
1325 
1326 			// utf8 character, almost same song and dance as for %s
1327 			int ljust = 0, wid = 0, prec = n, pad = 0;
1328 			char *f = fmt+1;
1329 			if (f[0] == '-') {
1330 				ljust = 1;
1331 				f++;
1332 			}
1333 			// flags '0' and '+' are recognized but skipped
1334 			if (f[0] == '0') {
1335 				f++;
1336 				if (f[0] == '+')
1337 					f++;
1338 			}
1339 			if (f[0] == '+') {
1340 				f++;
1341 				if (f[0] == '0')
1342 					f++;
1343 			}
1344 			if (isdigit((uschar)f[0])) { /* there is a wid */
1345 				wid = strtol(f, &f, 10);
1346 			}
1347 			if (f[0] == '.') { /* there is a .prec */
1348 				prec = strtol(++f, &f, 10);
1349 			}
1350 			if (prec > 1)           // %c --> only one character
1351 				prec = 1;
1352 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1353 			int i;
1354 
1355 			if (ljust) { // print one char from t, then pad blanks
1356 				for (i = 0; i < n; i++)
1357 					*p++ = t[i];
1358 				for (i = 0; i < pad; i++) {
1359 					//printf(" ");
1360 					*p++ = ' ';
1361 				}
1362 			} else { // print pad blanks, then prec chars from t
1363 				for (i = 0; i < pad; i++) {
1364 					//printf(" ");
1365 					*p++ = ' ';
1366 				}
1367 				for (i = 0; i < n; i++)
1368 					*p++ = t[i];
1369 			}
1370 			*p = 0;
1371 			break;
1372 		}
1373 		default:
1374 			FATAL("can't happen: bad conversion %c in format()", flag);
1375 		}
1376 
1377 		tempfree(x);
1378 		p += strlen(p);
1379 		s++;
1380 	}
1381 	*p = '\0';
1382 	free(fmt);
1383 	for ( ; a; a = a->nnext) {		/* evaluate any remaining args */
1384 		x = execute(a);
1385 		tempfree(x);
1386 	}
1387 	*pbuf = buf;
1388 	*pbufsize = bufsize;
1389 	return p - buf;
1390 }
1391 
1392 Cell *awksprintf(Node **a, int n)		/* sprintf(a[0]) */
1393 {
1394 	Cell *x;
1395 	Node *y;
1396 	char *buf;
1397 	int bufsz=3*recsize;
1398 
1399 	if ((buf = (char *) malloc(bufsz)) == NULL)
1400 		FATAL("out of memory in awksprintf");
1401 	y = a[0]->nnext;
1402 	x = execute(a[0]);
1403 	if (format(&buf, &bufsz, getsval(x), y) == -1)
1404 		FATAL("sprintf string %.30s... too long.  can't happen.", buf);
1405 	tempfree(x);
1406 	x = gettemp();
1407 	x->sval = buf;
1408 	x->tval = STR;
1409 	return(x);
1410 }
1411 
1412 Cell *awkprintf(Node **a, int n)		/* printf */
1413 {	/* a[0] is list of args, starting with format string */
1414 	/* a[1] is redirection operator, a[2] is redirection file */
1415 	FILE *fp;
1416 	Cell *x;
1417 	Node *y;
1418 	char *buf;
1419 	int len;
1420 	int bufsz=3*recsize;
1421 
1422 	if ((buf = (char *) malloc(bufsz)) == NULL)
1423 		FATAL("out of memory in awkprintf");
1424 	y = a[0]->nnext;
1425 	x = execute(a[0]);
1426 	if ((len = format(&buf, &bufsz, getsval(x), y)) == -1)
1427 		FATAL("printf string %.30s... too long.  can't happen.", buf);
1428 	tempfree(x);
1429 	if (a[1] == NULL) {
1430 		/* fputs(buf, stdout); */
1431 		fwrite(buf, len, 1, stdout);
1432 		if (ferror(stdout))
1433 			FATAL("write error on stdout");
1434 	} else {
1435 		fp = redirect(ptoi(a[1]), a[2]);
1436 		/* fputs(buf, fp); */
1437 		fwrite(buf, len, 1, fp);
1438 		fflush(fp);
1439 		if (ferror(fp))
1440 			FATAL("write error on %s", filename(fp));
1441 	}
1442 	free(buf);
1443 	return(True);
1444 }
1445 
1446 Cell *arith(Node **a, int n)	/* a[0] + a[1], etc.  also -a[0] */
1447 {
1448 	Awkfloat i, j = 0;
1449 	double v;
1450 	Cell *x, *y, *z;
1451 
1452 	x = execute(a[0]);
1453 	i = getfval(x);
1454 	tempfree(x);
1455 	if (n != UMINUS && n != UPLUS) {
1456 		y = execute(a[1]);
1457 		j = getfval(y);
1458 		tempfree(y);
1459 	}
1460 	z = gettemp();
1461 	switch (n) {
1462 	case ADD:
1463 		i += j;
1464 		break;
1465 	case MINUS:
1466 		i -= j;
1467 		break;
1468 	case MULT:
1469 		i *= j;
1470 		break;
1471 	case DIVIDE:
1472 		if (j == 0)
1473 			FATAL("division by zero");
1474 		i /= j;
1475 		break;
1476 	case MOD:
1477 		if (j == 0)
1478 			FATAL("division by zero in mod");
1479 		modf(i/j, &v);
1480 		i = i - j * v;
1481 		break;
1482 	case UMINUS:
1483 		i = -i;
1484 		break;
1485 	case UPLUS: /* handled by getfval(), above */
1486 		break;
1487 	case POWER:
1488 		if (j >= 0 && modf(j, &v) == 0.0)	/* pos integer exponent */
1489 			i = ipow(i, (int) j);
1490                else {
1491 			errno = 0;
1492 			i = errcheck(pow(i, j), "pow");
1493                }
1494 		break;
1495 	default:	/* can't happen */
1496 		FATAL("illegal arithmetic operator %d", n);
1497 	}
1498 	setfval(z, i);
1499 	return(z);
1500 }
1501 
/*
 * ipow: x**n by repeated squaring.  Ought to be done by pow, but isn't
 * always.  Any n <= 0 yields 1.
 */
double ipow(double x, int n)
{
	double half;

	if (n <= 0)
		return 1;
	half = ipow(x, n / 2);
	/* keep the left-to-right product order for odd n */
	return (n % 2 != 0) ? x * half * half : half * half;
}
1514 
1515 Cell *incrdecr(Node **a, int n)		/* a[0]++, etc. */
1516 {
1517 	Cell *x, *z;
1518 	int k;
1519 	Awkfloat xf;
1520 
1521 	x = execute(a[0]);
1522 	xf = getfval(x);
1523 	k = (n == PREINCR || n == POSTINCR) ? 1 : -1;
1524 	if (n == PREINCR || n == PREDECR) {
1525 		setfval(x, xf + k);
1526 		return(x);
1527 	}
1528 	z = gettemp();
1529 	setfval(z, xf);
1530 	setfval(x, xf + k);
1531 	tempfree(x);
1532 	return(z);
1533 }
1534 
1535 Cell *assign(Node **a, int n)	/* a[0] = a[1], a[0] += a[1], etc. */
1536 {		/* this is subtle; don't muck with it. */
1537 	Cell *x, *y;
1538 	Awkfloat xf, yf;
1539 	double v;
1540 
1541 	y = execute(a[1]);
1542 	x = execute(a[0]);
1543 	if (n == ASSIGN) {	/* ordinary assignment */
1544 		if (x == y && !(x->tval & (FLD|REC)) && x != nfloc)
1545 			;	/* self-assignment: leave alone unless it's a field or NF */
1546 		else if ((y->tval & (STR|NUM)) == (STR|NUM)) {
1547 			yf = getfval(y);
1548 			setsval(x, getsval(y));
1549 			x->fval = yf;
1550 			x->tval |= NUM;
1551 		}
1552 		else if (isstr(y))
1553 			setsval(x, getsval(y));
1554 		else if (isnum(y))
1555 			setfval(x, getfval(y));
1556 		else
1557 			funnyvar(y, "read value of");
1558 		tempfree(y);
1559 		return(x);
1560 	}
1561 	xf = getfval(x);
1562 	yf = getfval(y);
1563 	switch (n) {
1564 	case ADDEQ:
1565 		xf += yf;
1566 		break;
1567 	case SUBEQ:
1568 		xf -= yf;
1569 		break;
1570 	case MULTEQ:
1571 		xf *= yf;
1572 		break;
1573 	case DIVEQ:
1574 		if (yf == 0)
1575 			FATAL("division by zero in /=");
1576 		xf /= yf;
1577 		break;
1578 	case MODEQ:
1579 		if (yf == 0)
1580 			FATAL("division by zero in %%=");
1581 		modf(xf/yf, &v);
1582 		xf = xf - yf * v;
1583 		break;
1584 	case POWEQ:
1585 		if (yf >= 0 && modf(yf, &v) == 0.0)	/* pos integer exponent */
1586 			xf = ipow(xf, (int) yf);
1587                else {
1588 			errno = 0;
1589 			xf = errcheck(pow(xf, yf), "pow");
1590                }
1591 		break;
1592 	default:
1593 		FATAL("illegal assignment operator %d", n);
1594 		break;
1595 	}
1596 	tempfree(y);
1597 	setfval(x, xf);
1598 	return(x);
1599 }
1600 
1601 Cell *cat(Node **a, int q)	/* a[0] cat a[1] */
1602 {
1603 	Cell *x, *y, *z;
1604 	int n1, n2;
1605 	char *s = NULL;
1606 	int ssz = 0;
1607 
1608 	x = execute(a[0]);
1609 	n1 = strlen(getsval(x));
1610 	adjbuf(&s, &ssz, n1 + 1, recsize, 0, "cat1");
1611 	memcpy(s, x->sval, n1);
1612 
1613 	tempfree(x);
1614 
1615 	y = execute(a[1]);
1616 	n2 = strlen(getsval(y));
1617 	adjbuf(&s, &ssz, n1 + n2 + 1, recsize, 0, "cat2");
1618 	memcpy(s + n1, y->sval, n2);
1619 	s[n1 + n2] = '\0';
1620 
1621 	tempfree(y);
1622 
1623 	z = gettemp();
1624 	z->sval = s;
1625 	z->tval = STR;
1626 
1627 	return(z);
1628 }
1629 
1630 Cell *pastat(Node **a, int n)	/* a[0] { a[1] } */
1631 {
1632 	Cell *x;
1633 
1634 	if (a[0] == NULL)
1635 		x = execute(a[1]);
1636 	else {
1637 		x = execute(a[0]);
1638 		if (istrue(x)) {
1639 			tempfree(x);
1640 			x = execute(a[1]);
1641 		}
1642 	}
1643 	return x;
1644 }
1645 
1646 Cell *dopa2(Node **a, int n)	/* a[0], a[1] { a[2] } */
1647 {
1648 	Cell *x;
1649 	int pair;
1650 
1651 	pair = ptoi(a[3]);
1652 	if (pairstack[pair] == 0) {
1653 		x = execute(a[0]);
1654 		if (istrue(x))
1655 			pairstack[pair] = 1;
1656 		tempfree(x);
1657 	}
1658 	if (pairstack[pair] == 1) {
1659 		x = execute(a[1]);
1660 		if (istrue(x))
1661 			pairstack[pair] = 0;
1662 		tempfree(x);
1663 		x = execute(a[2]);
1664 		return(x);
1665 	}
1666 	return(False);
1667 }
1668 
1669 Cell *split(Node **a, int nnn)	/* split(a[0], a[1], a[2]); a[3] is type */
1670 {
1671 	Cell *x = NULL, *y, *ap;
1672 	const char *s, *origs, *t;
1673 	const char *fs = NULL;
1674 	char *origfs = NULL;
1675 	int sep;
1676 	char temp, num[50];
1677 	int j, n, tempstat, arg3type;
1678 	double result;
1679 
1680 	y = execute(a[0]);	/* source string */
1681 	origs = s = strdup(getsval(y));
1682 	if (s == NULL)
1683 		FATAL("out of space in split");
1684 	tempfree(y);
1685 	arg3type = ptoi(a[3]);
1686 	if (a[2] == NULL) {		/* BUG: CSV should override implicit fs but not explicit */
1687 		fs = getsval(fsloc);
1688 	} else if (arg3type == STRING) {	/* split(str,arr,"string") */
1689 		x = execute(a[2]);
1690 		fs = origfs = strdup(getsval(x));
1691 		if (fs == NULL)
1692 			FATAL("out of space in split");
1693 		tempfree(x);
1694 	} else if (arg3type == REGEXPR) {
1695 		fs = "(regexpr)";	/* split(str,arr,/regexpr/) */
1696 	} else {
1697 		FATAL("illegal type of split");
1698 	}
1699 	sep = *fs;
1700 	ap = execute(a[1]);	/* array name */
1701 	/* BUG 7/26/22: this appears not to reset array: see C1/asplit */
1702 	freesymtab(ap);
1703 	DPRINTF("split: s=|%s|, a=%s, sep=|%s|\n", s, NN(ap->nval), fs);
1704 	ap->tval &= ~STR;
1705 	ap->tval |= ARR;
1706 	ap->sval = (char *) makesymtab(NSYMTAB);
1707 
1708 	n = 0;
1709         if (arg3type == REGEXPR && strlen((char*)((fa*)a[2])->restr) == 0) {
1710 		/* split(s, a, //); have to arrange that it looks like empty sep */
1711 		arg3type = 0;
1712 		fs = "";
1713 		sep = 0;
1714 	}
1715 	if (*s != '\0' && (strlen(fs) > 1 || arg3type == REGEXPR)) {	/* reg expr */
1716 		fa *pfa;
1717 		if (arg3type == REGEXPR) {	/* it's ready already */
1718 			pfa = (fa *) a[2];
1719 		} else {
1720 			pfa = makedfa(fs, 1);
1721 		}
1722 		if (nematch(pfa,s)) {
1723 			tempstat = pfa->initstat;
1724 			pfa->initstat = 2;
1725 			do {
1726 				n++;
1727 				snprintf(num, sizeof(num), "%d", n);
1728 				temp = *patbeg;
1729 				setptr(patbeg, '\0');
1730 				if (is_number(s, & result))
1731 					setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1732 				else
1733 					setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1734 				setptr(patbeg, temp);
1735 				s = patbeg + patlen;
1736 				if (*(patbeg+patlen-1) == '\0' || *s == '\0') {
1737 					n++;
1738 					snprintf(num, sizeof(num), "%d", n);
1739 					setsymtab(num, "", 0.0, STR, (Array *) ap->sval);
1740 					pfa->initstat = tempstat;
1741 					goto spdone;
1742 				}
1743 			} while (nematch(pfa,s));
1744 			pfa->initstat = tempstat; 	/* bwk: has to be here to reset */
1745 							/* cf gsub and refldbld */
1746 		}
1747 		n++;
1748 		snprintf(num, sizeof(num), "%d", n);
1749 		if (is_number(s, & result))
1750 			setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1751 		else
1752 			setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1753   spdone:
1754 		pfa = NULL;
1755 
1756 	} else if (a[2] == NULL && CSV) {	/* CSV only if no explicit separator */
1757 		char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */
1758 		for (;;) {
1759 			char *fr = newt;
1760 			n++;
1761 			if (*s == '"' ) { /* start of "..." */
1762 				for (s++ ; *s != '\0'; ) {
1763 					if (*s == '"' && s[1] != '\0' && s[1] == '"') {
1764 						s += 2; /* doubled quote */
1765 						*fr++ = '"';
1766 					} else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) {
1767 						s++; /* skip over closing quote */
1768 						break;
1769 					} else {
1770 						*fr++ = *s++;
1771 					}
1772 				}
1773 				*fr++ = 0;
1774 			} else {	/* unquoted field */
1775 				while (*s != ',' && *s != '\0')
1776 					*fr++ = *s++;
1777 				*fr++ = 0;
1778 			}
1779 			snprintf(num, sizeof(num), "%d", n);
1780 			if (is_number(newt, &result))
1781 				setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval);
1782 			else
1783 				setsymtab(num, newt, 0.0, STR, (Array *) ap->sval);
1784 			if (*s++ == '\0')
1785 				break;
1786 		}
1787 		free(newt);
1788 
1789 	} else if (!CSV && sep == ' ') { /* usual case: split on white space */
1790 		for (n = 0; ; ) {
1791 #define ISWS(c)	((c) == ' ' || (c) == '\t' || (c) == '\n')
1792 			while (ISWS(*s))
1793 				s++;
1794 			if (*s == '\0')
1795 				break;
1796 			n++;
1797 			t = s;
1798 			do
1799 				s++;
1800 			while (*s != '\0' && !ISWS(*s));
1801 			temp = *s;
1802 			setptr(s, '\0');
1803 			snprintf(num, sizeof(num), "%d", n);
1804 			if (is_number(t, & result))
1805 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1806 			else
1807 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1808 			setptr(s, temp);
1809 			if (*s != '\0')
1810 				s++;
1811 		}
1812 
1813 	} else if (sep == 0) {	/* new: split(s, a, "") => 1 char/elem */
1814 		for (n = 0; *s != '\0'; s += u8_nextlen(s)) {
1815 			char buf[10];
1816 			n++;
1817 			snprintf(num, sizeof(num), "%d", n);
1818 
1819 			for (j = 0; j < u8_nextlen(s); j++) {
1820 				buf[j] = s[j];
1821 			}
1822 			buf[j] = '\0';
1823 
1824 			if (isdigit((uschar)buf[0]))
1825 				setsymtab(num, buf, atof(buf), STR|NUM, (Array *) ap->sval);
1826 			else
1827 				setsymtab(num, buf, 0.0, STR, (Array *) ap->sval);
1828 		}
1829 
1830 	} else if (*s != '\0') {  /* some random single character */
1831 		for (;;) {
1832 			n++;
1833 			t = s;
1834 			while (*s != sep && *s != '\n' && *s != '\0')
1835 				s++;
1836 			temp = *s;
1837 			setptr(s, '\0');
1838 			snprintf(num, sizeof(num), "%d", n);
1839 			if (is_number(t, & result))
1840 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1841 			else
1842 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1843 			setptr(s, temp);
1844 			if (*s++ == '\0')
1845 				break;
1846 		}
1847 	}
1848 	tempfree(ap);
1849 	xfree(origs);
1850 	xfree(origfs);
1851 	x = gettemp();
1852 	x->tval = NUM;
1853 	x->fval = n;
1854 	return(x);
1855 }
1856 
1857 Cell *condexpr(Node **a, int n)	/* a[0] ? a[1] : a[2] */
1858 {
1859 	Cell *x;
1860 
1861 	x = execute(a[0]);
1862 	if (istrue(x)) {
1863 		tempfree(x);
1864 		x = execute(a[1]);
1865 	} else {
1866 		tempfree(x);
1867 		x = execute(a[2]);
1868 	}
1869 	return(x);
1870 }
1871 
1872 Cell *ifstat(Node **a, int n)	/* if (a[0]) a[1]; else a[2] */
1873 {
1874 	Cell *x;
1875 
1876 	x = execute(a[0]);
1877 	if (istrue(x)) {
1878 		tempfree(x);
1879 		x = execute(a[1]);
1880 	} else if (a[2] != NULL) {
1881 		tempfree(x);
1882 		x = execute(a[2]);
1883 	}
1884 	return(x);
1885 }
1886 
1887 Cell *whilestat(Node **a, int n)	/* while (a[0]) a[1] */
1888 {
1889 	Cell *x;
1890 
1891 	for (;;) {
1892 		x = execute(a[0]);
1893 		if (!istrue(x))
1894 			return(x);
1895 		tempfree(x);
1896 		x = execute(a[1]);
1897 		if (isbreak(x)) {
1898 			x = True;
1899 			return(x);
1900 		}
1901 		if (isnext(x) || isexit(x) || isret(x))
1902 			return(x);
1903 		tempfree(x);
1904 	}
1905 }
1906 
1907 Cell *dostat(Node **a, int n)	/* do a[0]; while(a[1]) */
1908 {
1909 	Cell *x;
1910 
1911 	for (;;) {
1912 		x = execute(a[0]);
1913 		if (isbreak(x))
1914 			return True;
1915 		if (isnext(x) || isexit(x) || isret(x))
1916 			return(x);
1917 		tempfree(x);
1918 		x = execute(a[1]);
1919 		if (!istrue(x))
1920 			return(x);
1921 		tempfree(x);
1922 	}
1923 }
1924 
1925 Cell *forstat(Node **a, int n)	/* for (a[0]; a[1]; a[2]) a[3] */
1926 {
1927 	Cell *x;
1928 
1929 	x = execute(a[0]);
1930 	tempfree(x);
1931 	for (;;) {
1932 		if (a[1]!=NULL) {
1933 			x = execute(a[1]);
1934 			if (!istrue(x)) return(x);
1935 			else tempfree(x);
1936 		}
1937 		x = execute(a[3]);
1938 		if (isbreak(x))		/* turn off break */
1939 			return True;
1940 		if (isnext(x) || isexit(x) || isret(x))
1941 			return(x);
1942 		tempfree(x);
1943 		x = execute(a[2]);
1944 		tempfree(x);
1945 	}
1946 }
1947 
1948 Cell *instat(Node **a, int n)	/* for (a[0] in a[1]) a[2] */
1949 {
1950 	Cell *x, *vp, *arrayp, *cp, *ncp;
1951 	Array *tp;
1952 	int i;
1953 
1954 	vp = execute(a[0]);
1955 	arrayp = execute(a[1]);
1956 	if (!isarr(arrayp)) {
1957 		return True;
1958 	}
1959 	tp = (Array *) arrayp->sval;
1960 	tempfree(arrayp);
1961 	for (i = 0; i < tp->size; i++) {	/* this routine knows too much */
1962 		for (cp = tp->tab[i]; cp != NULL; cp = ncp) {
1963 			setsval(vp, cp->nval);
1964 			ncp = cp->cnext;
1965 			x = execute(a[2]);
1966 			if (isbreak(x)) {
1967 				tempfree(vp);
1968 				return True;
1969 			}
1970 			if (isnext(x) || isexit(x) || isret(x)) {
1971 				tempfree(vp);
1972 				return(x);
1973 			}
1974 			tempfree(x);
1975 		}
1976 	}
1977 	return True;
1978 }
1979 
1980 static char *nawk_convert(const char *s, int (*fun_c)(int),
1981     wint_t (*fun_wc)(wint_t))
1982 {
1983 	char *buf      = NULL;
1984 	char *pbuf     = NULL;
1985 	const char *ps = NULL;
1986 	size_t n       = 0;
1987 	wchar_t wc;
1988 	const size_t sz = awk_mb_cur_max;
1989 	int unused;
1990 
1991 	if (sz == 1) {
1992 		buf = tostring(s);
1993 
1994 		for (pbuf = buf; *pbuf; pbuf++)
1995 			*pbuf = fun_c((uschar)*pbuf);
1996 
1997 		return buf;
1998 	} else {
1999 		/* upper/lower character may be shorter/longer */
2000 		buf = tostringN(s, strlen(s) * sz + 1);
2001 
2002 		(void) mbtowc(NULL, NULL, 0);	/* reset internal state */
2003 		/*
2004 		 * Reset internal state here too.
2005 		 * Assign result to avoid a compiler warning. (Casting to void
2006 		 * doesn't work.)
2007 		 * Increment said variable to avoid a different warning.
2008 		 */
2009 		unused = wctomb(NULL, L'\0');
2010 		unused++;
2011 
2012 		ps   = s;
2013 		pbuf = buf;
2014 		while (n = mbtowc(&wc, ps, sz),
2015 		       n > 0 && n != (size_t)-1 && n != (size_t)-2)
2016 		{
2017 			ps += n;
2018 
2019 			n = wctomb(pbuf, fun_wc(wc));
2020 			if (n == (size_t)-1)
2021 				FATAL("illegal wide character %s", s);
2022 
2023 			pbuf += n;
2024 		}
2025 
2026 		*pbuf = '\0';
2027 
2028 		if (n)
2029 			FATAL("illegal byte sequence %s", s);
2030 
2031 		return buf;
2032 	}
2033 }
2034 
#ifdef __DJGPP__
/*
 * Replacements for towupper/towlower, compiled only under DJGPP: values
 * in the range 0..255 go through toupper/tolower, anything else is
 * returned unchanged.
 */
static wint_t towupper(wint_t wc)
{
	if (!(wc >= 0 && wc < 256))
		return wc;
	return toupper(wc & 0xFF);
}

static wint_t towlower(wint_t wc)
{
	if (!(wc >= 0 && wc < 256))
		return wc;
	return tolower(wc & 0xFF);
}
#endif
2052 
/* nawk_toupper: upper-case copy of s, produced by nawk_convert(). */
static char *nawk_toupper(const char *s)
{
	char *upper = nawk_convert(s, toupper, towupper);

	return upper;
}
2057 
/* nawk_tolower: lower-case copy of s, produced by nawk_convert(). */
static char *nawk_tolower(const char *s)
{
	char *lower = nawk_convert(s, tolower, towlower);

	return lower;
}
2062 
2063 Cell *bltin(Node **a, int n)	/* builtin functions. a[0] is type, a[1] is arg list */
2064 {
2065 	Cell *x, *y;
2066 	Awkfloat u;
2067 	int t, sz;
2068 	Awkfloat tmp;
2069 	char *buf, *fmt;
2070 	Node *nextarg;
2071 	FILE *fp;
2072 	int status = 0;
2073 	time_t tv;
2074 	struct tm *tm, tmbuf;
2075 	int estatus = 0;
2076 
2077 	t = ptoi(a[0]);
2078 	x = execute(a[1]);
2079 	nextarg = a[1]->nnext;
2080 	switch (t) {
2081 	case FLENGTH:
2082 		if (isarr(x))
2083 			u = ((Array *) x->sval)->nelem;	/* GROT.  should be function*/
2084 		else
2085 			u = u8_strlen(getsval(x));
2086 		break;
2087 	case FLOG:
2088 		errno = 0;
2089 		u = errcheck(log(getfval(x)), "log");
2090 		break;
2091 	case FINT:
2092 		modf(getfval(x), &u); break;
2093 	case FEXP:
2094 		errno = 0;
2095 		u = errcheck(exp(getfval(x)), "exp");
2096 		break;
2097 	case FSQRT:
2098 		errno = 0;
2099 		u = errcheck(sqrt(getfval(x)), "sqrt");
2100 		break;
2101 	case FSIN:
2102 		u = sin(getfval(x)); break;
2103 	case FCOS:
2104 		u = cos(getfval(x)); break;
2105 	case FATAN:
2106 		if (nextarg == NULL) {
2107 			WARNING("atan2 requires two arguments; returning 1.0");
2108 			u = 1.0;
2109 		} else {
2110 			y = execute(a[1]->nnext);
2111 			u = atan2(getfval(x), getfval(y));
2112 			tempfree(y);
2113 			nextarg = nextarg->nnext;
2114 		}
2115 		break;
2116 	case FCOMPL:
2117 		u = ~((int)getfval(x));
2118 		break;
2119 	case FAND:
2120 		if (nextarg == 0) {
2121 			WARNING("and requires two arguments; returning 0");
2122 			u = 0;
2123 			break;
2124 		}
2125 		y = execute(a[1]->nnext);
2126 		u = ((int)getfval(x)) & ((int)getfval(y));
2127 		tempfree(y);
2128 		nextarg = nextarg->nnext;
2129 		break;
2130 	case FFOR:
2131 		if (nextarg == 0) {
2132 			WARNING("or requires two arguments; returning 0");
2133 			u = 0;
2134 			break;
2135 		}
2136 		y = execute(a[1]->nnext);
2137 		u = ((int)getfval(x)) | ((int)getfval(y));
2138 		tempfree(y);
2139 		nextarg = nextarg->nnext;
2140 		break;
2141 	case FXOR:
2142 		if (nextarg == 0) {
2143 			WARNING("xor requires two arguments; returning 0");
2144 			u = 0;
2145 			break;
2146 		}
2147 		y = execute(a[1]->nnext);
2148 		u = ((int)getfval(x)) ^ ((int)getfval(y));
2149 		tempfree(y);
2150 		nextarg = nextarg->nnext;
2151 		break;
2152 	case FLSHIFT:
2153 		if (nextarg == 0) {
2154 			WARNING("lshift requires two arguments; returning 0");
2155 			u = 0;
2156 			break;
2157 		}
2158 		y = execute(a[1]->nnext);
2159 		u = ((int)getfval(x)) << ((int)getfval(y));
2160 		tempfree(y);
2161 		nextarg = nextarg->nnext;
2162 		break;
2163 	case FRSHIFT:
2164 		if (nextarg == 0) {
2165 			WARNING("rshift requires two arguments; returning 0");
2166 			u = 0;
2167 			break;
2168 		}
2169 		y = execute(a[1]->nnext);
2170 		u = ((int)getfval(x)) >> ((int)getfval(y));
2171 		tempfree(y);
2172 		nextarg = nextarg->nnext;
2173 		break;
2174 	case FSYSTEM:
2175 		fflush(stdout);		/* in case something is buffered already */
2176 		estatus = status = system(getsval(x));
2177 		if (status != -1) {
2178 			if (WIFEXITED(status)) {
2179 				estatus = WEXITSTATUS(status);
2180 			} else if (WIFSIGNALED(status)) {
2181 				estatus = WTERMSIG(status) + 256;
2182 #ifdef WCOREDUMP
2183 				if (WCOREDUMP(status))
2184 					estatus += 256;
2185 #endif
2186 			} else	/* something else?!? */
2187 				estatus = 0;
2188 		}
2189 		/* else estatus was set to -1 */
2190 		u = estatus;
2191 		break;
2192 	case FRAND:
2193 		/* random() returns numbers in [0..2^31-1]
2194 		 * in order to get a number in [0, 1), divide it by 2^31
2195 		 */
2196 		u = (Awkfloat) random() / (0x7fffffffL + 0x1UL);
2197 		break;
2198 	case FSRAND:
2199 		if (isrec(x)) {		/* no argument provided */
2200 			u = time(NULL);
2201 			tmp = u;
2202 			srandom((unsigned int) u);
2203 		} else {
2204 			u = getfval(x);
2205 			tmp = u;
2206 			srandom_deterministic((unsigned int) u);
2207 		}
2208 		u = srand_seed;
2209 		srand_seed = tmp;
2210 		break;
2211 	case FTOUPPER:
2212 	case FTOLOWER:
2213 		if (t == FTOUPPER)
2214 			buf = nawk_toupper(getsval(x));
2215 		else
2216 			buf = nawk_tolower(getsval(x));
2217 		tempfree(x);
2218 		x = gettemp();
2219 		setsval(x, buf);
2220 		free(buf);
2221 		return x;
2222 	case FFLUSH:
2223 		if (isrec(x) || strlen(getsval(x)) == 0) {
2224 			flush_all();	/* fflush() or fflush("") -> all */
2225 			u = 0;
2226 		} else if ((fp = openfile(FFLUSH, getsval(x), NULL)) == NULL)
2227 			u = EOF;
2228 		else
2229 			u = fflush(fp);
2230 		break;
2231 	case FMKTIME:
2232 		memset(&tmbuf, 0, sizeof(tmbuf));
2233 		tm = &tmbuf;
2234 		t = sscanf(getsval(x), "%d %d %d %d %d %d %d",
2235 		    &tm->tm_year, &tm->tm_mon, &tm->tm_mday, &tm->tm_hour,
2236 		    &tm->tm_min, &tm->tm_sec, &tm->tm_isdst);
2237 		switch (t) {
2238 		case 6:
2239 			tm->tm_isdst = -1;	/* let mktime figure it out */
2240 			/* FALLTHROUGH */
2241 		case 7:
2242 			tm->tm_year -= 1900;
2243 			tm->tm_mon--;
2244 			u = mktime(tm);
2245 			break;
2246 		default:
2247 			u = -1;
2248 			break;
2249 		}
2250 		break;
2251 	case FSYSTIME:
2252 		u = time((time_t *) 0);
2253 		break;
2254 	case FSTRFTIME:
2255 		/* strftime([format [,timestamp]]) */
2256 		if (nextarg) {
2257 			y = execute(nextarg);
2258 			nextarg = nextarg->nnext;
2259 			tv = (time_t) getfval(y);
2260 			tempfree(y);
2261 		} else
2262 			tv = time((time_t *) 0);
2263 		tm = localtime(&tv);
2264 		if (tm == NULL)
2265 			FATAL("bad time %ld", (long)tv);
2266 
2267 		if (isrec(x)) {
2268 			/* format argument not provided, use default */
2269 			fmt = tostring("%a %b %d %H:%M:%S %Z %Y");
2270 		} else
2271 			fmt = tostring(getsval(x));
2272 
2273 		sz = 32;
2274 		buf = NULL;
2275 		do {
2276 			if ((buf = (char *) reallocarray(buf, 2, sz)) == NULL)
2277 				FATAL("out of memory in strftime");
2278 			sz *= 2;
2279 		} while (strftime(buf, sz, fmt, tm) == 0 && fmt[0] != '\0');
2280 
2281 		y = gettemp();
2282 		setsval(y, buf);
2283 		free(fmt);
2284 		free(buf);
2285 
2286 		return y;
2287 	default:	/* can't happen */
2288 		FATAL("illegal function type %d", t);
2289 		break;
2290 	}
2291 	tempfree(x);
2292 	x = gettemp();
2293 	setfval(x, u);
2294 	if (nextarg != NULL) {
2295 		WARNING("warning: function has too many arguments");
2296 		for ( ; nextarg; nextarg = nextarg->nnext) {
2297 			y = execute(nextarg);
2298 			tempfree(y);
2299 		}
2300 	}
2301 	return(x);
2302 }
2303 
2304 Cell *printstat(Node **a, int n)	/* print a[0] */
2305 {
2306 	Node *x;
2307 	Cell *y;
2308 	FILE *fp;
2309 
2310 	if (a[1] == NULL)	/* a[1] is redirection operator, a[2] is file */
2311 		fp = stdout;
2312 	else
2313 		fp = redirect(ptoi(a[1]), a[2]);
2314 	for (x = a[0]; x != NULL; x = x->nnext) {
2315 		y = execute(x);
2316 		fputs(getpssval(y), fp);
2317 		tempfree(y);
2318 		if (x->nnext == NULL)
2319 			fputs(getsval(orsloc), fp);
2320 		else
2321 			fputs(getsval(ofsloc), fp);
2322 	}
2323 	if (a[1] != NULL)
2324 		fflush(fp);
2325 	if (ferror(fp))
2326 		FATAL("write error on %s", filename(fp));
2327 	return(True);
2328 }
2329 
2330 Cell *nullproc(Node **a, int n)
2331 {
2332 	return 0;
2333 }
2334 
2335 
2336 FILE *redirect(int a, Node *b)	/* set up all i/o redirections */
2337 {
2338 	FILE *fp;
2339 	Cell *x;
2340 	char *fname;
2341 
2342 	x = execute(b);
2343 	fname = getsval(x);
2344 	fp = openfile(a, fname, NULL);
2345 	if (fp == NULL)
2346 		FATAL("can't open file %s", fname);
2347 	tempfree(x);
2348 	return fp;
2349 }
2350 
/* One slot in the table of streams opened by redirections and getline. */
struct files {
	FILE		*fp;	/* open stream; NULL marks a free slot */
	const char	*fname;	/* name the stream was opened under, or NULL */
	int		mode;	/* '|', 'a', 'w' => LE/LT, GT */
};

struct files *files;	/* the table itself; allocated by stdinit() */

size_t nfiles;		/* number of slots currently allocated in files[] */
2358 
2359 static void stdinit(void)	/* in case stdin, etc., are not constants */
2360 {
2361 	nfiles = FOPEN_MAX;
2362 	files = (struct files *) calloc(nfiles, sizeof(*files));
2363 	if (files == NULL)
2364 		FATAL("can't allocate file memory for %zu files", nfiles);
2365         files[0].fp = stdin;
2366 	files[0].fname = tostring("/dev/stdin");
2367 	files[0].mode = LT;
2368         files[1].fp = stdout;
2369 	files[1].fname = tostring("/dev/stdout");
2370 	files[1].mode = GT;
2371         files[2].fp = stderr;
2372 	files[2].fname = tostring("/dev/stderr");
2373 	files[2].mode = GT;
2374 }
2375 
2376 FILE *openfile(int a, const char *us, bool *pnewflag)
2377 {
2378 	const char *s = us;
2379 	size_t i;
2380 	int m;
2381 	FILE *fp = NULL;
2382 
2383 	if (*s == '\0')
2384 		FATAL("null file name in print or getline");
2385 	for (i = 0; i < nfiles; i++)
2386 		if (files[i].fname && strcmp(s, files[i].fname) == 0 &&
2387 		    (a == files[i].mode || (a==APPEND && files[i].mode==GT) ||
2388 		     a == FFLUSH)) {
2389 			if (pnewflag)
2390 				*pnewflag = false;
2391 			return files[i].fp;
2392 		}
2393 	if (a == FFLUSH)	/* didn't find it, so don't create it! */
2394 		return NULL;
2395 
2396 	for (i = 0; i < nfiles; i++)
2397 		if (files[i].fp == NULL)
2398 			break;
2399 	if (i >= nfiles) {
2400 		struct files *nf;
2401 		size_t nnf = nfiles + FOPEN_MAX;
2402 		nf = (struct files *) reallocarray(files, nnf, sizeof(*nf));
2403 		if (nf == NULL)
2404 			FATAL("cannot grow files for %s and %zu files", s, nnf);
2405 		memset(&nf[nfiles], 0, FOPEN_MAX * sizeof(*nf));
2406 		nfiles = nnf;
2407 		files = nf;
2408 	}
2409 	fflush(stdout);	/* force a semblance of order */
2410 	m = a;
2411 	if (a == GT) {
2412 		fp = fopen(s, "w");
2413 	} else if (a == APPEND) {
2414 		fp = fopen(s, "a");
2415 		m = GT;	/* so can mix > and >> */
2416 	} else if (a == '|') {	/* output pipe */
2417 		fp = popen(s, "w");
2418 	} else if (a == LE) {	/* input pipe */
2419 		fp = popen(s, "r");
2420 	} else if (a == LT) {	/* getline <file */
2421 		fp = strcmp(s, "-") == 0 ? stdin : fopen(s, "r");	/* "-" is stdin */
2422 	} else	/* can't happen */
2423 		FATAL("illegal redirection %d", a);
2424 	if (fp != NULL) {
2425 		files[i].fname = tostring(s);
2426 		files[i].fp = fp;
2427 		files[i].mode = m;
2428 		if (pnewflag)
2429 			*pnewflag = true;
2430 		if (fp != stdin && fp != stdout && fp != stderr)
2431 			(void) fcntl(fileno(fp), F_SETFD, FD_CLOEXEC);
2432 	}
2433 	return fp;
2434 }
2435 
2436 const char *filename(FILE *fp)
2437 {
2438 	size_t i;
2439 
2440 	for (i = 0; i < nfiles; i++)
2441 		if (fp == files[i].fp)
2442 			return files[i].fname;
2443 	return "???";
2444 }
2445 
2446 Cell *closefile(Node **a, int n)
2447 {
2448  	Cell *x;
2449 	size_t i;
2450 	bool stat;
2451 
2452  	x = execute(a[0]);
2453  	getsval(x);
2454 	stat = true;
2455  	for (i = 0; i < nfiles; i++) {
2456 		if (!files[i].fname || strcmp(x->sval, files[i].fname) != 0)
2457 			continue;
2458 		if (files[i].mode == GT || files[i].mode == '|')
2459 			fflush(files[i].fp);
2460 		if (ferror(files[i].fp)) {
2461 			if ((files[i].mode == GT && files[i].fp != stderr)
2462 			  || files[i].mode == '|')
2463 				FATAL("write error on %s", files[i].fname);
2464 			else
2465 				WARNING("i/o error occurred on %s", files[i].fname);
2466 		}
2467 		if (files[i].fp == stdin || files[i].fp == stdout ||
2468 		    files[i].fp == stderr)
2469 			stat = freopen("/dev/null", "r+", files[i].fp) == NULL;
2470 		else if (files[i].mode == '|' || files[i].mode == LE)
2471 			stat = pclose(files[i].fp) == -1;
2472 		else
2473 			stat = fclose(files[i].fp) == EOF;
2474 		if (stat)
2475 			WARNING("i/o error occurred closing %s", files[i].fname);
2476 		xfree(files[i].fname);
2477 		files[i].fname = NULL;	/* watch out for ref thru this */
2478 		files[i].fp = NULL;
2479 		break;
2480  	}
2481  	tempfree(x);
2482  	x = gettemp();
2483 	setfval(x, (Awkfloat) (stat ? -1 : 0));
2484  	return(x);
2485 }
2486 
2487 void closeall(void)
2488 {
2489 	size_t i;
2490 	bool stat = false;
2491 
2492 	for (i = 0; i < nfiles; i++) {
2493 		if (! files[i].fp)
2494 			continue;
2495 		if (files[i].mode == GT || files[i].mode == '|')
2496 			fflush(files[i].fp);
2497 		if (ferror(files[i].fp)) {
2498 			if ((files[i].mode == GT && files[i].fp != stderr)
2499 			  || files[i].mode == '|')
2500 				FATAL("write error on %s", files[i].fname);
2501 			else
2502 				WARNING("i/o error occurred on %s", files[i].fname);
2503 		}
2504 		if (files[i].fp == stdin || files[i].fp == stdout ||
2505 		    files[i].fp == stderr)
2506 			continue;
2507 		if (files[i].mode == '|' || files[i].mode == LE)
2508 			stat = pclose(files[i].fp) == -1;
2509 		else
2510 			stat = fclose(files[i].fp) == EOF;
2511 		if (stat)
2512 			WARNING("i/o error occurred while closing %s", files[i].fname);
2513 	}
2514 }
2515 
2516 static void flush_all(void)
2517 {
2518 	size_t i;
2519 
2520 	for (i = 0; i < nfiles; i++)
2521 		if (files[i].fp)
2522 			fflush(files[i].fp);
2523 }
2524 
2525 void backsub(char **pb_ptr, const char **sptr_ptr);
2526 
2527 Cell *dosub(Node **a, int subop)        /* sub and gsub */
2528 {
2529 	fa *pfa;
2530 	int tempstat;
2531 	char *repl;
2532 	Cell *x;
2533 
2534 	char *buf = NULL;
2535 	char *pb = NULL;
2536 	int bufsz = recsize;
2537 
2538 	const char *r, *s;
2539 	const char *start;
2540 	const char *noempty = NULL;      /* empty match disallowed here */
2541 	size_t m = 0;                    /* match count */
2542 	size_t whichm;                   /* which match to select, 0 = global */
2543 	int mtype;                       /* match type */
2544 
2545 	if (a[0] == NULL) {	/* 0 => a[1] is already-compiled regexpr */
2546 		pfa = (fa *) a[1];
2547 	} else {
2548 		x = execute(a[1]);
2549 		pfa = makedfa(getsval(x), 1);
2550 		tempfree(x);
2551 	}
2552 
2553 	x = execute(a[2]);	/* replacement string */
2554 	repl = tostring(getsval(x));
2555 	tempfree(x);
2556 
2557 	switch (subop) {
2558 	case SUB:
2559 		whichm = 1;
2560 		x = execute(a[3]);    /* source string */
2561 		break;
2562 	case GSUB:
2563 		whichm = 0;
2564 		x = execute(a[3]);    /* source string */
2565 		break;
2566 	default:
2567 		FATAL("dosub: unrecognized subop: %d", subop);
2568 	}
2569 
2570 	start = getsval(x);
2571 	while (pmatch(pfa, start)) {
2572 		if (buf == NULL) {
2573 			if ((pb = buf = (char *) malloc(bufsz)) == NULL)
2574 				FATAL("out of memory in dosub");
2575 			tempstat = pfa->initstat;
2576 			pfa->initstat = 2;
2577 		}
2578 
2579 		/* match types */
2580 		#define	MT_IGNORE  0  /* unselected or invalid */
2581 		#define MT_INSERT  1  /* selected, empty */
2582 		#define MT_REPLACE 2  /* selected, not empty */
2583 
2584 		/* an empty match just after replacement is invalid */
2585 
2586 		if (patbeg == noempty && patlen == 0) {
2587 			mtype = MT_IGNORE;    /* invalid, not counted */
2588 		} else if (whichm == ++m || whichm == 0) {
2589 			mtype = patlen ? MT_REPLACE : MT_INSERT;
2590 		} else {
2591 			mtype = MT_IGNORE;    /* unselected, but counted */
2592 		}
2593 
2594 		/* leading text: */
2595 		if (patbeg > start) {
2596 			adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - start),
2597 				recsize, &pb, "dosub");
2598 			s = start;
2599 			while (s < patbeg)
2600 				*pb++ = *s++;
2601 		}
2602 
2603 		if (mtype == MT_IGNORE)
2604 			goto matching_text;  /* skip replacement text */
2605 
2606 		r = repl;
2607 		while (*r != 0) {
2608 			adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "dosub");
2609 			if (*r == '\\') {
2610 				backsub(&pb, &r);
2611 			} else if (*r == '&') {
2612 				r++;
2613 				adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize,
2614 					&pb, "dosub");
2615 				for (s = patbeg; s < patbeg+patlen; )
2616 					*pb++ = *s++;
2617 			} else {
2618 				*pb++ = *r++;
2619 			}
2620 		}
2621 
2622 matching_text:
2623 		if (mtype == MT_REPLACE || *patbeg == '\0')
2624 			goto next_search;  /* skip matching text */
2625 
2626 		if (patlen == 0)
2627 			patlen = u8_nextlen(patbeg);
2628 		adjbuf(&buf, &bufsz, (pb-buf) + patlen, recsize, &pb, "dosub");
2629 		s = patbeg;
2630 		while (s < patbeg + patlen)
2631 			*pb++ = *s++;
2632 
2633 next_search:
2634 		start = patbeg + patlen;
2635 		if (m == whichm || *patbeg == '\0')
2636 			break;
2637 		if (mtype == MT_REPLACE)
2638 			noempty = start;
2639 
2640 		#undef MT_IGNORE
2641 		#undef MT_INSERT
2642 		#undef MT_REPLACE
2643 	}
2644 
2645 	xfree(repl);
2646 
2647 	if (buf != NULL) {
2648 		pfa->initstat = tempstat;
2649 
2650 		/* trailing text */
2651 		adjbuf(&buf, &bufsz, 1+strlen(start)+pb-buf, 0, &pb, "dosub");
2652 		while ((*pb++ = *start++) != '\0')
2653 			;
2654 
2655 		setsval(x, buf);
2656 		free(buf);
2657 	}
2658 
2659 	tempfree(x);
2660 	x = gettemp();
2661 	x->tval = NUM;
2662 	x->fval = m;
2663 	return x;
2664 }
2665 
2666 Cell *gensub(Node **a, int nnn)	/* global selective substitute */
2667 	/* XXX incomplete - doesn't support backreferences \0 ... \9 */
2668 {
2669 	Cell *x, *y, *res, *h;
2670 	char *rptr;
2671 	const char *sptr;
2672 	char *buf, *pb;
2673 	const char *t, *q;
2674 	fa *pfa;
2675 	int mflag, tempstat, num, whichm;
2676 	int bufsz = recsize;
2677 
2678 	if ((buf = (char *) malloc(bufsz)) == NULL)
2679 		FATAL("out of memory in gensub");
2680 	mflag = 0;	/* if mflag == 0, can replace empty string */
2681 	num = 0;
2682 	x = execute(a[4]);	/* source string */
2683 	t = getsval(x);
2684 	res = copycell(x);	/* target string - initially copy of source */
2685 	res->csub = CTEMP;	/* result values are temporary */
2686 	if (a[0] == 0)		/* 0 => a[1] is already-compiled regexpr */
2687 		pfa = (fa *) a[1];	/* regular expression */
2688 	else {
2689 		y = execute(a[1]);
2690 		pfa = makedfa(getsval(y), 1);
2691 		tempfree(y);
2692 	}
2693 	y = execute(a[2]);	/* replacement string */
2694 	h = execute(a[3]);	/* which matches should be replaced */
2695 	sptr = getsval(h);
2696 	if (sptr[0] == 'g' || sptr[0] == 'G')
2697 		whichm = -1;
2698 	else {
2699 		/*
2700 		 * The specified number is index of replacement, starting
2701 		 * from 1. GNU awk treats index lower than 0 same as
2702 		 * 1, we do same for compatibility.
2703 		 */
2704 		whichm = (int) getfval(h) - 1;
2705 		if (whichm < 0)
2706 			whichm = 0;
2707 	}
2708 	tempfree(h);
2709 
2710 	if (pmatch(pfa, t)) {
2711 		char *sl;
2712 
2713 		tempstat = pfa->initstat;
2714 		pfa->initstat = 2;
2715 		pb = buf;
2716 		rptr = getsval(y);
2717 		/*
2718 		 * XXX if there are any backreferences in subst string,
2719 		 * complain now.
2720 		 */
2721 		for (sl = rptr; (sl = strchr(sl, '\\')) && sl[1]; sl++) {
2722 			if (strchr("0123456789", sl[1])) {
2723 				FATAL("gensub doesn't support backreferences (subst \"%s\")", rptr);
2724 			}
2725 		}
2726 
2727 		do {
2728 			if (whichm >= 0 && whichm != num) {
2729 				num++;
2730 				adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - t) + patlen, recsize, &pb, "gensub");
2731 
2732 				/* copy the part of string up to and including
2733 				 * match to output buffer */
2734 				while (t < patbeg + patlen)
2735 					*pb++ = *t++;
2736 				continue;
2737 			}
2738 
2739 			if (patlen == 0 && *patbeg != 0) {	/* matched empty string */
2740 				if (mflag == 0) {	/* can replace empty */
2741 					num++;
2742 					sptr = rptr;
2743 					while (*sptr != 0) {
2744 						adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
2745 						if (*sptr == '\\') {
2746 							backsub(&pb, &sptr);
2747 						} else if (*sptr == '&') {
2748 							sptr++;
2749 							adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
2750 							for (q = patbeg; q < patbeg+patlen; )
2751 								*pb++ = *q++;
2752 						} else
2753 							*pb++ = *sptr++;
2754 					}
2755 				}
2756 				if (*t == 0)	/* at end */
2757 					goto done;
2758 				adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gensub");
2759 				*pb++ = *t++;
2760 				if (pb > buf + bufsz)	/* BUG: not sure of this test */
2761 					FATAL("gensub result0 %.30s too big; can't happen", buf);
2762 				mflag = 0;
2763 			}
2764 			else {	/* matched nonempty string */
2765 				num++;
2766 				sptr = t;
2767 				adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gensub");
2768 				while (sptr < patbeg)
2769 					*pb++ = *sptr++;
2770 				sptr = rptr;
2771 				while (*sptr != 0) {
2772 					adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
2773 					if (*sptr == '\\') {
2774 						backsub(&pb, &sptr);
2775 					} else if (*sptr == '&') {
2776 						sptr++;
2777 						adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
2778 						for (q = patbeg; q < patbeg+patlen; )
2779 							*pb++ = *q++;
2780 					} else
2781 						*pb++ = *sptr++;
2782 				}
2783 				t = patbeg + patlen;
2784 				if (patlen == 0 || *t == 0 || *(t-1) == 0)
2785 					goto done;
2786 				if (pb > buf + bufsz)
2787 					FATAL("gensub result1 %.30s too big; can't happen", buf);
2788 				mflag = 1;
2789 			}
2790 		} while (pmatch(pfa,t));
2791 		sptr = t;
2792 		adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gensub");
2793 		while ((*pb++ = *sptr++) != 0)
2794 			;
2795 	done:	if (pb > buf + bufsz)
2796 			FATAL("gensub result2 %.30s too big; can't happen", buf);
2797 		*pb = '\0';
2798 		setsval(res, buf);
2799 		pfa->initstat = tempstat;
2800 	}
2801 	tempfree(x);
2802 	tempfree(y);
2803 	free(buf);
2804 	return(res);
2805 }
2806 
2807 void backsub(char **pb_ptr, const char **sptr_ptr)	/* handle \\& variations */
2808 {						/* sptr[0] == '\\' */
2809 	char *pb = *pb_ptr;
2810 	const char *sptr = *sptr_ptr;
2811 
2812 	if (sptr[1] == '\\') {
2813 		if (sptr[2] == '\\' && sptr[3] == '&') { /* \\\& -> \& */
2814 			*pb++ = '\\';
2815 			*pb++ = '&';
2816 			sptr += 4;
2817 		} else if (sptr[2] == '&') {	/* \\& -> \ + matched */
2818 			*pb++ = '\\';
2819 			sptr += 2;
2820 		} else if (do_posix) {		/* \\x -> \x */
2821 			sptr++;
2822 			*pb++ = *sptr++;
2823 		} else {			/* \\x -> \\x */
2824 			*pb++ = *sptr++;
2825 			*pb++ = *sptr++;
2826 		}
2827 	} else if (sptr[1] == '&') {	/* literal & */
2828 		sptr++;
2829 		*pb++ = *sptr++;
2830 	} else				/* literal \ */
2831 		*pb++ = *sptr++;
2832 
2833 	*pb_ptr = pb;
2834 	*sptr_ptr = sptr;
2835 }
2836 
/*
 * wide_char_to_byte_str: encode code point `rune` as a UTF-8 byte string.
 *
 * Returns a pointer to a NUL-terminated static buffer (overwritten by the
 * next call) and stores the number of encoded bytes, 1 to 4, in *outlen.
 * Returns NULL, leaving *outlen untouched, when rune is negative or
 * greater than 0x10FFFF.
 */
static char *wide_char_to_byte_str(int rune, size_t *outlen)
{
	/* lead-byte marker bits, indexed by encoded length */
	static const unsigned char lead[5] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
	static char buf[5];
	size_t nbytes, k;

	if (rune < 0 || rune > 0x10FFFF)
		return NULL;

	if (rune < 0x80)
		nbytes = 1;	/* 0xxxxxxx */
	else if (rune < 0x800)
		nbytes = 2;	/* 110xxxxx 10xxxxxx */
	else if (rune < 0x10000)
		nbytes = 3;	/* 1110xxxx 10xxxxxx 10xxxxxx */
	else
		nbytes = 4;	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */

	memset(buf, 0, sizeof(buf));

	/* continuation bytes carry six bits each, filled from the right */
	for (k = nbytes - 1; k > 0; k--) {
		buf[k] = 0x80 | (rune & 0x3F);
		rune >>= 6;
	}
	/* whatever bits remain go into the lead byte */
	buf[0] = lead[nbytes] | rune;

	*outlen = nbytes;
	buf[nbytes] = '\0';

	return buf;
}
2874