xref: /openbsd/usr.bin/awk/run.c (revision d415bd75)
1 /*	$OpenBSD: run.c,v 1.80 2023/10/28 22:38:22 millert Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 #define DEBUG
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <errno.h>
30 #include <wctype.h>
31 #include <fcntl.h>
32 #include <setjmp.h>
33 #include <limits.h>
34 #include <math.h>
35 #include <string.h>
36 #include <stdlib.h>
37 #include <time.h>
38 #include <sys/types.h>
39 #include <sys/wait.h>
40 #include "awk.h"
41 #include "awkgram.tab.h"
42 
43 
44 static void stdinit(void);
45 static void flush_all(void);
46 static char *wide_char_to_byte_str(int rune, size_t *outlen);
47 
/*
 * tempfree(x): give cell x back to the temporary-cell list through tfree(),
 * but only when istemp(x) holds; any other cell is left untouched.
 * The macro form is the one that gets compiled (#if 1).  The function in
 * the #else arm does the same work and additionally warns when an OCELL
 * carries a csub outside the CUNK..CFREE range.
 */
#if 1
#define tempfree(x)	do { if (istemp(x)) tfree(x); } while (/*CONSTCOND*/0)
#else
void tempfree(Cell *p) {
	if (p->ctype == OCELL && (p->csub < CUNK || p->csub > CFREE)) {
		WARNING("bad csub %d in Cell %d %s",
			p->csub, p->ctype, p->sval);
	}
	if (istemp(p))
		tfree(p);
}
#endif
60 
61 /* do we really need these? */
62 /* #ifdef _NFILE */
63 /* #ifndef FOPEN_MAX */
64 /* #define FOPEN_MAX _NFILE */
65 /* #endif */
66 /* #endif */
67 /*  */
68 /* #ifndef	FOPEN_MAX */
69 /* #define	FOPEN_MAX	40 */	/* max number of open files */
70 /* #endif */
71 /*  */
72 /* #ifndef RAND_MAX */
73 /* #define RAND_MAX	32767 */	/* all that ansi guarantees */
74 /* #endif */
75 
jmp_buf env;	/* set by setjmp() in program(); jump() longjmp()s to it on EXIT */
extern	int	pairstack[];
extern	Awkfloat	srand_seed;

Node	*winner = NULL;	/* root of parse tree */
Cell	*tmps;		/* free temporary cells for execution */

/*
 * Statically allocated result cells.  The interpreter routines return
 * pointers to these shared cells for boolean results (True/False) and for
 * the control-flow markers produced by jump().
 */
static Cell	truecell	={ OBOOL, BTRUE, 0, 0, 1.0, NUM, NULL, NULL };
Cell	*True	= &truecell;
static Cell	falsecell	={ OBOOL, BFALSE, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*False	= &falsecell;
static Cell	breakcell	={ OJUMP, JBREAK, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jbreak	= &breakcell;
static Cell	contcell	={ OJUMP, JCONT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jcont	= &contcell;
static Cell	nextcell	={ OJUMP, JNEXT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jnext	= &nextcell;
static Cell	nextfilecell	={ OJUMP, JNEXTFILE, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jnextfile	= &nextfilecell;
static Cell	exitcell	={ OJUMP, JEXIT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jexit	= &exitcell;
static Cell	retcell		={ OJUMP, JRET, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jret	= &retcell;
/* template copied over every cell that gettemp() hands out */
static Cell	tempcell	={ OCELL, CTEMP, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };

Node	*curnode = NULL;	/* the node being executed, for debugging */
102 
103 /* buffer memory management */
int adjbuf(char **pbuf, int *psiz, int minlen, int quantum, char **pbptr,
	const char *whatrtn)
/*
 * Make sure the buffer *pbuf can hold at least minlen bytes, growing it
 * with realloc() when it is currently smaller.
 *
 * pbuf:    address of pointer to buffer being managed
 * psiz:    address of buffer size variable
 * minlen:  minimum length of buffer needed
 * quantum: buffer size quantum (0 means no rounding)
 * pbptr:   address of movable pointer into buffer, or 0 if none
 * whatrtn: name of the calling routine if failure should cause fatal error;
 *          NULL makes an allocation failure non-fatal
 *
 * return   0 for realloc failure, !=0 for success
 */
{
	if (minlen > *psiz) {
		char *tbuf;
		int rminlen = quantum ? minlen % quantum : 0;
		/* remember where *pbptr pointed so it can be rebased after the move */
		int boff = pbptr ? *pbptr - *pbuf : 0;
		/* round up to next multiple of quantum */
		if (rminlen)
			minlen += quantum - rminlen;
		tbuf = (char *) realloc(*pbuf, minlen);
		/* whatrtn is optional, so it must not reach %s as a null pointer */
		DPRINTF("adjbuf %s: %d %d (pbuf=%p, tbuf=%p)\n", NN(whatrtn), *psiz, minlen, (void*)*pbuf, (void*)tbuf);
		if (tbuf == NULL) {
			if (whatrtn)
				FATAL("out of memory in %s", whatrtn);
			return 0;
		}
		*pbuf = tbuf;
		*psiz = minlen;
		if (pbptr)
			*pbptr = tbuf + boff;
	}
	return 1;
}
137 
/*
 * run: top-level driver.  Calls stdinit(), executes the parse tree rooted
 * at a, then calls closeall().
 */
void run(Node *a)	/* execution of parse tree starts here */
{

	stdinit();
	execute(a);
	closeall();
}
145 
146 Cell *execute(Node *u)	/* execute a node of the parse tree */
147 {
148 	Cell *(*proc)(Node **, int);
149 	Cell *x;
150 	Node *a;
151 
152 	if (u == NULL)
153 		return(True);
154 	for (a = u; ; a = a->nnext) {
155 		curnode = a;
156 		if (isvalue(a)) {
157 			x = (Cell *) (a->narg[0]);
158 			if (isfld(x) && !donefld)
159 				fldbld();
160 			else if (isrec(x) && !donerec)
161 				recbld();
162 			return(x);
163 		}
164 		if (notlegal(a->nobj))	/* probably a Cell* but too risky to print */
165 			FATAL("illegal statement");
166 		proc = proctab[a->nobj-FIRSTTOKEN];
167 		x = (*proc)(a->narg, a->nobj);
168 		if (isfld(x) && !donefld)
169 			fldbld();
170 		else if (isrec(x) && !donerec)
171 			recbld();
172 		if (isexpr(a))
173 			return(x);
174 		if (isjump(x))
175 			return(x);
176 		if (a->nnext == NULL)
177 			return(x);
178 		tempfree(x);
179 	}
180 }
181 
182 
/*
 * program: run a complete awk program: the BEGIN actions, then the main
 * body once per input record, then the END actions.  Always returns True.
 *
 * `exit' is implemented in jump() with longjmp(env, 1).  The first setjmp()
 * below sends such an exit from BEGIN or the main body to label ex, so END
 * still runs; the second setjmp() catches an exit raised inside END itself.
 */
Cell *program(Node **a, int n)	/* execute an awk program */
{				/* a[0] = BEGIN, a[1] = body, a[2] = END */
	Cell *x;

	if (setjmp(env) != 0)
		goto ex;
	if (a[0]) {		/* BEGIN */
		x = execute(a[0]);
		if (isexit(x))
			return(True);
		if (isjump(x))
			FATAL("illegal break, continue, next or nextfile from BEGIN");
		tempfree(x);
	}
	/* input is read only when there is a main body or an END action */
	if (a[1] || a[2])
		while (getrec(&record, &recsize, true) > 0) {
			x = execute(a[1]);
			if (isexit(x))
				break;
			tempfree(x);
		}
  ex:
	if (setjmp(env) != 0)	/* handles exit within END */
		goto ex1;
	if (a[2]) {		/* END */
		x = execute(a[2]);
		if (isbreak(x) || isnext(x) || iscont(x))
			FATAL("illegal break, continue, next or nextfile from END");
		tempfree(x);
	}
  ex1:
	return(True);
}
216 
/*
 * Call-stack bookkeeping for user-defined functions.  call() pushes one
 * Frame per active invocation; arg() and jump() read the current frame
 * through frp.
 */
struct Frame {	/* stack frame for awk function calls */
	int nargs;	/* number of arguments in this call */
	Cell *fcncell;	/* pointer to Cell for function */
	Cell **args;	/* pointer to array of arguments after execute */
	Cell *retval;	/* return value */
};

#define	NARGS	50	/* max args in a call */

struct Frame *frame = NULL;	/* base of stack frames; dynamically allocated */
int	nframe = 0;		/* number of frames allocated */
struct Frame *frp = NULL;	/* frame pointer. bottom level unused */
229 
/*
 * call: invoke a user-defined awk function.
 *
 * a[0] evaluates to the function's cell; a[1] is the list of actual
 * argument expressions.  The function cell's fval is read as the number of
 * parameters in the definition and its sval as the body (a Node *).
 *
 * Steps: evaluate the actual arguments (arrays are passed by reference,
 * everything else as a copycell() copy), pad missing parameters with fresh
 * CCOPY temporaries, push a Frame, execute the body, release the argument
 * cells, and return the frame's retval cell.
 */
Cell *call(Node **a, int n)	/* function call.  very kludgy and fragile */
{
	static const Cell newcopycell = { OCELL, CCOPY, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };
	int i, ncall, ndef;
	int freed = 0; /* handles potential double freeing when fcn & param share a tempcell */
	Node *x;
	Cell *args[NARGS], *oargs[NARGS];	/* BUG: fixed size arrays */
	Cell *y, *z, *fcn;
	char *s;

	fcn = execute(a[0]);	/* the function itself */
	s = fcn->nval;
	if (!isfcn(fcn))
		FATAL("calling undefined function %s", s);
	/* first call ever: allocate the initial block of frames */
	if (frame == NULL) {
		frp = frame = (struct Frame *) calloc(nframe += 100, sizeof(*frame));
		if (frame == NULL)
			FATAL("out of space for stack frames calling %s", s);
	}
	for (ncall = 0, x = a[1]; x != NULL; x = x->nnext)	/* args in call */
		ncall++;
	ndef = (int) fcn->fval;			/* args in defn */
	DPRINTF("calling %s, %d args (%d in defn), frp=%d\n", s, ncall, ndef, (int) (frp-frame));
	if (ncall > ndef)
		WARNING("function %s called with %d args, uses only %d",
			s, ncall, ndef);
	if (ncall + ndef > NARGS)
		FATAL("function %s has %d arguments, limit %d", s, ncall+ndef, NARGS);
	/* oargs[] keeps the caller's own cells; args[] is what the callee sees */
	for (i = 0, x = a[1]; x != NULL; i++, x = x->nnext) {	/* get call args */
		DPRINTF("evaluate args[%d], frp=%d:\n", i, (int) (frp-frame));
		y = execute(x);
		oargs[i] = y;
		DPRINTF("args[%d]: %s %f <%s>, t=%o\n",
			i, NN(y->nval), y->fval, isarr(y) ? "(array)" : NN(y->sval), y->tval);
		if (isfcn(y))
			FATAL("can't use function %s as argument in %s", y->nval, s);
		if (isarr(y))
			args[i] = y;	/* arrays by ref */
		else
			args[i] = copycell(y);
		tempfree(y);
	}
	for ( ; i < ndef; i++) {	/* add null args for ones not provided */
		args[i] = gettemp();
		*args[i] = newcopycell;
	}
	frp++;	/* now ok to up frame */
	/* grow the frame array when the new frame would fall past its end */
	if (frp >= frame + nframe) {
		int dfp = frp - frame;	/* old index */
		frame = (struct Frame *) reallocarray(frame, (nframe += 100), sizeof(*frame));
		if (frame == NULL)
			FATAL("out of space for stack frames in %s", s);
		frp = frame + dfp;
	}
	frp->fcncell = fcn;
	frp->args = args;
	frp->nargs = ndef;	/* number defined with (excess are locals) */
	frp->retval = gettemp();

	DPRINTF("start exec of %s, frp=%d\n", s, (int) (frp-frame));
	y = execute((Node *)(fcn->sval));	/* execute body */
	DPRINTF("finished exec of %s, frp=%d\n", s, (int) (frp-frame));

	/* release the parameter cells now that the body has finished */
	for (i = 0; i < ndef; i++) {
		Cell *t = frp->args[i];
		if (isarr(t)) {
			/* only CCOPY cells were created here; by-reference arrays are left alone */
			if (t->csub == CCOPY) {
				if (i >= ncall) {
					/* a padded local that became an array: free its table */
					freesymtab(t);
					t->csub = CTEMP;
					tempfree(t);
				} else {
					/* a scalar actual became an array in the callee: hand the table to the caller's cell */
					oargs[i]->tval = t->tval;
					oargs[i]->tval &= ~(STR|NUM|DONTFREE);
					oargs[i]->sval = t->sval;
					tempfree(t);
				}
			}
		} else if (t != y) {	/* kludge to prevent freeing twice */
			t->csub = CTEMP;
			tempfree(t);
		} else if (t == y && t->csub == CCOPY) {
			/* the body's result is this very parameter cell; remember that it is already freed */
			t->csub = CTEMP;
			tempfree(t);
			freed = 1;
		}
	}
	tempfree(fcn);
	if (isexit(y) || isnext(y))
		return y;
	if (freed == 0) {
		tempfree(y);	/* don't free twice! */
	}
	z = frp->retval;			/* return value */
	DPRINTF("%s returns %g |%s| %o\n", s, getfval(z), getsval(z), z->tval);
	frp--;
	return(z);
}
328 
329 Cell *copycell(Cell *x)	/* make a copy of a cell in a temp */
330 {
331 	Cell *y;
332 
333 	/* copy is not constant or field */
334 
335 	y = gettemp();
336 	y->tval = x->tval & ~(CON|FLD|REC);
337 	y->csub = CCOPY;	/* prevents freeing until call is over */
338 	y->nval = x->nval;	/* BUG? */
339 	if (isstr(x) /* || x->ctype == OCELL */) {
340 		y->sval = tostring(x->sval);
341 		y->tval &= ~DONTFREE;
342 	} else
343 		y->tval |= DONTFREE;
344 	y->fval = x->fval;
345 	return y;
346 }
347 
348 Cell *arg(Node **a, int n)	/* nth argument of a function */
349 {
350 
351 	n = ptoi(a[0]);	/* argument number, counting from 0 */
352 	DPRINTF("arg(%d), frp->nargs=%d\n", n, frp->nargs);
353 	if (n+1 > frp->nargs)
354 		FATAL("argument #%d of function %s was not supplied",
355 			n+1, frp->fcncell->nval);
356 	return frp->args[n];
357 }
358 
359 Cell *jump(Node **a, int n)	/* break, continue, next, nextfile, return */
360 {
361 	Cell *y;
362 
363 	switch (n) {
364 	case EXIT:
365 		if (a[0] != NULL) {
366 			y = execute(a[0]);
367 			errorflag = (int) getfval(y);
368 			tempfree(y);
369 		}
370 		longjmp(env, 1);
371 	case RETURN:
372 		if (a[0] != NULL) {
373 			y = execute(a[0]);
374 			if ((y->tval & (STR|NUM)) == (STR|NUM)) {
375 				setsval(frp->retval, getsval(y));
376 				frp->retval->fval = getfval(y);
377 				frp->retval->tval |= NUM;
378 			}
379 			else if (y->tval & STR)
380 				setsval(frp->retval, getsval(y));
381 			else if (y->tval & NUM)
382 				setfval(frp->retval, getfval(y));
383 			else		/* can't happen */
384 				FATAL("bad type variable %d", y->tval);
385 			tempfree(y);
386 		}
387 		return(jret);
388 	case NEXT:
389 		return(jnext);
390 	case NEXTFILE:
391 		nextfile();
392 		return(jnextfile);
393 	case BREAK:
394 		return(jbreak);
395 	case CONTINUE:
396 		return(jcont);
397 	default:	/* can't happen */
398 		FATAL("illegal jump type %d", n);
399 	}
400 	return 0;	/* not reached */
401 }
402 
403 Cell *awkgetline(Node **a, int n)	/* get next line from specific input */
404 {		/* a[0] is variable, a[1] is operator, a[2] is filename */
405 	Cell *r, *x;
406 	extern Cell **fldtab;
407 	FILE *fp;
408 	char *buf;
409 	int bufsize = recsize;
410 	int mode;
411 	bool newflag;
412 	double result;
413 
414 	if ((buf = (char *) malloc(bufsize)) == NULL)
415 		FATAL("out of memory in getline");
416 
417 	fflush(stdout);	/* in case someone is waiting for a prompt */
418 	r = gettemp();
419 	if (a[1] != NULL) {		/* getline < file */
420 		x = execute(a[2]);		/* filename */
421 		mode = ptoi(a[1]);
422 		if (mode == '|')		/* input pipe */
423 			mode = LE;	/* arbitrary flag */
424 		fp = openfile(mode, getsval(x), &newflag);
425 		tempfree(x);
426 		if (fp == NULL)
427 			n = -1;
428 		else
429 			n = readrec(&buf, &bufsize, fp, newflag);
430 		if (n <= 0) {
431 			;
432 		} else if (a[0] != NULL) {	/* getline var <file */
433 			x = execute(a[0]);
434 			setsval(x, buf);
435 			if (is_number(x->sval, & result)) {
436 				x->fval = result;
437 				x->tval |= NUM;
438 			}
439 			tempfree(x);
440 		} else {			/* getline <file */
441 			setsval(fldtab[0], buf);
442 			if (is_number(fldtab[0]->sval, & result)) {
443 				fldtab[0]->fval = result;
444 				fldtab[0]->tval |= NUM;
445 			}
446 		}
447 	} else {			/* bare getline; use current input */
448 		if (a[0] == NULL)	/* getline */
449 			n = getrec(&record, &recsize, true);
450 		else {			/* getline var */
451 			n = getrec(&buf, &bufsize, false);
452 			if (n > 0) {
453 				x = execute(a[0]);
454 				setsval(x, buf);
455 				if (is_number(x->sval, & result)) {
456 					x->fval = result;
457 					x->tval |= NUM;
458 				}
459 				tempfree(x);
460 			}
461 		}
462 	}
463 	setfval(r, (Awkfloat) n);
464 	free(buf);
465 	return r;
466 }
467 
468 Cell *getnf(Node **a, int n)	/* get NF */
469 {
470 	if (!donefld)
471 		fldbld();
472 	return (Cell *) a[0];
473 }
474 
475 static char *
476 makearraystring(Node *p, const char *func)
477 {
478 	char *buf;
479 	int bufsz = recsize;
480 	size_t blen;
481 
482 	if ((buf = (char *) malloc(bufsz)) == NULL) {
483 		FATAL("%s: out of memory", func);
484 	}
485 
486 	blen = 0;
487 	buf[blen] = '\0';
488 
489 	for (; p; p = p->nnext) {
490 		Cell *x = execute(p);	/* expr */
491 		char *s = getsval(x);
492 		size_t seplen = strlen(getsval(subseploc));
493 		size_t nsub = p->nnext ? seplen : 0;
494 		size_t slen = strlen(s);
495 		size_t tlen = blen + slen + nsub;
496 
497 		if (!adjbuf(&buf, &bufsz, tlen + 1, recsize, 0, func)) {
498 			FATAL("%s: out of memory %s[%s...]",
499 			    func, x->nval, buf);
500 		}
501 		memcpy(buf + blen, s, slen);
502 		if (nsub) {
503 			memcpy(buf + blen + slen, *SUBSEP, nsub);
504 		}
505 		buf[tlen] = '\0';
506 		blen = tlen;
507 		tempfree(x);
508 	}
509 	return buf;
510 }
511 
512 Cell *array(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
513 {
514 	Cell *x, *z;
515 	char *buf;
516 
517 	x = execute(a[0]);	/* Cell* for symbol table */
518 	buf = makearraystring(a[1], __func__);
519 	if (!isarr(x)) {
520 		DPRINTF("making %s into an array\n", NN(x->nval));
521 		if (freeable(x))
522 			xfree(x->sval);
523 		x->tval &= ~(STR|NUM|DONTFREE);
524 		x->tval |= ARR;
525 		x->sval = (char *) makesymtab(NSYMTAB);
526 	}
527 	z = setsymtab(buf, "", 0.0, STR|NUM, (Array *) x->sval);
528 	z->ctype = OCELL;
529 	z->csub = CVAR;
530 	tempfree(x);
531 	free(buf);
532 	return(z);
533 }
534 
535 Cell *awkdelete(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
536 {
537 	Cell *x;
538 
539 	x = execute(a[0]);	/* Cell* for symbol table */
540 	if (x == symtabloc) {
541 		FATAL("cannot delete SYMTAB or its elements");
542 	}
543 	if (!isarr(x))
544 		return True;
545 	if (a[1] == NULL) {	/* delete the elements, not the table */
546 		freesymtab(x);
547 		x->tval &= ~STR;
548 		x->tval |= ARR;
549 		x->sval = (char *) makesymtab(NSYMTAB);
550 	} else {
551 		char *buf = makearraystring(a[1], __func__);
552 		freeelem(x, buf);
553 		free(buf);
554 	}
555 	tempfree(x);
556 	return True;
557 }
558 
559 Cell *intest(Node **a, int n)	/* a[0] is index (list), a[1] is symtab */
560 {
561 	Cell *ap, *k;
562 	char *buf;
563 
564 	ap = execute(a[1]);	/* array name */
565 	if (!isarr(ap)) {
566 		DPRINTF("making %s into an array\n", ap->nval);
567 		if (freeable(ap))
568 			xfree(ap->sval);
569 		ap->tval &= ~(STR|NUM|DONTFREE);
570 		ap->tval |= ARR;
571 		ap->sval = (char *) makesymtab(NSYMTAB);
572 	}
573 	buf = makearraystring(a[0], __func__);
574 	k = lookup(buf, (Array *) ap->sval);
575 	tempfree(ap);
576 	free(buf);
577 	if (k == NULL)
578 		return(False);
579 	else
580 		return(True);
581 }
582 
583 
584 /* ======== utf-8 code ========== */
585 
586 /*
587  * Awk strings can contain ascii, random 8-bit items (eg Latin-1),
588  * or utf-8.  u8_isutf tests whether a string starts with a valid
589  * utf-8 sequence, and returns 0 if not (e.g., high bit set).
590  * u8_nextlen returns length of next valid sequence, which is
591  * 1 for ascii, 2..4 for utf-8, or 1 for high bit non-utf.
592  * u8_strlen returns length of string in valid utf-8 sequences
593  * and/or high-bit bytes.  Conversion functions go between byte
594  * number and character number.
595  *
596  * In theory, this behaves the same as before for non-utf8 bytes.
597  *
598  * Limited checking! This is a potential security hole.
599  */
600 
601 /* is s the beginning of a valid utf-8 string? */
602 /* return length 1..4 if yes, 0 if no */
603 int u8_isutf(const char *s)
604 {
605 	int n, ret;
606 	unsigned char c;
607 
608 	c = s[0];
609 	if (c < 128 || awk_mb_cur_max == 1)
610 		return 1; /* what if it's 0? */
611 
612 	n = strlen(s);
613 	if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
614 		ret = 2; /* 110xxxxx 10xxxxxx */
615 	} else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
616 			 && (s[2] & 0xC0) == 0x80) {
617 		ret = 3; /* 1110xxxx 10xxxxxx 10xxxxxx */
618 	} else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
619 			 && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
620 		ret = 4; /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
621 	} else {
622 		ret = 0;
623 	}
624 	return ret;
625 }
626 
627 /* Convert (prefix of) utf8 string to utf-32 rune. */
628 /* Sets *rune to the value, returns the length. */
629 /* No error checking: watch out. */
630 int u8_rune(int *rune, const char *s)
631 {
632 	int n, ret;
633 	unsigned char c;
634 
635 	c = s[0];
636 	if (c < 128 || awk_mb_cur_max == 1) {
637 		*rune = c;
638 		return 1;
639 	}
640 
641 	n = strlen(s);
642 	if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
643 		*rune = ((c & 0x1F) << 6) | (s[1] & 0x3F); /* 110xxxxx 10xxxxxx */
644 		ret = 2;
645 	} else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
646 			  && (s[2] & 0xC0) == 0x80) {
647 		*rune = ((c & 0xF) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
648 			/* 1110xxxx 10xxxxxx 10xxxxxx */
649 		ret = 3;
650 	} else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
651 			  && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
652 		*rune = ((c & 0x7) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
653 			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
654 		ret = 4;
655 	} else {
656 		*rune = c;
657 		ret = 1;
658 	}
659 	return ret; /* returns one byte if sequence doesn't look like utf */
660 }
661 
662 /* return length of next sequence: 1 for ascii or random, 2..4 for valid utf8 */
/* u8_nextlen: number of bytes to step over at s; u8_isutf()'s 0 becomes 1 */
int u8_nextlen(const char *s)
{
	int seqlen = u8_isutf(s);

	return (seqlen != 0) ? seqlen : 1;
}
672 
673 /* return number of utf characters or single non-utf bytes */
/*
 * u8_strlen: count the characters in s, where a character is one
 * u8_nextlen() step (a utf-8 sequence or a single byte).
 *
 * The overrun check used to sit inside the loop body, where i < n always
 * holds, so it could never fire.  It now runs after the loop, the only
 * place where i can have moved past n.
 */
int u8_strlen(const char *s)
{
	int i, n, totlen;

	n = strlen(s);
	totlen = 0;
	/* u8_nextlen() already returns 1 for plain bytes; no separate test needed */
	for (i = 0; i < n; i += u8_nextlen(&s[i]))
		totlen++;
	if (i > n)
		FATAL("bad utf count [%s] n=%d i=%d\n", s, n, i);
	return totlen;
}
694 
695 /* convert utf-8 char number in a string to its byte offset */
/*
 * u8_char2byte: byte offset in s of character number charnum (counting
 * from 0).  The walk stops at the terminating NUL, so a charnum larger
 * than the number of characters yields the byte length of s instead of
 * stepping beyond the end of the string.
 */
int u8_char2byte(const char *s, int charnum)
{
	int n;
	int bytenum = 0;

	while (charnum > 0 && *s != '\0') {
		n = u8_nextlen(s);
		s += n;
		bytenum += n;
		charnum--;
	}
	return bytenum;
}
709 
710 /* convert byte offset in s to utf-8 char number that starts there */
/*
 * u8_byte2char: number of u8_nextlen() steps needed, starting at offset 0,
 * to move past byte offset bytenum (so offset 0 gives 1).
 * Returns -1 when bytenum lies beyond the terminating NUL.
 */
int u8_byte2char(const char *s, int bytenum)
{
	int total = strlen(s);
	int off = 0;
	int charnum = 0; /* BUG: what origin? */
	/* should be 0 to match start==0 which means no match */

	if (bytenum > total)
		return -1; /* ??? */

	while (off <= bytenum) {
		off += u8_nextlen(s+off);
		charnum++;
	}
	return charnum;
}
727 
/* runetochar() adapted from rune.c in the Plan 9 distribution */
729 
enum
{
	Runeerror = 128, /* from somewhere else */
	Runemax = 0x10FFFF,

	Bit1    = 7,
	Bitx    = 6,
	Bit2    = 5,
	Bit3    = 4,
	Bit4    = 3,
	Bit5    = 2,

	T1      = ((1<<(Bit1+1))-1) ^ 0xFF,     /* 0000 0000 */
	Tx      = ((1<<(Bitx+1))-1) ^ 0xFF,     /* 1000 0000 */
	T2      = ((1<<(Bit2+1))-1) ^ 0xFF,     /* 1100 0000 */
	T3      = ((1<<(Bit3+1))-1) ^ 0xFF,     /* 1110 0000 */
	T4      = ((1<<(Bit4+1))-1) ^ 0xFF,     /* 1111 0000 */
	T5      = ((1<<(Bit5+1))-1) ^ 0xFF,     /* 1111 1000 */

	Rune1   = (1<<(Bit1+0*Bitx))-1,	 	/* 0000 0000 0000 0000 0111 1111 */
	Rune2   = (1<<(Bit2+1*Bitx))-1,	 	/* 0000 0000 0000 0111 1111 1111 */
	Rune3   = (1<<(Bit3+2*Bitx))-1,	 	/* 0000 0000 1111 1111 1111 1111 */
	Rune4   = (1<<(Bit4+3*Bitx))-1,	 	/* 0011 1111 1111 1111 1111 1111 */

	Maskx   = (1<<Bitx)-1,		  	/* 0011 1111 */
	Testx   = Maskx ^ 0xFF,		 	/* 1100 0000 */

};

/*
 * runetochar: write the utf-8 encoding of c into str and return the number
 * of bytes written (1..4).  No terminating NUL is stored, so str must have
 * room for 4 bytes.  Values above Runemax are replaced by Runeerror.
 */
int runetochar(char *str, int c)
{
	/*
	 * Replace out-of-range values before choosing a length.  Runeerror
	 * is 128 in this file, which belongs in the two-byte range; making
	 * the substitution after the two-byte test would encode it as the
	 * overlong (ill-formed) three-byte sequence E0 82 80.
	 */
	if (c > Runemax)
		c = Runeerror;

	/* one character sequence 00000-0007F => 00-7F */
	if (c <= Rune1) {
		str[0] = c;
		return 1;
	}

	/* two character sequence 00080-007FF => T2 Tx */
	if (c <= Rune2) {
		str[0] = T2 | (c >> 1*Bitx);
		str[1] = Tx | (c & Maskx);
		return 2;
	}

	/* three character sequence 00800-0FFFF => T3 Tx Tx */
	if (c <= Rune3) {
		str[0] = T3 |  (c >> 2*Bitx);
		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
		str[2] = Tx |  (c & Maskx);
		return 3;
	}

	/* four character sequence 010000-10FFFF => T4 Tx Tx Tx */
	str[0] = T4 |  (c >> 3*Bitx);
	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
	str[3] = Tx |  (c & Maskx);
	return 4;
}
791 
792 
793 /* ========== end of utf8 code =========== */
794 
795 
796 
797 Cell *matchop(Node **a, int n)	/* ~ and match() */
798 {
799 	Cell *x, *y;
800 	char *s, *t;
801 	int i;
802 	int cstart, cpatlen, len;
803 	fa *pfa;
804 	int (*mf)(fa *, const char *) = match, mode = 0;
805 
806 	if (n == MATCHFCN) {
807 		mf = pmatch;
808 		mode = 1;
809 	}
810 	x = execute(a[1]);	/* a[1] = target text */
811 	s = getsval(x);
812 	if (a[0] == NULL)	/* a[1] == 0: already-compiled reg expr */
813 		i = (*mf)((fa *) a[2], s);
814 	else {
815 		y = execute(a[2]);	/* a[2] = regular expr */
816 		t = getsval(y);
817 		pfa = makedfa(t, mode);
818 		i = (*mf)(pfa, s);
819 		tempfree(y);
820 	}
821 	tempfree(x);
822 	if (n == MATCHFCN) {
823 		int start = patbeg - s + 1; /* origin 1 */
824 		if (patlen < 0) {
825 			start = 0; /* not found */
826 		} else {
827 			cstart = u8_byte2char(s, start-1);
828 			cpatlen = 0;
829 			for (i = 0; i < patlen; i += len) {
830 				len = u8_nextlen(patbeg+i);
831 				cpatlen++;
832 			}
833 
834 			start = cstart;
835 			patlen = cpatlen;
836 		}
837 
838 		setfval(rstartloc, (Awkfloat) start);
839 		setfval(rlengthloc, (Awkfloat) patlen);
840 		x = gettemp();
841 		x->tval = NUM;
842 		x->fval = start;
843 		return x;
844 	} else if ((n == MATCH && i == 1) || (n == NOTMATCH && i == 0))
845 		return(True);
846 	else
847 		return(False);
848 }
849 
850 
851 Cell *boolop(Node **a, int n)	/* a[0] || a[1], a[0] && a[1], !a[0] */
852 {
853 	Cell *x, *y;
854 	int i;
855 
856 	x = execute(a[0]);
857 	i = istrue(x);
858 	tempfree(x);
859 	switch (n) {
860 	case BOR:
861 		if (i) return(True);
862 		y = execute(a[1]);
863 		i = istrue(y);
864 		tempfree(y);
865 		if (i) return(True);
866 		else return(False);
867 	case AND:
868 		if ( !i ) return(False);
869 		y = execute(a[1]);
870 		i = istrue(y);
871 		tempfree(y);
872 		if (i) return(True);
873 		else return(False);
874 	case NOT:
875 		if (i) return(False);
876 		else return(True);
877 	default:	/* can't happen */
878 		FATAL("unknown boolean operator %d", n);
879 	}
880 	return 0;	/*NOTREACHED*/
881 }
882 
883 Cell *relop(Node **a, int n)	/* a[0 < a[1], etc. */
884 {
885 	int i;
886 	Cell *x, *y;
887 	Awkfloat j;
888 	bool x_is_nan, y_is_nan;
889 
890 	x = execute(a[0]);
891 	y = execute(a[1]);
892 	x_is_nan = isnan(x->fval);
893 	y_is_nan = isnan(y->fval);
894 	if (x->tval&NUM && y->tval&NUM) {
895 		if ((x_is_nan || y_is_nan) && n != NE)
896 			return(False);
897 		j = x->fval - y->fval;
898 		i = j<0? -1: (j>0? 1: 0);
899 	} else {
900 		i = strcmp(getsval(x), getsval(y));
901 	}
902 	tempfree(x);
903 	tempfree(y);
904 	switch (n) {
905 	case LT:	if (i<0) return(True);
906 			else return(False);
907 	case LE:	if (i<=0) return(True);
908 			else return(False);
909 	case NE:	if (x_is_nan && y_is_nan) return(True);
910 			else if (i!=0) return(True);
911 			else return(False);
912 	case EQ:	if (i == 0) return(True);
913 			else return(False);
914 	case GE:	if (i>=0) return(True);
915 			else return(False);
916 	case GT:	if (i>0) return(True);
917 			else return(False);
918 	default:	/* can't happen */
919 		FATAL("unknown relational operator %d", n);
920 	}
921 	return 0;	/*NOTREACHED*/
922 }
923 
924 void tfree(Cell *a)	/* free a tempcell */
925 {
926 	if (freeable(a)) {
927 		DPRINTF("freeing %s %s %o\n", NN(a->nval), NN(a->sval), a->tval);
928 		xfree(a->sval);
929 	}
930 	if (a == tmps)
931 		FATAL("tempcell list is curdled");
932 	a->cnext = tmps;
933 	tmps = a;
934 }
935 
936 Cell *gettemp(void)	/* get a tempcell */
937 {	int i;
938 	Cell *x;
939 
940 	if (!tmps) {
941 		tmps = (Cell *) calloc(100, sizeof(*tmps));
942 		if (!tmps)
943 			FATAL("out of space for temporaries");
944 		for (i = 1; i < 100; i++)
945 			tmps[i-1].cnext = &tmps[i];
946 		tmps[i-1].cnext = NULL;
947 	}
948 	x = tmps;
949 	tmps = x->cnext;
950 	*x = tempcell;
951 	return(x);
952 }
953 
954 Cell *indirect(Node **a, int n)	/* $( a[0] ) */
955 {
956 	Awkfloat val;
957 	Cell *x;
958 	int m;
959 	char *s;
960 
961 	x = execute(a[0]);
962 	val = getfval(x);	/* freebsd: defend against super large field numbers */
963 	if ((Awkfloat)INT_MAX < val)
964 		FATAL("trying to access out of range field %s", x->nval);
965 	m = (int) val;
966 	if (m == 0 && !is_number(s = getsval(x), NULL))	/* suspicion! */
967 		FATAL("illegal field $(%s), name \"%s\"", s, x->nval);
968 		/* BUG: can x->nval ever be null??? */
969 	tempfree(x);
970 	x = fieldadr(m);
971 	x->ctype = OCELL;	/* BUG?  why are these needed? */
972 	x->csub = CFLD;
973 	return(x);
974 }
975 
976 Cell *substr(Node **a, int nnn)		/* substr(a[0], a[1], a[2]) */
977 {
978 	int k, m, n;
979 	int mb, nb;
980 	char *s;
981 	int temp;
982 	Cell *x, *y, *z = NULL;
983 
984 	x = execute(a[0]);
985 	y = execute(a[1]);
986 	if (a[2] != NULL)
987 		z = execute(a[2]);
988 	s = getsval(x);
989 	k = u8_strlen(s) + 1;
990 	if (k <= 1) {
991 		tempfree(x);
992 		tempfree(y);
993 		if (a[2] != NULL) {
994 			tempfree(z);
995 		}
996 		x = gettemp();
997 		setsval(x, "");
998 		return(x);
999 	}
1000 	m = (int) getfval(y);
1001 	if (m <= 0)
1002 		m = 1;
1003 	else if (m > k)
1004 		m = k;
1005 	tempfree(y);
1006 	if (a[2] != NULL) {
1007 		n = (int) getfval(z);
1008 		tempfree(z);
1009 	} else
1010 		n = k - 1;
1011 	if (n < 0)
1012 		n = 0;
1013 	else if (n > k - m)
1014 		n = k - m;
1015 	/* m is start, n is length from there */
1016 	DPRINTF("substr: m=%d, n=%d, s=%s\n", m, n, s);
1017 	y = gettemp();
1018 	mb = u8_char2byte(s, m-1); /* byte offset of start char in s */
1019 	nb = u8_char2byte(s, m-1+n);  /* byte offset of end+1 char in s */
1020 
1021 	temp = s[nb];	/* with thanks to John Linderman */
1022 	s[nb] = '\0';
1023 	setsval(y, s + mb);
1024 	s[nb] = temp;
1025 	tempfree(x);
1026 	return(y);
1027 }
1028 
1029 Cell *sindex(Node **a, int nnn)		/* index(a[0], a[1]) */
1030 {
1031 	Cell *x, *y, *z;
1032 	char *s1, *s2, *p1, *p2, *q;
1033 	Awkfloat v = 0.0;
1034 
1035 	x = execute(a[0]);
1036 	s1 = getsval(x);
1037 	y = execute(a[1]);
1038 	s2 = getsval(y);
1039 
1040 	z = gettemp();
1041 	for (p1 = s1; *p1 != '\0'; p1++) {
1042 		for (q = p1, p2 = s2; *p2 != '\0' && *q == *p2; q++, p2++)
1043 			continue;
1044 		if (*p2 == '\0') {
1045 			/* v = (Awkfloat) (p1 - s1 + 1);	 origin 1 */
1046 
1047 		   /* should be a function: used in match() as well */
1048 			int i, len;
1049 			v = 0;
1050 			for (i = 0; i < p1-s1+1; i += len) {
1051 				len = u8_nextlen(s1+i);
1052 				v++;
1053 			}
1054 			break;
1055 		}
1056 	}
1057 	tempfree(x);
1058 	tempfree(y);
1059 	setfval(z, v);
1060 	return(z);
1061 }
1062 
/* has_utf8: 1 if some u8_nextlen() step in s is longer than one byte, else 0 */
int has_utf8(char *s)	/* return 1 if s contains any utf-8 (2 bytes or more) character */
{
	while (*s != 0) {
		int step = u8_nextlen(s);

		if (step > 1)
			return 1;
		s += step;
	}
	return 0;
}
1074 
1075 #define	MAXNUMSIZE	50
1076 
1077 int format(char **pbuf, int *pbufsize, const char *s, Node *a)	/* printf-like conversions */
1078 {
1079 	char *fmt;
1080 	char *p, *t;
1081 	const char *os;
1082 	Cell *x;
1083 	int flag = 0, n;
1084 	int fmtwd; /* format width */
1085 	int fmtsz = recsize;
1086 	char *buf = *pbuf;
1087 	int bufsize = *pbufsize;
1088 #define FMTSZ(a)   (fmtsz - ((a) - fmt))
1089 #define BUFSZ(a)   (bufsize - ((a) - buf))
1090 
1091 	static bool first = true;
1092 	static bool have_a_format = false;
1093 
1094 	if (first) {
1095 		char xbuf[100];
1096 
1097 		snprintf(xbuf, sizeof(xbuf), "%a", 42.0);
1098 		have_a_format = (strcmp(xbuf, "0x1.5p+5") == 0);
1099 		first = false;
1100 	}
1101 
1102 	os = s;
1103 	p = buf;
1104 	if ((fmt = (char *) malloc(fmtsz)) == NULL)
1105 		FATAL("out of memory in format()");
1106 	while (*s) {
1107 		adjbuf(&buf, &bufsize, MAXNUMSIZE+1+p-buf, recsize, &p, "format1");
1108 		if (*s != '%') {
1109 			*p++ = *s++;
1110 			continue;
1111 		}
1112 		if (*(s+1) == '%') {
1113 			*p++ = '%';
1114 			s += 2;
1115 			continue;
1116 		}
1117 		fmtwd = atoi(s+1);
1118 		if (fmtwd < 0)
1119 			fmtwd = -fmtwd;
1120 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format2");
1121 		for (t = fmt; (*t++ = *s) != '\0'; s++) {
1122 			if (!adjbuf(&fmt, &fmtsz, MAXNUMSIZE+1+t-fmt, recsize, &t, "format3"))
1123 				FATAL("format item %.30s... ran format() out of memory", os);
1124 			/* Ignore size specifiers */
1125 			if (strchr("hjLlqtz", *s) != NULL) {	/* the ansi panoply */
1126 				t--;
1127 				continue;
1128 			}
1129 			if (isalpha((uschar)*s))
1130 				break;
1131 			if (*s == '$') {
1132 				FATAL("'$' not permitted in awk formats");
1133 			}
1134 			if (*s == '*') {
1135 				if (a == NULL) {
1136 					FATAL("not enough args in printf(%s)", os);
1137 				}
1138 				x = execute(a);
1139 				a = a->nnext;
1140 				snprintf(t - 1, FMTSZ(t - 1),
1141 				    "%d", fmtwd=(int) getfval(x));
1142 				if (fmtwd < 0)
1143 					fmtwd = -fmtwd;
1144 				adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format");
1145 				t = fmt + strlen(fmt);
1146 				tempfree(x);
1147 			}
1148 		}
1149 		*t = '\0';
1150 		if (fmtwd < 0)
1151 			fmtwd = -fmtwd;
1152 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format4");
1153 		switch (*s) {
1154 		case 'a': case 'A':
1155 			if (have_a_format)
1156 				flag = *s;
1157 			else
1158 				flag = 'f';
1159 			break;
1160 		case 'f': case 'e': case 'g': case 'E': case 'G':
1161 			flag = 'f';
1162 			break;
1163 		case 'd': case 'i': case 'o': case 'x': case 'X': case 'u':
1164 			flag = (*s == 'd' || *s == 'i') ? 'd' : 'u';
1165 			*(t-1) = 'j';
1166 			*t = *s;
1167 			*++t = '\0';
1168 			break;
1169 		case 's':
1170 			flag = 's';
1171 			break;
1172 		case 'c':
1173 			flag = 'c';
1174 			break;
1175 		default:
1176 			WARNING("weird printf conversion %s", fmt);
1177 			flag = '?';
1178 			break;
1179 		}
1180 		if (a == NULL)
1181 			FATAL("not enough args in printf(%s)", os);
1182 		x = execute(a);
1183 		a = a->nnext;
1184 		n = MAXNUMSIZE;
1185 		if (fmtwd > n)
1186 			n = fmtwd;
1187 		adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format5");
1188 		switch (flag) {
1189 		case '?':
1190 			snprintf(p, BUFSZ(p), "%s", fmt);	/* unknown, so dump it too */
1191 			t = getsval(x);
1192 			n = strlen(t);
1193 			if (fmtwd > n)
1194 				n = fmtwd;
1195 			adjbuf(&buf, &bufsize, 1+strlen(p)+n+p-buf, recsize, &p, "format6");
1196 			p += strlen(p);
1197 			snprintf(p, BUFSZ(p), "%s", t);
1198 			break;
1199 		case 'a':
1200 		case 'A':
1201 		case 'f':	snprintf(p, BUFSZ(p), fmt, getfval(x)); break;
1202 		case 'd':	snprintf(p, BUFSZ(p), fmt, (intmax_t) getfval(x)); break;
1203 		case 'u':	snprintf(p, BUFSZ(p), fmt, (uintmax_t) getfval(x)); break;
1204 
1205 		case 's': {
1206 			t = getsval(x);
1207 			n = strlen(t);
1208 			/* if simple format or no utf-8 in the string, sprintf works */
1209 			if (!has_utf8(t) || strcmp(fmt,"%s") == 0) {
1210 				if (fmtwd > n)
1211 					n = fmtwd;
1212 				if (!adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format7"))
1213 					FATAL("huge string/format (%d chars) in printf %.30s..." \
1214 						" ran format() out of memory", n, t);
1215 				snprintf(p, BUFSZ(p), fmt, t);
1216 				break;
1217 			}
1218 
1219 			/* get here if string has utf-8 chars and fmt is not plain %s */
1220 			/* "%-w.ps", where -, w and .p are all optional */
1221 			/* '0' before the w is a flag character */
1222 			/* fmt points at % */
1223 			int ljust = 0, wid = 0, prec = n, pad = 0;
1224 			char *f = fmt+1;
1225 			if (f[0] == '-') {
1226 				ljust = 1;
1227 				f++;
1228 			}
1229 			// flags '0' and '+' are recognized but skipped
1230 			if (f[0] == '0') {
1231 				f++;
1232 				if (f[0] == '+')
1233 					f++;
1234 			}
1235 			if (f[0] == '+') {
1236 				f++;
1237 				if (f[0] == '0')
1238 					f++;
1239 			}
1240 			if (isdigit((uschar)f[0])) { /* there is a wid */
1241 				wid = strtol(f, &f, 10);
1242 			}
1243 			if (f[0] == '.') { /* there is a .prec */
1244 				prec = strtol(++f, &f, 10);
1245 			}
1246 			if (prec > u8_strlen(t))
1247 				prec = u8_strlen(t);
1248 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1249 			int i, k, n;
1250 
1251 			if (ljust) { // print prec chars from t, then pad blanks
1252 				n = u8_char2byte(t, prec);
1253 				for (k = 0; k < n; k++) {
1254 					//putchar(t[k]);
1255 					*p++ = t[k];
1256 				}
1257 				for (i = 0; i < pad; i++) {
1258 					//printf(" ");
1259 					*p++ = ' ';
1260 				}
1261 			} else { // print pad blanks, then prec chars from t
1262 				for (i = 0; i < pad; i++) {
1263 					//printf(" ");
1264 					*p++ = ' ';
1265 				}
1266 				n = u8_char2byte(t, prec);
1267 				for (k = 0; k < n; k++) {
1268 					//putchar(t[k]);
1269 					*p++ = t[k];
1270 				}
1271 			}
1272 			*p = 0;
1273 			break;
1274 		}
1275 
1276                case 'c': {
1277 			/*
1278 			 * If a numeric value is given, awk should just turn
1279 			 * it into a character and print it:
1280 			 *      BEGIN { printf("%c\n", 65) }
1281 			 * prints "A".
1282 			 *
1283 			 * But what if the numeric value is > 128 and
1284 			 * represents a valid Unicode code point?!? We do
1285 			 * our best to convert it back into UTF-8. If we
1286 			 * can't, we output the encoding of the Unicode
1287 			 * "invalid character", 0xFFFD.
1288 			 */
1289 			if (isnum(x)) {
1290 				int charval = (int) getfval(x);
1291 
1292 				if (charval != 0) {
1293 					if (charval < 128 || awk_mb_cur_max == 1)
1294 						snprintf(p, BUFSZ(p), fmt, charval);
1295 					else {
1296 						// possible unicode character
1297 						size_t count;
1298 						char *bs = wide_char_to_byte_str(charval, &count);
1299 
1300 						if (bs == NULL)	{ // invalid character
1301 							// use unicode invalid character, 0xFFFD
1302 							bs = "\357\277\275";
1303 							count = 3;
1304 						}
1305 						t = bs;
1306 						n = count;
1307 						goto format_percent_c;
1308 					}
1309 				} else {
1310 					*p++ = '\0'; /* explicit null byte */
1311 					*p = '\0';   /* next output will start here */
1312 				}
1313 				break;
1314 			}
1315 			t = getsval(x);
1316 			n = u8_nextlen(t);
1317 		format_percent_c:
1318 			if (n < 2) { /* not utf8 */
1319 				snprintf(p, BUFSZ(p), fmt, getsval(x)[0]);
1320 				break;
1321 			}
1322 
1323 			// utf8 character, almost same song and dance as for %s
1324 			int ljust = 0, wid = 0, prec = n, pad = 0;
1325 			char *f = fmt+1;
1326 			if (f[0] == '-') {
1327 				ljust = 1;
1328 				f++;
1329 			}
1330 			// flags '0' and '+' are recognized but skipped
1331 			if (f[0] == '0') {
1332 				f++;
1333 				if (f[0] == '+')
1334 					f++;
1335 			}
1336 			if (f[0] == '+') {
1337 				f++;
1338 				if (f[0] == '0')
1339 					f++;
1340 			}
1341 			if (isdigit((uschar)f[0])) { /* there is a wid */
1342 				wid = strtol(f, &f, 10);
1343 			}
1344 			if (f[0] == '.') { /* there is a .prec */
1345 				prec = strtol(++f, &f, 10);
1346 			}
1347 			if (prec > 1)           // %c --> only one character
1348 				prec = 1;
1349 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1350 			int i;
1351 
1352 			if (ljust) { // print one char from t, then pad blanks
1353 				for (i = 0; i < n; i++)
1354 					*p++ = t[i];
1355 				for (i = 0; i < pad; i++) {
1356 					//printf(" ");
1357 					*p++ = ' ';
1358 				}
1359 			} else { // print pad blanks, then prec chars from t
1360 				for (i = 0; i < pad; i++) {
1361 					//printf(" ");
1362 					*p++ = ' ';
1363 				}
1364 				for (i = 0; i < n; i++)
1365 					*p++ = t[i];
1366 			}
1367 			*p = 0;
1368 			break;
1369 		}
1370 		default:
1371 			FATAL("can't happen: bad conversion %c in format()", flag);
1372 		}
1373 
1374 		tempfree(x);
1375 		p += strlen(p);
1376 		s++;
1377 	}
1378 	*p = '\0';
1379 	free(fmt);
1380 	for ( ; a; a = a->nnext) {		/* evaluate any remaining args */
1381 		x = execute(a);
1382 		tempfree(x);
1383 	}
1384 	*pbuf = buf;
1385 	*pbufsize = bufsize;
1386 	return p - buf;
1387 }
1388 
1389 Cell *awksprintf(Node **a, int n)		/* sprintf(a[0]) */
1390 {
1391 	Cell *x;
1392 	Node *y;
1393 	char *buf;
1394 	int bufsz=3*recsize;
1395 
1396 	if ((buf = (char *) malloc(bufsz)) == NULL)
1397 		FATAL("out of memory in awksprintf");
1398 	y = a[0]->nnext;
1399 	x = execute(a[0]);
1400 	if (format(&buf, &bufsz, getsval(x), y) == -1)
1401 		FATAL("sprintf string %.30s... too long.  can't happen.", buf);
1402 	tempfree(x);
1403 	x = gettemp();
1404 	x->sval = buf;
1405 	x->tval = STR;
1406 	return(x);
1407 }
1408 
1409 Cell *awkprintf(Node **a, int n)		/* printf */
1410 {	/* a[0] is list of args, starting with format string */
1411 	/* a[1] is redirection operator, a[2] is redirection file */
1412 	FILE *fp;
1413 	Cell *x;
1414 	Node *y;
1415 	char *buf;
1416 	int len;
1417 	int bufsz=3*recsize;
1418 
1419 	if ((buf = (char *) malloc(bufsz)) == NULL)
1420 		FATAL("out of memory in awkprintf");
1421 	y = a[0]->nnext;
1422 	x = execute(a[0]);
1423 	if ((len = format(&buf, &bufsz, getsval(x), y)) == -1)
1424 		FATAL("printf string %.30s... too long.  can't happen.", buf);
1425 	tempfree(x);
1426 	if (a[1] == NULL) {
1427 		/* fputs(buf, stdout); */
1428 		fwrite(buf, len, 1, stdout);
1429 		if (ferror(stdout))
1430 			FATAL("write error on stdout");
1431 	} else {
1432 		fp = redirect(ptoi(a[1]), a[2]);
1433 		/* fputs(buf, fp); */
1434 		fwrite(buf, len, 1, fp);
1435 		fflush(fp);
1436 		if (ferror(fp))
1437 			FATAL("write error on %s", filename(fp));
1438 	}
1439 	free(buf);
1440 	return(True);
1441 }
1442 
1443 Cell *arith(Node **a, int n)	/* a[0] + a[1], etc.  also -a[0] */
1444 {
1445 	Awkfloat i, j = 0;
1446 	double v;
1447 	Cell *x, *y, *z;
1448 
1449 	x = execute(a[0]);
1450 	i = getfval(x);
1451 	tempfree(x);
1452 	if (n != UMINUS && n != UPLUS) {
1453 		y = execute(a[1]);
1454 		j = getfval(y);
1455 		tempfree(y);
1456 	}
1457 	z = gettemp();
1458 	switch (n) {
1459 	case ADD:
1460 		i += j;
1461 		break;
1462 	case MINUS:
1463 		i -= j;
1464 		break;
1465 	case MULT:
1466 		i *= j;
1467 		break;
1468 	case DIVIDE:
1469 		if (j == 0)
1470 			FATAL("division by zero");
1471 		i /= j;
1472 		break;
1473 	case MOD:
1474 		if (j == 0)
1475 			FATAL("division by zero in mod");
1476 		modf(i/j, &v);
1477 		i = i - j * v;
1478 		break;
1479 	case UMINUS:
1480 		i = -i;
1481 		break;
1482 	case UPLUS: /* handled by getfval(), above */
1483 		break;
1484 	case POWER:
1485 		if (j >= 0 && modf(j, &v) == 0.0)	/* pos integer exponent */
1486 			i = ipow(i, (int) j);
1487                else {
1488 			errno = 0;
1489 			i = errcheck(pow(i, j), "pow");
1490                }
1491 		break;
1492 	default:	/* can't happen */
1493 		FATAL("illegal arithmetic operator %d", n);
1494 	}
1495 	setfval(z, i);
1496 	return(z);
1497 }
1498 
/*
 * ipow: x**n for integer n by repeated squaring.  Any n <= 0 yields 1.
 *
 * The bits of n are consumed from the most significant downwards,
 * which performs exactly the multiplications (and in the same order)
 * as the recursive formulation
 *	v = ipow(x, n/2);  result = (n even) ? v*v : x*v*v
 */
double ipow(double x, int n)	/* x**n.  ought to be done by pow, but isn't always */
{
	double acc = 1;
	int bit;

	if (n <= 0)
		return 1;

	/* locate the highest set bit of n */
	for (bit = 0; (n >> (bit + 1)) != 0; bit++)
		continue;

	for (; bit >= 0; bit--) {
		if ((n >> bit) & 1)
			acc = x * acc * acc;
		else
			acc = acc * acc;
	}
	return acc;
}
1511 
1512 Cell *incrdecr(Node **a, int n)		/* a[0]++, etc. */
1513 {
1514 	Cell *x, *z;
1515 	int k;
1516 	Awkfloat xf;
1517 
1518 	x = execute(a[0]);
1519 	xf = getfval(x);
1520 	k = (n == PREINCR || n == POSTINCR) ? 1 : -1;
1521 	if (n == PREINCR || n == PREDECR) {
1522 		setfval(x, xf + k);
1523 		return(x);
1524 	}
1525 	z = gettemp();
1526 	setfval(z, xf);
1527 	setfval(x, xf + k);
1528 	tempfree(x);
1529 	return(z);
1530 }
1531 
1532 Cell *assign(Node **a, int n)	/* a[0] = a[1], a[0] += a[1], etc. */
1533 {		/* this is subtle; don't muck with it. */
1534 	Cell *x, *y;
1535 	Awkfloat xf, yf;
1536 	double v;
1537 
1538 	y = execute(a[1]);
1539 	x = execute(a[0]);
1540 	if (n == ASSIGN) {	/* ordinary assignment */
1541 		if (x == y && !(x->tval & (FLD|REC)) && x != nfloc)
1542 			;	/* self-assignment: leave alone unless it's a field or NF */
1543 		else if ((y->tval & (STR|NUM)) == (STR|NUM)) {
1544 			setsval(x, getsval(y));
1545 			x->fval = getfval(y);
1546 			x->tval |= NUM;
1547 		}
1548 		else if (isstr(y))
1549 			setsval(x, getsval(y));
1550 		else if (isnum(y))
1551 			setfval(x, getfval(y));
1552 		else
1553 			funnyvar(y, "read value of");
1554 		tempfree(y);
1555 		return(x);
1556 	}
1557 	xf = getfval(x);
1558 	yf = getfval(y);
1559 	switch (n) {
1560 	case ADDEQ:
1561 		xf += yf;
1562 		break;
1563 	case SUBEQ:
1564 		xf -= yf;
1565 		break;
1566 	case MULTEQ:
1567 		xf *= yf;
1568 		break;
1569 	case DIVEQ:
1570 		if (yf == 0)
1571 			FATAL("division by zero in /=");
1572 		xf /= yf;
1573 		break;
1574 	case MODEQ:
1575 		if (yf == 0)
1576 			FATAL("division by zero in %%=");
1577 		modf(xf/yf, &v);
1578 		xf = xf - yf * v;
1579 		break;
1580 	case POWEQ:
1581 		if (yf >= 0 && modf(yf, &v) == 0.0)	/* pos integer exponent */
1582 			xf = ipow(xf, (int) yf);
1583                else {
1584 			errno = 0;
1585 			xf = errcheck(pow(xf, yf), "pow");
1586                }
1587 		break;
1588 	default:
1589 		FATAL("illegal assignment operator %d", n);
1590 		break;
1591 	}
1592 	tempfree(y);
1593 	setfval(x, xf);
1594 	return(x);
1595 }
1596 
1597 Cell *cat(Node **a, int q)	/* a[0] cat a[1] */
1598 {
1599 	Cell *x, *y, *z;
1600 	int n1, n2;
1601 	char *s = NULL;
1602 	int ssz = 0;
1603 
1604 	x = execute(a[0]);
1605 	n1 = strlen(getsval(x));
1606 	adjbuf(&s, &ssz, n1 + 1, recsize, 0, "cat1");
1607 	memcpy(s, x->sval, n1);
1608 
1609 	tempfree(x);
1610 
1611 	y = execute(a[1]);
1612 	n2 = strlen(getsval(y));
1613 	adjbuf(&s, &ssz, n1 + n2 + 1, recsize, 0, "cat2");
1614 	memcpy(s + n1, y->sval, n2);
1615 	s[n1 + n2] = '\0';
1616 
1617 	tempfree(y);
1618 
1619 	z = gettemp();
1620 	z->sval = s;
1621 	z->tval = STR;
1622 
1623 	return(z);
1624 }
1625 
1626 Cell *pastat(Node **a, int n)	/* a[0] { a[1] } */
1627 {
1628 	Cell *x;
1629 
1630 	if (a[0] == NULL)
1631 		x = execute(a[1]);
1632 	else {
1633 		x = execute(a[0]);
1634 		if (istrue(x)) {
1635 			tempfree(x);
1636 			x = execute(a[1]);
1637 		}
1638 	}
1639 	return x;
1640 }
1641 
1642 Cell *dopa2(Node **a, int n)	/* a[0], a[1] { a[2] } */
1643 {
1644 	Cell *x;
1645 	int pair;
1646 
1647 	pair = ptoi(a[3]);
1648 	if (pairstack[pair] == 0) {
1649 		x = execute(a[0]);
1650 		if (istrue(x))
1651 			pairstack[pair] = 1;
1652 		tempfree(x);
1653 	}
1654 	if (pairstack[pair] == 1) {
1655 		x = execute(a[1]);
1656 		if (istrue(x))
1657 			pairstack[pair] = 0;
1658 		tempfree(x);
1659 		x = execute(a[2]);
1660 		return(x);
1661 	}
1662 	return(False);
1663 }
1664 
1665 Cell *split(Node **a, int nnn)	/* split(a[0], a[1], a[2]); a[3] is type */
1666 {
1667 	Cell *x = NULL, *y, *ap;
1668 	const char *s, *origs, *t;
1669 	const char *fs = NULL;
1670 	char *origfs = NULL;
1671 	int sep;
1672 	char temp, num[50];
1673 	int j, n, tempstat, arg3type;
1674 	double result;
1675 
1676 	y = execute(a[0]);	/* source string */
1677 	origs = s = strdup(getsval(y));
1678 	if (s == NULL)
1679 		FATAL("out of space in split");
1680 	tempfree(y);
1681 	arg3type = ptoi(a[3]);
1682 	if (a[2] == NULL) {		/* BUG: CSV should override implicit fs but not explicit */
1683 		fs = getsval(fsloc);
1684 	} else if (arg3type == STRING) {	/* split(str,arr,"string") */
1685 		x = execute(a[2]);
1686 		fs = origfs = strdup(getsval(x));
1687 		if (fs == NULL)
1688 			FATAL("out of space in split");
1689 		tempfree(x);
1690 	} else if (arg3type == REGEXPR) {
1691 		fs = "(regexpr)";	/* split(str,arr,/regexpr/) */
1692 	} else {
1693 		FATAL("illegal type of split");
1694 	}
1695 	sep = *fs;
1696 	ap = execute(a[1]);	/* array name */
1697 	/* BUG 7/26/22: this appears not to reset array: see C1/asplit */
1698 	freesymtab(ap);
1699 	DPRINTF("split: s=|%s|, a=%s, sep=|%s|\n", s, NN(ap->nval), fs);
1700 	ap->tval &= ~STR;
1701 	ap->tval |= ARR;
1702 	ap->sval = (char *) makesymtab(NSYMTAB);
1703 
1704 	n = 0;
1705         if (arg3type == REGEXPR && strlen((char*)((fa*)a[2])->restr) == 0) {
1706 		/* split(s, a, //); have to arrange that it looks like empty sep */
1707 		arg3type = 0;
1708 		fs = "";
1709 		sep = 0;
1710 	}
1711 	if (*s != '\0' && (strlen(fs) > 1 || arg3type == REGEXPR)) {	/* reg expr */
1712 		fa *pfa;
1713 		if (arg3type == REGEXPR) {	/* it's ready already */
1714 			pfa = (fa *) a[2];
1715 		} else {
1716 			pfa = makedfa(fs, 1);
1717 		}
1718 		if (nematch(pfa,s)) {
1719 			tempstat = pfa->initstat;
1720 			pfa->initstat = 2;
1721 			do {
1722 				n++;
1723 				snprintf(num, sizeof(num), "%d", n);
1724 				temp = *patbeg;
1725 				setptr(patbeg, '\0');
1726 				if (is_number(s, & result))
1727 					setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1728 				else
1729 					setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1730 				setptr(patbeg, temp);
1731 				s = patbeg + patlen;
1732 				if (*(patbeg+patlen-1) == '\0' || *s == '\0') {
1733 					n++;
1734 					snprintf(num, sizeof(num), "%d", n);
1735 					setsymtab(num, "", 0.0, STR, (Array *) ap->sval);
1736 					pfa->initstat = tempstat;
1737 					goto spdone;
1738 				}
1739 			} while (nematch(pfa,s));
1740 			pfa->initstat = tempstat; 	/* bwk: has to be here to reset */
1741 							/* cf gsub and refldbld */
1742 		}
1743 		n++;
1744 		snprintf(num, sizeof(num), "%d", n);
1745 		if (is_number(s, & result))
1746 			setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1747 		else
1748 			setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1749   spdone:
1750 		pfa = NULL;
1751 
1752 	} else if (a[2] == NULL && CSV) {	/* CSV only if no explicit separator */
1753 		char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */
1754 		for (;;) {
1755 			char *fr = newt;
1756 			n++;
1757 			if (*s == '"' ) { /* start of "..." */
1758 				for (s++ ; *s != '\0'; ) {
1759 					if (*s == '"' && s[1] != '\0' && s[1] == '"') {
1760 						s += 2; /* doubled quote */
1761 						*fr++ = '"';
1762 					} else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) {
1763 						s++; /* skip over closing quote */
1764 						break;
1765 					} else {
1766 						*fr++ = *s++;
1767 					}
1768 				}
1769 				*fr++ = 0;
1770 			} else {	/* unquoted field */
1771 				while (*s != ',' && *s != '\0')
1772 					*fr++ = *s++;
1773 				*fr++ = 0;
1774 			}
1775 			snprintf(num, sizeof(num), "%d", n);
1776 			if (is_number(newt, &result))
1777 				setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval);
1778 			else
1779 				setsymtab(num, newt, 0.0, STR, (Array *) ap->sval);
1780 			if (*s++ == '\0')
1781 				break;
1782 		}
1783 		free(newt);
1784 
1785 	} else if (!CSV && sep == ' ') { /* usual case: split on white space */
1786 		for (n = 0; ; ) {
1787 #define ISWS(c)	((c) == ' ' || (c) == '\t' || (c) == '\n')
1788 			while (ISWS(*s))
1789 				s++;
1790 			if (*s == '\0')
1791 				break;
1792 			n++;
1793 			t = s;
1794 			do
1795 				s++;
1796 			while (*s != '\0' && !ISWS(*s));
1797 			temp = *s;
1798 			setptr(s, '\0');
1799 			snprintf(num, sizeof(num), "%d", n);
1800 			if (is_number(t, & result))
1801 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1802 			else
1803 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1804 			setptr(s, temp);
1805 			if (*s != '\0')
1806 				s++;
1807 		}
1808 
1809 	} else if (sep == 0) {	/* new: split(s, a, "") => 1 char/elem */
1810 		for (n = 0; *s != '\0'; s += u8_nextlen(s)) {
1811 			char buf[10];
1812 			n++;
1813 			snprintf(num, sizeof(num), "%d", n);
1814 
1815 			for (j = 0; j < u8_nextlen(s); j++) {
1816 				buf[j] = s[j];
1817 			}
1818 			buf[j] = '\0';
1819 
1820 			if (isdigit((uschar)buf[0]))
1821 				setsymtab(num, buf, atof(buf), STR|NUM, (Array *) ap->sval);
1822 			else
1823 				setsymtab(num, buf, 0.0, STR, (Array *) ap->sval);
1824 		}
1825 
1826 	} else if (*s != '\0') {  /* some random single character */
1827 		for (;;) {
1828 			n++;
1829 			t = s;
1830 			while (*s != sep && *s != '\n' && *s != '\0')
1831 				s++;
1832 			temp = *s;
1833 			setptr(s, '\0');
1834 			snprintf(num, sizeof(num), "%d", n);
1835 			if (is_number(t, & result))
1836 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1837 			else
1838 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1839 			setptr(s, temp);
1840 			if (*s++ == '\0')
1841 				break;
1842 		}
1843 	}
1844 	tempfree(ap);
1845 	xfree(origs);
1846 	xfree(origfs);
1847 	x = gettemp();
1848 	x->tval = NUM;
1849 	x->fval = n;
1850 	return(x);
1851 }
1852 
1853 Cell *condexpr(Node **a, int n)	/* a[0] ? a[1] : a[2] */
1854 {
1855 	Cell *x;
1856 
1857 	x = execute(a[0]);
1858 	if (istrue(x)) {
1859 		tempfree(x);
1860 		x = execute(a[1]);
1861 	} else {
1862 		tempfree(x);
1863 		x = execute(a[2]);
1864 	}
1865 	return(x);
1866 }
1867 
1868 Cell *ifstat(Node **a, int n)	/* if (a[0]) a[1]; else a[2] */
1869 {
1870 	Cell *x;
1871 
1872 	x = execute(a[0]);
1873 	if (istrue(x)) {
1874 		tempfree(x);
1875 		x = execute(a[1]);
1876 	} else if (a[2] != NULL) {
1877 		tempfree(x);
1878 		x = execute(a[2]);
1879 	}
1880 	return(x);
1881 }
1882 
1883 Cell *whilestat(Node **a, int n)	/* while (a[0]) a[1] */
1884 {
1885 	Cell *x;
1886 
1887 	for (;;) {
1888 		x = execute(a[0]);
1889 		if (!istrue(x))
1890 			return(x);
1891 		tempfree(x);
1892 		x = execute(a[1]);
1893 		if (isbreak(x)) {
1894 			x = True;
1895 			return(x);
1896 		}
1897 		if (isnext(x) || isexit(x) || isret(x))
1898 			return(x);
1899 		tempfree(x);
1900 	}
1901 }
1902 
1903 Cell *dostat(Node **a, int n)	/* do a[0]; while(a[1]) */
1904 {
1905 	Cell *x;
1906 
1907 	for (;;) {
1908 		x = execute(a[0]);
1909 		if (isbreak(x))
1910 			return True;
1911 		if (isnext(x) || isexit(x) || isret(x))
1912 			return(x);
1913 		tempfree(x);
1914 		x = execute(a[1]);
1915 		if (!istrue(x))
1916 			return(x);
1917 		tempfree(x);
1918 	}
1919 }
1920 
1921 Cell *forstat(Node **a, int n)	/* for (a[0]; a[1]; a[2]) a[3] */
1922 {
1923 	Cell *x;
1924 
1925 	x = execute(a[0]);
1926 	tempfree(x);
1927 	for (;;) {
1928 		if (a[1]!=NULL) {
1929 			x = execute(a[1]);
1930 			if (!istrue(x)) return(x);
1931 			else tempfree(x);
1932 		}
1933 		x = execute(a[3]);
1934 		if (isbreak(x))		/* turn off break */
1935 			return True;
1936 		if (isnext(x) || isexit(x) || isret(x))
1937 			return(x);
1938 		tempfree(x);
1939 		x = execute(a[2]);
1940 		tempfree(x);
1941 	}
1942 }
1943 
1944 Cell *instat(Node **a, int n)	/* for (a[0] in a[1]) a[2] */
1945 {
1946 	Cell *x, *vp, *arrayp, *cp, *ncp;
1947 	Array *tp;
1948 	int i;
1949 
1950 	vp = execute(a[0]);
1951 	arrayp = execute(a[1]);
1952 	if (!isarr(arrayp)) {
1953 		return True;
1954 	}
1955 	tp = (Array *) arrayp->sval;
1956 	tempfree(arrayp);
1957 	for (i = 0; i < tp->size; i++) {	/* this routine knows too much */
1958 		for (cp = tp->tab[i]; cp != NULL; cp = ncp) {
1959 			setsval(vp, cp->nval);
1960 			ncp = cp->cnext;
1961 			x = execute(a[2]);
1962 			if (isbreak(x)) {
1963 				tempfree(vp);
1964 				return True;
1965 			}
1966 			if (isnext(x) || isexit(x) || isret(x)) {
1967 				tempfree(vp);
1968 				return(x);
1969 			}
1970 			tempfree(x);
1971 		}
1972 	}
1973 	return True;
1974 }
1975 
1976 static char *nawk_convert(const char *s, int (*fun_c)(int),
1977     wint_t (*fun_wc)(wint_t))
1978 {
1979 	char *buf      = NULL;
1980 	char *pbuf     = NULL;
1981 	const char *ps = NULL;
1982 	size_t n       = 0;
1983 	wchar_t wc;
1984 	const size_t sz = awk_mb_cur_max;
1985 	int unused;
1986 
1987 	if (sz == 1) {
1988 		buf = tostring(s);
1989 
1990 		for (pbuf = buf; *pbuf; pbuf++)
1991 			*pbuf = fun_c((uschar)*pbuf);
1992 
1993 		return buf;
1994 	} else {
1995 		/* upper/lower character may be shorter/longer */
1996 		buf = tostringN(s, strlen(s) * sz + 1);
1997 
1998 		(void) mbtowc(NULL, NULL, 0);	/* reset internal state */
1999 		/*
2000 		 * Reset internal state here too.
2001 		 * Assign result to avoid a compiler warning. (Casting to void
2002 		 * doesn't work.)
2003 		 * Increment said variable to avoid a different warning.
2004 		 */
2005 		unused = wctomb(NULL, L'\0');
2006 		unused++;
2007 
2008 		ps   = s;
2009 		pbuf = buf;
2010 		while (n = mbtowc(&wc, ps, sz),
2011 		       n > 0 && n != (size_t)-1 && n != (size_t)-2)
2012 		{
2013 			ps += n;
2014 
2015 			n = wctomb(pbuf, fun_wc(wc));
2016 			if (n == (size_t)-1)
2017 				FATAL("illegal wide character %s", s);
2018 
2019 			pbuf += n;
2020 		}
2021 
2022 		*pbuf = '\0';
2023 
2024 		if (n)
2025 			FATAL("illegal byte sequence %s", s);
2026 
2027 		return buf;
2028 	}
2029 }
2030 
#ifdef __DJGPP__
/*
 * Fallback towupper() for DJGPP builds: values in the range 0..255 are
 * converted with toupper(); everything else is returned unchanged.
 */
static wint_t towupper(wint_t wc)
{
	return (wc >= 0 && wc < 256) ? toupper(wc & 0xFF) : wc;
}

/*
 * Fallback towlower() for DJGPP builds: values in the range 0..255 are
 * converted with tolower(); everything else is returned unchanged.
 */
static wint_t towlower(wint_t wc)
{
	return (wc >= 0 && wc < 256) ? tolower(wc & 0xFF) : wc;
}
#endif
2048 
/*
 * nawk_toupper: return a newly allocated upper-case copy of s, using
 * toupper() or towupper() as nawk_convert() sees fit.
 */
static char *nawk_toupper(const char *s)
{
	char *upper = nawk_convert(s, toupper, towupper);

	return upper;
}
2053 
/*
 * nawk_tolower: return a newly allocated lower-case copy of s, using
 * tolower() or towlower() as nawk_convert() sees fit.
 */
static char *nawk_tolower(const char *s)
{
	char *lower = nawk_convert(s, tolower, towlower);

	return lower;
}
2058 
2059 Cell *bltin(Node **a, int n)	/* builtin functions. a[0] is type, a[1] is arg list */
2060 {
2061 	Cell *x, *y;
2062 	Awkfloat u;
2063 	int t, sz;
2064 	Awkfloat tmp;
2065 	char *buf, *fmt;
2066 	Node *nextarg;
2067 	FILE *fp;
2068 	int status = 0;
2069 	time_t tv;
2070 	struct tm *tm, tmbuf;
2071 
2072 	t = ptoi(a[0]);
2073 	x = execute(a[1]);
2074 	nextarg = a[1]->nnext;
2075 	switch (t) {
2076 	case FLENGTH:
2077 		if (isarr(x))
2078 			u = ((Array *) x->sval)->nelem;	/* GROT.  should be function*/
2079 		else
2080 			u = u8_strlen(getsval(x));
2081 		break;
2082 	case FLOG:
2083 		errno = 0;
2084 		u = errcheck(log(getfval(x)), "log");
2085 		break;
2086 	case FINT:
2087 		modf(getfval(x), &u); break;
2088 	case FEXP:
2089 		errno = 0;
2090 		u = errcheck(exp(getfval(x)), "exp");
2091 		break;
2092 	case FSQRT:
2093 		errno = 0;
2094 		u = errcheck(sqrt(getfval(x)), "sqrt");
2095 		break;
2096 	case FSIN:
2097 		u = sin(getfval(x)); break;
2098 	case FCOS:
2099 		u = cos(getfval(x)); break;
2100 	case FATAN:
2101 		if (nextarg == NULL) {
2102 			WARNING("atan2 requires two arguments; returning 1.0");
2103 			u = 1.0;
2104 		} else {
2105 			y = execute(a[1]->nnext);
2106 			u = atan2(getfval(x), getfval(y));
2107 			tempfree(y);
2108 			nextarg = nextarg->nnext;
2109 		}
2110 		break;
2111 	case FCOMPL:
2112 		u = ~((int)getfval(x));
2113 		break;
2114 	case FAND:
2115 		if (nextarg == 0) {
2116 			WARNING("and requires two arguments; returning 0");
2117 			u = 0;
2118 			break;
2119 		}
2120 		y = execute(a[1]->nnext);
2121 		u = ((int)getfval(x)) & ((int)getfval(y));
2122 		tempfree(y);
2123 		nextarg = nextarg->nnext;
2124 		break;
2125 	case FFOR:
2126 		if (nextarg == 0) {
2127 			WARNING("or requires two arguments; returning 0");
2128 			u = 0;
2129 			break;
2130 		}
2131 		y = execute(a[1]->nnext);
2132 		u = ((int)getfval(x)) | ((int)getfval(y));
2133 		tempfree(y);
2134 		nextarg = nextarg->nnext;
2135 		break;
2136 	case FXOR:
2137 		if (nextarg == 0) {
2138 			WARNING("xor requires two arguments; returning 0");
2139 			u = 0;
2140 			break;
2141 		}
2142 		y = execute(a[1]->nnext);
2143 		u = ((int)getfval(x)) ^ ((int)getfval(y));
2144 		tempfree(y);
2145 		nextarg = nextarg->nnext;
2146 		break;
2147 	case FLSHIFT:
2148 		if (nextarg == 0) {
2149 			WARNING("lshift requires two arguments; returning 0");
2150 			u = 0;
2151 			break;
2152 		}
2153 		y = execute(a[1]->nnext);
2154 		u = ((int)getfval(x)) << ((int)getfval(y));
2155 		tempfree(y);
2156 		nextarg = nextarg->nnext;
2157 		break;
2158 	case FRSHIFT:
2159 		if (nextarg == 0) {
2160 			WARNING("rshift requires two arguments; returning 0");
2161 			u = 0;
2162 			break;
2163 		}
2164 		y = execute(a[1]->nnext);
2165 		u = ((int)getfval(x)) >> ((int)getfval(y));
2166 		tempfree(y);
2167 		nextarg = nextarg->nnext;
2168 		break;
2169 	case FSYSTEM:
2170 		fflush(stdout);		/* in case something is buffered already */
2171 		status = system(getsval(x));
2172 		u = status;
2173 		if (status != -1) {
2174 			if (WIFEXITED(status)) {
2175 				u = WEXITSTATUS(status);
2176 			} else if (WIFSIGNALED(status)) {
2177 				u = WTERMSIG(status) + 256;
2178 #ifdef WCOREDUMP
2179 				if (WCOREDUMP(status))
2180 					u += 256;
2181 #endif
2182 			} else	/* something else?!? */
2183 				u = 0;
2184 		}
2185 		break;
2186 	case FRAND:
2187 		/* random() returns numbers in [0..2^31-1]
2188 		 * in order to get a number in [0, 1), divide it by 2^31
2189 		 */
2190 		u = (Awkfloat) random() / (0x7fffffffL + 0x1UL);
2191 		break;
2192 	case FSRAND:
2193 		if (isrec(x)) {		/* no argument provided */
2194 			u = time(NULL);
2195 			tmp = u;
2196 			srandom((unsigned int) u);
2197 		} else {
2198 			u = getfval(x);
2199 			tmp = u;
2200 			srandom_deterministic((unsigned int) u);
2201 		}
2202 		u = srand_seed;
2203 		srand_seed = tmp;
2204 		break;
2205 	case FTOUPPER:
2206 	case FTOLOWER:
2207 		if (t == FTOUPPER)
2208 			buf = nawk_toupper(getsval(x));
2209 		else
2210 			buf = nawk_tolower(getsval(x));
2211 		tempfree(x);
2212 		x = gettemp();
2213 		setsval(x, buf);
2214 		free(buf);
2215 		return x;
2216 	case FFLUSH:
2217 		if (isrec(x) || strlen(getsval(x)) == 0) {
2218 			flush_all();	/* fflush() or fflush("") -> all */
2219 			u = 0;
2220 		} else if ((fp = openfile(FFLUSH, getsval(x), NULL)) == NULL)
2221 			u = EOF;
2222 		else
2223 			u = fflush(fp);
2224 		break;
2225 	case FMKTIME:
2226 		memset(&tmbuf, 0, sizeof(tmbuf));
2227 		tm = &tmbuf;
2228 		t = sscanf(getsval(x), "%d %d %d %d %d %d %d",
2229 		    &tm->tm_year, &tm->tm_mon, &tm->tm_mday, &tm->tm_hour,
2230 		    &tm->tm_min, &tm->tm_sec, &tm->tm_isdst);
2231 		switch (t) {
2232 		case 6:
2233 			tm->tm_isdst = -1;	/* let mktime figure it out */
2234 			/* FALLTHROUGH */
2235 		case 7:
2236 			tm->tm_year -= 1900;
2237 			tm->tm_mon--;
2238 			u = mktime(tm);
2239 			break;
2240 		default:
2241 			u = -1;
2242 			break;
2243 		}
2244 		break;
2245 	case FSYSTIME:
2246 		u = time((time_t *) 0);
2247 		break;
2248 	case FSTRFTIME:
2249 		/* strftime([format [,timestamp]]) */
2250 		if (nextarg) {
2251 			y = execute(nextarg);
2252 			nextarg = nextarg->nnext;
2253 			tv = (time_t) getfval(y);
2254 			tempfree(y);
2255 		} else
2256 			tv = time((time_t *) 0);
2257 		tm = localtime(&tv);
2258 		if (tm == NULL)
2259 			FATAL("bad time %ld", (long)tv);
2260 
2261 		if (isrec(x)) {
2262 			/* format argument not provided, use default */
2263 			fmt = tostring("%a %b %d %H:%M:%S %Z %Y");
2264 		} else
2265 			fmt = tostring(getsval(x));
2266 
2267 		sz = 32;
2268 		buf = NULL;
2269 		do {
2270 			if ((buf = (char *) reallocarray(buf, 2, sz)) == NULL)
2271 				FATAL("out of memory in strftime");
2272 			sz *= 2;
2273 		} while (strftime(buf, sz, fmt, tm) == 0 && fmt[0] != '\0');
2274 
2275 		y = gettemp();
2276 		setsval(y, buf);
2277 		free(fmt);
2278 		free(buf);
2279 
2280 		return y;
2281 	default:	/* can't happen */
2282 		FATAL("illegal function type %d", t);
2283 		break;
2284 	}
2285 	tempfree(x);
2286 	x = gettemp();
2287 	setfval(x, u);
2288 	if (nextarg != NULL) {
2289 		WARNING("warning: function has too many arguments");
2290 		for ( ; nextarg; nextarg = nextarg->nnext) {
2291 			y = execute(nextarg);
2292 			tempfree(y);
2293 		}
2294 	}
2295 	return(x);
2296 }
2297 
2298 Cell *printstat(Node **a, int n)	/* print a[0] */
2299 {
2300 	Node *x;
2301 	Cell *y;
2302 	FILE *fp;
2303 
2304 	if (a[1] == NULL)	/* a[1] is redirection operator, a[2] is file */
2305 		fp = stdout;
2306 	else
2307 		fp = redirect(ptoi(a[1]), a[2]);
2308 	for (x = a[0]; x != NULL; x = x->nnext) {
2309 		y = execute(x);
2310 		fputs(getpssval(y), fp);
2311 		tempfree(y);
2312 		if (x->nnext == NULL)
2313 			fputs(getsval(orsloc), fp);
2314 		else
2315 			fputs(getsval(ofsloc), fp);
2316 	}
2317 	if (a[1] != NULL)
2318 		fflush(fp);
2319 	if (ferror(fp))
2320 		FATAL("write error on %s", filename(fp));
2321 	return(True);
2322 }
2323 
2324 Cell *nullproc(Node **a, int n)
2325 {
2326 	return 0;
2327 }
2328 
2329 
2330 FILE *redirect(int a, Node *b)	/* set up all i/o redirections */
2331 {
2332 	FILE *fp;
2333 	Cell *x;
2334 	char *fname;
2335 
2336 	x = execute(b);
2337 	fname = getsval(x);
2338 	fp = openfile(a, fname, NULL);
2339 	if (fp == NULL)
2340 		FATAL("can't open file %s", fname);
2341 	tempfree(x);
2342 	return fp;
2343 }
2344 
/* One entry per stream that awk has open. */
struct files {
	FILE	*fp;		/* the stream; NULL marks a free slot */
	const char	*fname;	/* name the stream was opened under */
	int	mode;	/* '|', 'a', 'w' => LE/LT, GT */
};

/* table of open streams, allocated by stdinit() */
struct files *files;

/* number of slots in the files table */
size_t nfiles;
2352 
2353 static void stdinit(void)	/* in case stdin, etc., are not constants */
2354 {
2355 	nfiles = FOPEN_MAX;
2356 	files = (struct files *) calloc(nfiles, sizeof(*files));
2357 	if (files == NULL)
2358 		FATAL("can't allocate file memory for %zu files", nfiles);
2359         files[0].fp = stdin;
2360 	files[0].fname = tostring("/dev/stdin");
2361 	files[0].mode = LT;
2362         files[1].fp = stdout;
2363 	files[1].fname = tostring("/dev/stdout");
2364 	files[1].mode = GT;
2365         files[2].fp = stderr;
2366 	files[2].fname = tostring("/dev/stderr");
2367 	files[2].mode = GT;
2368 }
2369 
2370 FILE *openfile(int a, const char *us, bool *pnewflag)
2371 {
2372 	const char *s = us;
2373 	size_t i;
2374 	int m;
2375 	FILE *fp = NULL;
2376 
2377 	if (*s == '\0')
2378 		FATAL("null file name in print or getline");
2379 	for (i = 0; i < nfiles; i++)
2380 		if (files[i].fname && strcmp(s, files[i].fname) == 0 &&
2381 		    (a == files[i].mode || (a==APPEND && files[i].mode==GT) ||
2382 		     a == FFLUSH)) {
2383 			if (pnewflag)
2384 				*pnewflag = false;
2385 			return files[i].fp;
2386 		}
2387 	if (a == FFLUSH)	/* didn't find it, so don't create it! */
2388 		return NULL;
2389 
2390 	for (i = 0; i < nfiles; i++)
2391 		if (files[i].fp == NULL)
2392 			break;
2393 	if (i >= nfiles) {
2394 		struct files *nf;
2395 		size_t nnf = nfiles + FOPEN_MAX;
2396 		nf = (struct files *) reallocarray(files, nnf, sizeof(*nf));
2397 		if (nf == NULL)
2398 			FATAL("cannot grow files for %s and %zu files", s, nnf);
2399 		memset(&nf[nfiles], 0, FOPEN_MAX * sizeof(*nf));
2400 		nfiles = nnf;
2401 		files = nf;
2402 	}
2403 	fflush(stdout);	/* force a semblance of order */
2404 	m = a;
2405 	if (a == GT) {
2406 		fp = fopen(s, "w");
2407 	} else if (a == APPEND) {
2408 		fp = fopen(s, "a");
2409 		m = GT;	/* so can mix > and >> */
2410 	} else if (a == '|') {	/* output pipe */
2411 		fp = popen(s, "w");
2412 	} else if (a == LE) {	/* input pipe */
2413 		fp = popen(s, "r");
2414 	} else if (a == LT) {	/* getline <file */
2415 		fp = strcmp(s, "-") == 0 ? stdin : fopen(s, "r");	/* "-" is stdin */
2416 	} else	/* can't happen */
2417 		FATAL("illegal redirection %d", a);
2418 	if (fp != NULL) {
2419 		files[i].fname = tostring(s);
2420 		files[i].fp = fp;
2421 		files[i].mode = m;
2422 		if (pnewflag)
2423 			*pnewflag = true;
2424 		if (fp != stdin && fp != stdout && fp != stderr)
2425 			(void) fcntl(fileno(fp), F_SETFD, FD_CLOEXEC);
2426 	}
2427 	return fp;
2428 }
2429 
2430 const char *filename(FILE *fp)
2431 {
2432 	size_t i;
2433 
2434 	for (i = 0; i < nfiles; i++)
2435 		if (fp == files[i].fp)
2436 			return files[i].fname;
2437 	return "???";
2438 }
2439 
2440 Cell *closefile(Node **a, int n)
2441 {
2442  	Cell *x;
2443 	size_t i;
2444 	bool stat;
2445 
2446  	x = execute(a[0]);
2447  	getsval(x);
2448 	stat = true;
2449  	for (i = 0; i < nfiles; i++) {
2450 		if (!files[i].fname || strcmp(x->sval, files[i].fname) != 0)
2451 			continue;
2452 		if (files[i].mode == GT || files[i].mode == '|')
2453 			fflush(files[i].fp);
2454 		if (ferror(files[i].fp)) {
2455 			if ((files[i].mode == GT && files[i].fp != stderr)
2456 			  || files[i].mode == '|')
2457 				FATAL("write error on %s", files[i].fname);
2458 			else
2459 				WARNING("i/o error occurred on %s", files[i].fname);
2460 		}
2461 		if (files[i].fp == stdin || files[i].fp == stdout ||
2462 		    files[i].fp == stderr)
2463 			stat = freopen("/dev/null", "r+", files[i].fp) == NULL;
2464 		else if (files[i].mode == '|' || files[i].mode == LE)
2465 			stat = pclose(files[i].fp) == -1;
2466 		else
2467 			stat = fclose(files[i].fp) == EOF;
2468 		if (stat)
2469 			WARNING("i/o error occurred closing %s", files[i].fname);
2470 		xfree(files[i].fname);
2471 		files[i].fname = NULL;	/* watch out for ref thru this */
2472 		files[i].fp = NULL;
2473 		break;
2474  	}
2475  	tempfree(x);
2476  	x = gettemp();
2477 	setfval(x, (Awkfloat) (stat ? -1 : 0));
2478  	return(x);
2479 }
2480 
2481 void closeall(void)
2482 {
2483 	size_t i;
2484 	bool stat = false;
2485 
2486 	for (i = 0; i < nfiles; i++) {
2487 		if (! files[i].fp)
2488 			continue;
2489 		if (files[i].mode == GT || files[i].mode == '|')
2490 			fflush(files[i].fp);
2491 		if (ferror(files[i].fp)) {
2492 			if ((files[i].mode == GT && files[i].fp != stderr)
2493 			  || files[i].mode == '|')
2494 				FATAL("write error on %s", files[i].fname);
2495 			else
2496 				WARNING("i/o error occurred on %s", files[i].fname);
2497 		}
2498 		if (files[i].fp == stdin || files[i].fp == stdout ||
2499 		    files[i].fp == stderr)
2500 			continue;
2501 		if (files[i].mode == '|' || files[i].mode == LE)
2502 			stat = pclose(files[i].fp) == -1;
2503 		else
2504 			stat = fclose(files[i].fp) == EOF;
2505 		if (stat)
2506 			WARNING("i/o error occurred while closing %s", files[i].fname);
2507 	}
2508 }
2509 
2510 static void flush_all(void)
2511 {
2512 	size_t i;
2513 
2514 	for (i = 0; i < nfiles; i++)
2515 		if (files[i].fp)
2516 			fflush(files[i].fp);
2517 }
2518 
2519 void backsub(char **pb_ptr, const char **sptr_ptr);
2520 
2521 Cell *sub(Node **a, int nnn)	/* substitute command */
2522 {
2523 	const char *sptr, *q;
2524 	Cell *x, *y, *result;
2525 	char *t, *buf, *pb;
2526 	fa *pfa;
2527 	int bufsz = recsize;
2528 
2529 	if ((buf = (char *) malloc(bufsz)) == NULL)
2530 		FATAL("out of memory in sub");
2531 	x = execute(a[3]);	/* target string */
2532 	t = getsval(x);
2533 	if (a[0] == NULL)	/* 0 => a[1] is already-compiled regexpr */
2534 		pfa = (fa *) a[1];	/* regular expression */
2535 	else {
2536 		y = execute(a[1]);
2537 		pfa = makedfa(getsval(y), 1);
2538 		tempfree(y);
2539 	}
2540 	y = execute(a[2]);	/* replacement string */
2541 	result = False;
2542 	if (pmatch(pfa, t)) {
2543 		sptr = t;
2544 		adjbuf(&buf, &bufsz, 1+patbeg-sptr, recsize, 0, "sub");
2545 		pb = buf;
2546 		while (sptr < patbeg)
2547 			*pb++ = *sptr++;
2548 		sptr = getsval(y);
2549 		while (*sptr != '\0') {
2550 			adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "sub");
2551 			if (*sptr == '\\') {
2552 				backsub(&pb, &sptr);
2553 			} else if (*sptr == '&') {
2554 				sptr++;
2555 				adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "sub");
2556 				for (q = patbeg; q < patbeg+patlen; )
2557 					*pb++ = *q++;
2558 			} else
2559 				*pb++ = *sptr++;
2560 		}
2561 		*pb = '\0';
2562 		if (pb > buf + bufsz)
2563 			FATAL("sub result1 %.30s too big; can't happen", buf);
2564 		sptr = patbeg + patlen;
2565 		if ((patlen == 0 && *patbeg) || (patlen && *(sptr-1))) {
2566 			adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "sub");
2567 			while ((*pb++ = *sptr++) != '\0')
2568 				continue;
2569 		}
2570 		if (pb > buf + bufsz)
2571 			FATAL("sub result2 %.30s too big; can't happen", buf);
2572 		setsval(x, buf);	/* BUG: should be able to avoid copy */
2573 		result = True;
2574 	}
2575 	tempfree(x);
2576 	tempfree(y);
2577 	free(buf);
2578 	return result;
2579 }
2580 
2581 Cell *gsub(Node **a, int nnn)	/* global substitute */
2582 {
2583 	Cell *x, *y;
2584 	char *rptr, *pb;
2585 	const char *q, *t, *sptr;
2586 	char *buf;
2587 	fa *pfa;
2588 	int mflag, tempstat, num;
2589 	int bufsz = recsize;
2590 	int charlen = 0;
2591 
2592 	if ((buf = (char *) malloc(bufsz)) == NULL)
2593 		FATAL("out of memory in gsub");
2594 	mflag = 0;	/* if mflag == 0, can replace empty string */
2595 	num = 0;
2596 	x = execute(a[3]);	/* target string */
2597 	t = getsval(x);
2598 	if (a[0] == NULL)	/* 0 => a[1] is already-compiled regexpr */
2599 		pfa = (fa *) a[1];	/* regular expression */
2600 	else {
2601 		y = execute(a[1]);
2602 		pfa = makedfa(getsval(y), 1);
2603 		tempfree(y);
2604 	}
2605 	y = execute(a[2]);	/* replacement string */
2606 	if (pmatch(pfa, t)) {
2607 		tempstat = pfa->initstat;
2608 		pfa->initstat = 2;
2609 		pb = buf;
2610 		rptr = getsval(y);
2611 		do {
2612 			if (patlen == 0 && *patbeg != '\0') {	/* matched empty string */
2613 				if (mflag == 0) {	/* can replace empty */
2614 					num++;
2615 					sptr = rptr;
2616 					while (*sptr != '\0') {
2617 						adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");
2618 						if (*sptr == '\\') {
2619 							backsub(&pb, &sptr);
2620 						} else if (*sptr == '&') {
2621 							sptr++;
2622 							adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");
2623 							for (q = patbeg; q < patbeg+patlen; )
2624 								*pb++ = *q++;
2625 						} else
2626 							*pb++ = *sptr++;
2627 					}
2628 				}
2629 				if (*t == '\0')	/* at end */
2630 					goto done;
2631 				adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gsub");
2632 				charlen = u8_nextlen(t);
2633 				while (charlen-- > 0)
2634 					*pb++ = *t++;
2635 				if (pb > buf + bufsz)	/* BUG: not sure of this test */
2636 					FATAL("gsub result0 %.30s too big; can't happen", buf);
2637 				mflag = 0;
2638 			}
2639 			else {	/* matched nonempty string */
2640 				num++;
2641 				sptr = t;
2642 				adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gsub");
2643 				while (sptr < patbeg)
2644 					*pb++ = *sptr++;
2645 				sptr = rptr;
2646 				while (*sptr != '\0') {
2647 					adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");
2648 					if (*sptr == '\\') {
2649 						backsub(&pb, &sptr);
2650 					} else if (*sptr == '&') {
2651 						sptr++;
2652 						adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");
2653 						for (q = patbeg; q < patbeg+patlen; )
2654 							*pb++ = *q++;
2655 					} else
2656 						*pb++ = *sptr++;
2657 				}
2658 				t = patbeg + patlen;
2659 				if (patlen == 0 || *t == '\0' || *(t-1) == '\0')
2660 					goto done;
2661 				if (pb > buf + bufsz)
2662 					FATAL("gsub result1 %.30s too big; can't happen", buf);
2663 				mflag = 1;
2664 			}
2665 		} while (pmatch(pfa,t));
2666 		sptr = t;
2667 		adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gsub");
2668 		while ((*pb++ = *sptr++) != '\0')
2669 			continue;
2670 	done:	if (pb < buf + bufsz)
2671 			*pb = '\0';
2672 		else if (*(pb-1) != '\0')
2673 			FATAL("gsub result2 %.30s truncated; can't happen", buf);
2674 		setsval(x, buf);	/* BUG: should be able to avoid copy + free */
2675 		pfa->initstat = tempstat;
2676 	}
2677 	tempfree(x);
2678 	tempfree(y);
2679 	x = gettemp();
2680 	x->tval = NUM;
2681 	x->fval = num;
2682 	free(buf);
2683 	return(x);
2684 }
2685 
2686 Cell *gensub(Node **a, int nnn)	/* global selective substitute */
2687 	/* XXX incomplete - doesn't support backreferences \0 ... \9 */
2688 {
2689 	Cell *x, *y, *res, *h;
2690 	char *rptr;
2691 	const char *sptr;
2692 	char *buf, *pb;
2693 	const char *t, *q;
2694 	fa *pfa;
2695 	int mflag, tempstat, num, whichm;
2696 	int bufsz = recsize;
2697 
2698 	if ((buf = malloc(bufsz)) == NULL)
2699 		FATAL("out of memory in gensub");
2700 	mflag = 0;	/* if mflag == 0, can replace empty string */
2701 	num = 0;
2702 	x = execute(a[4]);	/* source string */
2703 	t = getsval(x);
2704 	res = copycell(x);	/* target string - initially copy of source */
2705 	res->csub = CTEMP;	/* result values are temporary */
2706 	if (a[0] == 0)		/* 0 => a[1] is already-compiled regexpr */
2707 		pfa = (fa *) a[1];	/* regular expression */
2708 	else {
2709 		y = execute(a[1]);
2710 		pfa = makedfa(getsval(y), 1);
2711 		tempfree(y);
2712 	}
2713 	y = execute(a[2]);	/* replacement string */
2714 	h = execute(a[3]);	/* which matches should be replaced */
2715 	sptr = getsval(h);
2716 	if (sptr[0] == 'g' || sptr[0] == 'G')
2717 		whichm = -1;
2718 	else {
2719 		/*
2720 		 * The specified number is index of replacement, starting
2721 		 * from 1. GNU awk treats index lower than 0 same as
2722 		 * 1, we do same for compatibility.
2723 		 */
2724 		whichm = (int) getfval(h) - 1;
2725 		if (whichm < 0)
2726 			whichm = 0;
2727 	}
2728 	tempfree(h);
2729 
2730 	if (pmatch(pfa, t)) {
2731 		char *sl;
2732 
2733 		tempstat = pfa->initstat;
2734 		pfa->initstat = 2;
2735 		pb = buf;
2736 		rptr = getsval(y);
2737 		/*
2738 		 * XXX if there are any backreferences in subst string,
2739 		 * complain now.
2740 		 */
2741 		for (sl = rptr; (sl = strchr(sl, '\\')) && sl[1]; sl++) {
2742 			if (strchr("0123456789", sl[1])) {
2743 				FATAL("gensub doesn't support backreferences (subst \"%s\")", rptr);
2744 			}
2745 		}
2746 
2747 		do {
2748 			if (whichm >= 0 && whichm != num) {
2749 				num++;
2750 				adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - t) + patlen, recsize, &pb, "gensub");
2751 
2752 				/* copy the part of string up to and including
2753 				 * match to output buffer */
2754 				while (t < patbeg + patlen)
2755 					*pb++ = *t++;
2756 				continue;
2757 			}
2758 
2759 			if (patlen == 0 && *patbeg != 0) {	/* matched empty string */
2760 				if (mflag == 0) {	/* can replace empty */
2761 					num++;
2762 					sptr = rptr;
2763 					while (*sptr != 0) {
2764 						adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
2765 						if (*sptr == '\\') {
2766 							backsub(&pb, &sptr);
2767 						} else if (*sptr == '&') {
2768 							sptr++;
2769 							adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
2770 							for (q = patbeg; q < patbeg+patlen; )
2771 								*pb++ = *q++;
2772 						} else
2773 							*pb++ = *sptr++;
2774 					}
2775 				}
2776 				if (*t == 0)	/* at end */
2777 					goto done;
2778 				adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gensub");
2779 				*pb++ = *t++;
2780 				if (pb > buf + bufsz)	/* BUG: not sure of this test */
2781 					FATAL("gensub result0 %.30s too big; can't happen", buf);
2782 				mflag = 0;
2783 			}
2784 			else {	/* matched nonempty string */
2785 				num++;
2786 				sptr = t;
2787 				adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gensub");
2788 				while (sptr < patbeg)
2789 					*pb++ = *sptr++;
2790 				sptr = rptr;
2791 				while (*sptr != 0) {
2792 					adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
2793 					if (*sptr == '\\') {
2794 						backsub(&pb, &sptr);
2795 					} else if (*sptr == '&') {
2796 						sptr++;
2797 						adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
2798 						for (q = patbeg; q < patbeg+patlen; )
2799 							*pb++ = *q++;
2800 					} else
2801 						*pb++ = *sptr++;
2802 				}
2803 				t = patbeg + patlen;
2804 				if (patlen == 0 || *t == 0 || *(t-1) == 0)
2805 					goto done;
2806 				if (pb > buf + bufsz)
2807 					FATAL("gensub result1 %.30s too big; can't happen", buf);
2808 				mflag = 1;
2809 			}
2810 		} while (pmatch(pfa,t));
2811 		sptr = t;
2812 		adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gensub");
2813 		while ((*pb++ = *sptr++) != 0)
2814 			;
2815 	done:	if (pb > buf + bufsz)
2816 			FATAL("gensub result2 %.30s too big; can't happen", buf);
2817 		*pb = '\0';
2818 		setsval(res, buf);
2819 		pfa->initstat = tempstat;
2820 	}
2821 	tempfree(x);
2822 	tempfree(y);
2823 	free(buf);
2824 	return(res);
2825 }
2826 
2827 void backsub(char **pb_ptr, const char **sptr_ptr)	/* handle \\& variations */
2828 {						/* sptr[0] == '\\' */
2829 	char *pb = *pb_ptr;
2830 	const char *sptr = *sptr_ptr;
2831 
2832 	if (sptr[1] == '\\') {
2833 		if (sptr[2] == '\\' && sptr[3] == '&') { /* \\\& -> \& */
2834 			*pb++ = '\\';
2835 			*pb++ = '&';
2836 			sptr += 4;
2837 		} else if (sptr[2] == '&') {	/* \\& -> \ + matched */
2838 			*pb++ = '\\';
2839 			sptr += 2;
2840 		} else if (do_posix) {		/* \\x -> \x */
2841 			sptr++;
2842 			*pb++ = *sptr++;
2843 		} else {			/* \\x -> \\x */
2844 			*pb++ = *sptr++;
2845 			*pb++ = *sptr++;
2846 		}
2847 	} else if (sptr[1] == '&') {	/* literal & */
2848 		sptr++;
2849 		*pb++ = *sptr++;
2850 	} else				/* literal \ */
2851 		*pb++ = *sptr++;
2852 
2853 	*pb_ptr = pb;
2854 	*sptr_ptr = sptr;
2855 }
2856 
/*
 * wide_char_to_byte_str: encode the code point `rune` as a UTF-8 byte
 * sequence.
 *
 * Returns a pointer to a static, NUL-terminated buffer (overwritten by the
 * next call) and stores the sequence length, 1 to 4 bytes, in *outlen.
 * Returns NULL, leaving *outlen untouched, when rune is negative or
 * greater than 0x10FFFF.
 */
static char *wide_char_to_byte_str(int rune, size_t *outlen)
{
	/* lead-byte marker, indexed by sequence length */
	static const unsigned char lead[] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
	static char buf[5];
	size_t nbytes, pos;

	if (rune < 0 || rune > 0x10FFFF)
		return NULL;

	if (rune <= 0x0000007F)
		nbytes = 1;	/* 0xxxxxxx */
	else if (rune <= 0x000007FF)
		nbytes = 2;	/* 110xxxxx 10xxxxxx */
	else if (rune <= 0x0000FFFF)
		nbytes = 3;	/* 1110xxxx 10xxxxxx 10xxxxxx */
	else
		nbytes = 4;	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */

	memset(buf, 0, sizeof(buf));

	/* fill continuation bytes from the last one back, six bits at a time */
	for (pos = nbytes - 1; pos > 0; pos--) {
		buf[pos] = 0x80 | (rune & 0x3F);
		rune >>= 6;
	}
	/* whatever bits remain belong in the lead byte */
	buf[0] = lead[nbytes] | rune;

	*outlen = nbytes;
	return buf;
}
2894