/*
 * node.c -- routines for node management
 */

/*
 * Copyright (C) 1986, 1988, 1989, 1991-2001, 2003-2015, 2017-2019, 2021,
 * the Free Software Foundation, Inc.
 *
 * This file is part of GAWK, the GNU implementation of the
 * AWK Programming Language.
 *
 * GAWK is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * GAWK is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
 */

27 #include "awk.h"
28 #include "floatmagic.h"	/* definition of isnan */
29 
30 static NODE *r_make_number(double x);
31 static AWKNUM get_ieee_magic_val(char *val);
32 extern NODE **fmt_list;          /* declared in eval.c */
33 
34 NODE *(*make_number)(double) = r_make_number;
35 NODE *(*str2number)(NODE *) = r_force_number;
36 NODE *(*format_val)(const char *, int, NODE *) = r_format_val;
37 int (*cmp_numbers)(const NODE *, const NODE *) = cmp_awknums;
38 
/*
 * is_hex --- return true if a string looks like a hex value
 *
 * An optional leading '+' or '-' is skipped; what follows must begin with
 * "0x" or "0X".  cpend points one past the last usable byte, and both
 * bytes of the prefix have to lie before it.
 */

static bool
is_hex(const char *str, const char *cpend)
{
	/* on entry, we know the string length is >= 1 */
	if (*str == '-' || *str == '+')
		str++;

	/* not enough room left for a two-byte prefix */
	if (str + 1 >= cpend)
		return false;

	return str[0] == '0' && (str[1] == 'x' || str[1] == 'X');
}

54 /* force_number --- force a value to be numeric */
55 
56 NODE *
r_force_number(NODE * n)57 r_force_number(NODE *n)
58 {
59 	char *cp;
60 	char *cpend;
61 	char save;
62 	char *ptr;
63 
64 	if ((n->flags & NUMCUR) != 0)
65 		return n;
66 
67 	/*
68 	 * We should always set NUMCUR. If USER_INPUT is set and it's a
69 	 * numeric string, we clear STRING and enable NUMBER, but if it's not
70 	 * numeric, we disable USER_INPUT.
71 	 */
72 
73 	/* All the conditionals are an attempt to avoid the expensive strtod */
74 
75 	n->flags |= NUMCUR;
76 	n->numbr = 0.0;
77 
78 	/* Trim leading white space, bailing out if there's nothing else */
79 	for (cp = n->stptr, cpend = cp + n->stlen;
80 	     cp < cpend && isspace((unsigned char) *cp); cp++)
81 		continue;
82 
83 	if (cp == cpend)
84 		goto badnum;
85 
86 	/* At this point, we know the string is not entirely white space */
87 	/* Trim trailing white space */
88 	while (isspace((unsigned char) cpend[-1]))
89 		cpend--;
90 
91 	/*
92 	 * 2/2007:
93 	 * POSIX, by way of severe language lawyering, seems to
94 	 * allow things like "inf" and "nan" to mean something.
95 	 * So if do_posix, the user gets what he deserves.
96 	 * This also allows hexadecimal floating point. Ugh.
97 	 */
98 	if (! do_posix) {
99 		if (is_alpha((unsigned char) *cp))
100 			goto badnum;
101 		else if (is_ieee_magic_val(cp)) {
102 			if (cpend == cp + 4) {
103 				n->numbr = get_ieee_magic_val(cp);
104 				goto goodnum;
105 			} else
106 				goto badnum;
107 		}
108 		/* else
109 			fall through */
110 	}
111 	/* else POSIX, so
112 		fall through */
113 
114 	if (   (! do_posix		/* not POSIXLY paranoid and */
115 	        && (is_alpha((unsigned char) *cp)	/* letter, or */
116 					/* CANNOT do non-decimal and saw 0x */
117 		    || (! do_non_decimal_data && is_hex(cp, cpend))))) {
118 		goto badnum;
119 	}
120 
121 	if (cpend - cp == 1) {		/* only one character */
122 		if (isdigit((unsigned char) *cp)) {	/* it's a digit! */
123 			n->numbr = (AWKNUM)(*cp - '0');
124 			if (n->stlen == 1)		/* no white space */
125 				n->flags |= NUMINT;
126 			goto goodnum;
127 		}
128 		goto badnum;
129 	}
130 
131 	errno = 0;
132 	if (do_non_decimal_data		/* main.c assures false if do_posix */
133 		&& ! do_traditional && get_numbase(cp, cpend - cp, true) != 10) {
134 		/* nondec2awknum() saves and restores the byte after the string itself */
135 		n->numbr = nondec2awknum(cp, cpend - cp, &ptr);
136 	} else {
137 		save = *cpend;
138 		*cpend = '\0';
139 		n->numbr = (AWKNUM) strtod((const char *) cp, &ptr);
140 		*cpend = save;
141 	}
142 
143 	if (errno == 0 || errno == ERANGE) {
144 		errno = 0;	/* reset in case of ERANGE */
145 		if (ptr == cpend)
146 			goto goodnum;
147 		/* else keep the leading numeric value without updating flags */
148 		/* fall through to badnum */
149 	} else {
150 		errno = 0;
151 		/*
152 		 * N.B. For subnormal values, strtod may return the
153 		 * floating-point representation while setting errno to ERANGE.
154 		 * We force the numeric value to 0 in such cases.
155 		 */
156 		n->numbr = 0;
157 		/*
158 		 * Or should we accept it as a NUMBER even though strtod
159 		 * threw an error?
160 		 */
161 		/* fall through to badnum */
162 	}
163 badnum:
164 	n->flags &= ~USER_INPUT;
165 	return n;
166 
167 goodnum:
168 	if (isnan(n->numbr) && *cp == '-' && signbit(n->numbr) == 0)
169 		n->numbr = -(n->numbr);
170 
171 	if ((n->flags & USER_INPUT) != 0) {
172 		/* leave USER_INPUT enabled to indicate that this is a strnum */
173 		n->flags &= ~STRING;
174 		n->flags |= NUMBER;
175 	}
176 	return n;
177 }
178 
179 
180 /*
181  * The following lookup table is used as an optimization in force_string;
182  * (more complicated) variations on this theme didn't seem to pay off, but
183  * systematic testing might be in order at some point.
184  */
185 static const char *values[] = {
186 	"0",
187 	"1",
188 	"2",
189 	"3",
190 	"4",
191 	"5",
192 	"6",
193 	"7",
194 	"8",
195 	"9",
196 };
197 #define	NVAL	(sizeof(values)/sizeof(values[0]))
198 
199 /* r_format_val --- format a numeric value based on format */
200 
201 NODE *
r_format_val(const char * format,int index,NODE * s)202 r_format_val(const char *format, int index, NODE *s)
203 {
204 	char buf[BUFSIZ];
205 	char *sp = buf;
206 	double val;
207 
208 	/*
209 	 * 2/2007: Simplify our lives here. Instead of worrying about
210 	 * whether or not the value will fit into a long just so we
211 	 * can use sprintf("%ld", val) on it, always format it ourselves.
212 	 * The only thing to worry about is that integral values always
213 	 * format as integers. %.0f does that very well.
214 	 *
215 	 * 6/2008: Would that things were so simple. Always using %.0f
216 	 * imposes a notable performance penalty for applications that
217 	 * do a lot of conversion of integers to strings. So, we reinstate
218 	 * the old code, but use %.0f for integral values that are outside
219 	 * the range of a long.  This seems a reasonable compromise.
220 	 *
221 	 * 12/2009: Use <= and >= in the comparisons with LONG_xxx instead of
222 	 * < and > so that things work correctly on systems with 64 bit integers.
223 	 */
224 
225 	if (out_of_range(s)) {
226 		const char *result = format_nan_inf(s, 'g');
227 		return make_string(result, strlen(result));
228 	} else if ((val = double_to_int(s->numbr)) != s->numbr
229 			|| val <= LONG_MIN || val >= LONG_MAX
230 	) {
231 		/* not an integral value, or out of integer range */
232 		/*
233 		 * Once upon a time, we just blindly did this:
234 		 *	sprintf(sp, format, s->numbr);
235 		 *	s->stlen = strlen(sp);
236 		 *	s->stfmt = index;
237 		 * but that's no good if, e.g., OFMT is %s. So we punt,
238 		 * and just always format the value ourselves.
239 		 */
240 
241 		NODE *dummy[2], *r;
242 		unsigned int oflags;
243 
244 		/* create dummy node for a sole use of format_tree */
245 		dummy[1] = s;
246 		oflags = s->flags;
247 
248 		if (val == s->numbr) {
249 			/* integral value, but outside range of %ld, use %.0f */
250 			r = format_tree("%.0f", 4, dummy, 2);
251 			s->stfmt = STFMT_UNUSED;
252 		} else {
253 			r = format_tree(format, fmt_list[index]->stlen, dummy, 2);
254 			assert(r != NULL);
255 			s->stfmt = index;
256 		}
257 		s->flags = oflags;
258 		s->stlen = r->stlen;
259 		if ((s->flags & (MALLOC|STRCUR)) == (MALLOC|STRCUR))
260 			efree(s->stptr);
261 		s->stptr = r->stptr;
262 #ifdef HAVE_MPFR
263 		s->strndmode = MPFR_round_mode;
264 #endif
265 		freenode(r);	/* Do not unref(r)! We want to keep s->stptr == r->stpr.  */
266 
267 		goto no_malloc;
268 	} else {
269 		/*
270 		 * integral value; force conversion to long only once.
271 		 */
272 		long num = (long) val;
273 
274 		if (num < NVAL && num >= 0) {
275 			sp = (char *) values[num];
276 			s->stlen = 1;
277 		} else {
278 			(void) sprintf(sp, "%ld", num);
279 			s->stlen = strlen(sp);
280 		}
281 		s->stfmt = STFMT_UNUSED;
282 		if ((s->flags & INTIND) != 0) {
283 			s->flags &= ~(INTIND|NUMBER);
284 			s->flags |= STRING;
285 		}
286 #ifdef HAVE_MPFR
287 		s->strndmode = MPFR_round_mode;
288 #endif
289 	}
290 	if ((s->flags & (MALLOC|STRCUR)) == (MALLOC|STRCUR))
291 		efree(s->stptr);
292 	emalloc(s->stptr, char *, s->stlen + 1, "format_val");
293 	memcpy(s->stptr, sp, s->stlen + 1);
294 no_malloc:
295 	s->flags |= STRCUR;
296 	free_wstr(s);
297 	return s;
298 }
299 
300 /* r_dupnode --- duplicate a node */
301 
302 NODE *
r_dupnode(NODE * n)303 r_dupnode(NODE *n)
304 {
305 	NODE *r;
306 
307 	assert(n->type == Node_val);
308 
309 #ifdef GAWKDEBUG
310 	/* Do the same as in awk.h:dupnode().  */
311 	if ((n->flags & MALLOC) != 0) {
312 		n->valref++;
313 		return n;
314 	}
315 #endif
316 	getnode(r);
317 	*r = *n;
318 
319 #ifdef HAVE_MPFR
320 	if ((n->flags & MPZN) != 0) {
321 		mpz_init(r->mpg_i);
322 		mpz_set(r->mpg_i, n->mpg_i);
323 	} else if ((n->flags & MPFN) != 0) {
324 		mpfr_init(r->mpg_numbr);
325 		int tval = mpfr_set(r->mpg_numbr, n->mpg_numbr, ROUND_MODE);
326 		IEEE_FMT(r->mpg_numbr, tval);
327 	}
328 #endif
329 
330 	r->flags |= MALLOC;
331 	r->valref = 1;
332 	/*
333 	 * DON'T call free_wstr(r) here!
334 	 * r->wstptr still points at n->wstptr's value, and we
335 	 * don't want to free it!
336 	 */
337 	r->wstptr = NULL;
338 	r->wstlen = 0;
339 
340 	if ((n->flags & STRCUR) != 0) {
341 		emalloc(r->stptr, char *, n->stlen + 1, "r_dupnode");
342 		memcpy(r->stptr, n->stptr, n->stlen);
343 		r->stptr[n->stlen] = '\0';
344 		r->stlen = n->stlen;
345 		if ((n->flags & WSTRCUR) != 0) {
346 			r->wstlen = n->wstlen;
347 			emalloc(r->wstptr, wchar_t *, sizeof(wchar_t) * (n->wstlen + 1), "r_dupnode");
348 			memcpy(r->wstptr, n->wstptr, n->wstlen * sizeof(wchar_t));
349 			r->wstptr[n->wstlen] = L'\0';
350 			r->flags |= WSTRCUR;
351 		}
352 	}
353 
354 	return r;
355 }
356 
357 /* r_make_number --- allocate a node with defined number */
358 
359 static NODE *
r_make_number(double x)360 r_make_number(double x)
361 {
362 	NODE *r = make_number_node(0);
363 	r->numbr = x;
364 	return r;
365 }
366 
367 /* cmp_awknums --- compare two AWKNUMs */
368 
369 int
cmp_awknums(const NODE * t1,const NODE * t2)370 cmp_awknums(const NODE *t1, const NODE *t2)
371 {
372 	/*
373 	 * This routine is also used to sort numeric array indices or values.
374 	 * For the purposes of sorting, NaN is considered greater than
375 	 * any other value, and all NaN values are considered equivalent and equal.
376 	 * This isn't in compliance with IEEE standard, but compliance w.r.t. NaN
377 	 * comparison at the awk level is a different issue, and needs to be dealt
378 	 * with in the interpreter for each opcode seperately.
379 	 */
380 
381 	if (isnan(t1->numbr))
382 		return ! isnan(t2->numbr);
383 	if (isnan(t2->numbr))
384 		return -1;
385 	/* don't subtract, in case one or both are infinite */
386 	if (t1->numbr == t2->numbr)
387 		return 0;
388 	if (t1->numbr < t2->numbr)
389 		return -1;
390 	return 1;
391 }
392 
393 
394 /* make_str_node --- make a string node */
395 
396 NODE *
make_str_node(const char * s,size_t len,int flags)397 make_str_node(const char *s, size_t len, int flags)
398 {
399 	NODE *r;
400 	getnode(r);
401 	r->type = Node_val;
402 	r->numbr = 0;
403 	r->flags = (MALLOC|STRING|STRCUR);
404 	r->valref = 1;
405 	r->stfmt = STFMT_UNUSED;
406 #ifdef HAVE_MPFR
407 	r->strndmode = MPFR_round_mode;
408 #endif
409 	r->wstptr = NULL;
410 	r->wstlen = 0;
411 
412 	if ((flags & ALREADY_MALLOCED) != 0)
413 		r->stptr = (char *) s;
414 	else {
415 		emalloc(r->stptr, char *, len + 1, "make_str_node");
416 		memcpy(r->stptr, s, len);
417 	}
418 	r->stptr[len] = '\0';
419 
420 	if ((flags & SCAN) != 0) {	/* scan for escape sequences */
421 		const char *pf;
422 		char *ptm;
423 		int c;
424 		const char *end;
425 		mbstate_t cur_state;
426 
427 		memset(& cur_state, 0, sizeof(cur_state));
428 
429 		end = &(r->stptr[len]);
430 		for (pf = ptm = r->stptr; pf < end;) {
431 			/*
432 			 * Keep multibyte characters together. This avoids
433 			 * problems if a subsequent byte of a multibyte
434 			 * character happens to be a backslash.
435 			 */
436 			if (gawk_mb_cur_max > 1) {
437 				int mblen = mbrlen(pf, end-pf, &cur_state);
438 
439 				if (mblen > 1) {
440 					int i;
441 
442 					for (i = 0; i < mblen; i++)
443 						*ptm++ = *pf++;
444 					continue;
445 				}
446 			}
447 
448 			c = *pf++;
449 			if (c == '\\') {
450 				c = parse_escape(&pf);
451 				if (c < 0) {
452 					if (do_lint)
453 						lintwarn(_("backslash string continuation is not portable"));
454 					if ((flags & ELIDE_BACK_NL) != 0)
455 						continue;
456 					c = '\\';
457 				}
458 				*ptm++ = c;
459 			} else
460 				*ptm++ = c;
461 		}
462 		len = ptm - r->stptr;
463 		erealloc(r->stptr, char *, len + 1, "make_str_node");
464 		r->stptr[len] = '\0';
465 	}
466 	r->stlen = len;
467 
468 	return r;
469 }
470 
471 /* make_typed_regex --- make a typed regex node */
472 
473 NODE *
make_typed_regex(const char * re,size_t len)474 make_typed_regex(const char *re, size_t len)
475 {
476 	NODE *n, *exp, *n2;
477 
478 	exp = make_str_node(re, len, ALREADY_MALLOCED);
479 	n = make_regnode(Node_regex, exp);
480 	if (n == NULL)
481 		fatal(_("could not make typed regex"));
482 
483 	n2 = make_string(re, len);
484 	n2->typed_re = n;
485 #if HAVE_MPFR
486 	if (do_mpfr)
487 		mpg_zero(n2);
488 	else
489 #endif
490 	n2->numbr = 0;
491 	n2->flags |= NUMCUR|STRCUR|REGEX;
492 	n2->flags &= ~(STRING|NUMBER);
493 
494 	return n2;
495 }
496 
497 
498 /* unref --- remove reference to a particular node */
499 
500 void
r_unref(NODE * tmp)501 r_unref(NODE *tmp)
502 {
503 #ifdef GAWKDEBUG
504 	/* Do the same as in awk.h:unref().  */
505 	assert(tmp == NULL || tmp->valref > 0);
506 	if (tmp == NULL || --tmp->valref > 0)
507 		return;
508 #endif
509 
510 	if ((tmp->flags & (MALLOC|STRCUR)) == (MALLOC|STRCUR))
511 		efree(tmp->stptr);
512 
513 	mpfr_unset(tmp);
514 
515 	free_wstr(tmp);
516 	freenode(tmp);
517 }
518 
519 
520 /*
521  * parse_escape:
522  *
523  * Parse a C escape sequence.  STRING_PTR points to a variable containing a
524  * pointer to the string to parse.  That pointer is updated past the
525  * characters we use.  The value of the escape sequence is returned.
526  *
527  * A negative value means the sequence \ newline was seen, which is supposed to
528  * be equivalent to nothing at all.
529  *
530  * If \ is followed by a null character, we return a negative value and leave
531  * the string pointer pointing at the null character.
532  *
533  * If \ is followed by 000, we return 0 and leave the string pointer after the
534  * zeros.  A value of 0 does not mean end of string.
535  *
536  * POSIX doesn't allow \x.
537  */
538 
539 int
parse_escape(const char ** string_ptr)540 parse_escape(const char **string_ptr)
541 {
542 	int c = *(*string_ptr)++;
543 	int i;
544 	int count;
545 	int j;
546 	const char *start;
547 
548 	if (do_lint_old) {
549 		switch (c) {
550 		case 'a':
551 		case 'b':
552 		case 'f':
553 		case 'r':
554 			lintwarn(_("old awk does not support the `\\%c' escape sequence"), c);
555 			break;
556 		}
557 	}
558 
559 	switch (c) {
560 	case 'a':
561 		return '\a';
562 	case 'b':
563 		return '\b';
564 	case 'f':
565 		return '\f';
566 	case 'n':
567 		return '\n';
568 	case 'r':
569 		return '\r';
570 	case 't':
571 		return '\t';
572 	case 'v':
573 		return '\v';
574 	case '\n':
575 		return -2;
576 	case 0:
577 		(*string_ptr)--;
578 		return -1;
579 	case '0':
580 	case '1':
581 	case '2':
582 	case '3':
583 	case '4':
584 	case '5':
585 	case '6':
586 	case '7':
587 		i = c - '0';
588 		count = 0;
589 		while (++count < 3) {
590 			if ((c = *(*string_ptr)++) >= '0' && c <= '7') {
591 				i *= 8;
592 				i += c - '0';
593 			} else {
594 				(*string_ptr)--;
595 				break;
596 			}
597 		}
598 		return i;
599 	case 'x':
600 		if (do_lint) {
601 			static bool warned = false;
602 
603 			if (! warned) {
604 				warned = true;
605 				lintwarn(_("POSIX does not allow `\\x' escapes"));
606 			}
607 		}
608 		if (do_posix)
609 			return ('x');
610 		if (! isxdigit((unsigned char) (*string_ptr)[0])) {
611 			warning(_("no hex digits in `\\x' escape sequence"));
612 			return ('x');
613 		}
614 		start = *string_ptr;
615 		for (i = j = 0; j < 2; j++) {
616 			/* do outside test to avoid multiple side effects */
617 			c = (unsigned char) *(*string_ptr)++;
618 			if (isxdigit(c)) {
619 				i *= 16;
620 				if (isdigit(c))
621 					i += c - '0';
622 				else if (isupper(c))
623 					i += c - 'A' + 10;
624 				else
625 					i += c - 'a' + 10;
626 			} else {
627 				(*string_ptr)--;
628 				break;
629 			}
630 		}
631 		if (do_lint && j == 2 && isxdigit((unsigned char)*(*string_ptr)))
632 			lintwarn(_("hex escape \\x%.*s of %d characters probably not interpreted the way you expect"), 3, start, 3);
633 		return i;
634 	case '\\':
635 	case '"':
636 		return c;
637 	default:
638 	{
639 		static bool warned[256];
640 		unsigned char uc = (unsigned char) c;
641 
642 		/* N.B.: use unsigned char here to avoid Latin-1 problems */
643 
644 		if (! warned[uc]) {
645 			warned[uc] = true;
646 
647 			warning(_("escape sequence `\\%c' treated as plain `%c'"), uc, uc);
648 		}
649 	}
650 		return c;
651 	}
652 }
653 
/*
 * get_numbase --- return the base to use for the number in 's'
 *
 * s/len is the text to examine.  Returns 16 for a leading "0x" or "0X",
 * 8 for a leading '0' followed by a digit from 0 to 7 when no decimal
 * point or exponent letter shows up in the leading run of digits, and
 * 10 for everything else.  If use_locale is true and HAVE_LOCALE_H is
 * defined, the locale's decimal point is recognized instead of '.'.
 */

int
get_numbase(const char *s, size_t len, bool use_locale)
{
	int dec_point = '.';
	const char *str = s;

#if defined(HAVE_LOCALE_H)
	/*
	 * loc.decimal_point may not have been initialized yet,
	 * so double check it before using it.
	 */
	if (use_locale && loc.decimal_point != NULL && loc.decimal_point[0] != '\0')
		dec_point = loc.decimal_point[0];	/* XXX --- assumes one char */
#endif

	/* anything other than '0' plus at least one more byte is decimal */
	if (len < 2 || str[0] != '0')
		return 10;

	/* leading 0x or 0X */
	if (str[1] == 'x' || str[1] == 'X')
		return 16;

	/*
	 * Numbers with '.', 'e', or 'E' are decimal.
	 * Have to check so that things like 00.34 are handled right.
	 *
	 * These beasts can have trailing whitespace. Deal with that too.
	 */
	for (; len > 0; len--, str++) {
		if (*str == 'e' || *str == 'E' || *str == dec_point)
			return 10;
		else if (! isdigit((unsigned char) *str))
			break;
	}

	/* only the digit right after the leading zero is checked here */
	if (! isdigit((unsigned char) s[1])
			|| s[1] == '8' || s[1] == '9'
	)
		return 10;
	return 8;
}

698 /* str2wstr --- convert a multibyte string to a wide string */
699 
700 NODE *
str2wstr(NODE * n,size_t ** ptr)701 str2wstr(NODE *n, size_t **ptr)
702 {
703 	size_t i, count, src_count;
704 	char *sp;
705 	mbstate_t mbs;
706 	wchar_t wc, *wsp;
707 	static bool warned = false;
708 
709 	assert((n->flags & (STRING|STRCUR)) != 0);
710 
711 	/*
712 	 * Don't convert global null string or global null field
713 	 * variables to a wide string. They are both zero-length anyway.
714 	 * This also avoids future double-free errors while releasing
715 	 * shallow copies, eg. *tmp = *Null_field; free_wstr(tmp);
716 	 */
717 	if (n == Nnull_string || n == Null_field)
718 		return n;
719 
720 	if ((n->flags & WSTRCUR) != 0) {
721 		if (ptr == NULL)
722 			return n;
723 		/* otherwise
724 			fall through and recompute to fill in the array */
725 		free_wstr(n);
726 	}
727 
728 	/*
729 	 * After consideration and consultation, this
730 	 * code trades space for time. We allocate
731 	 * an array of wchar_t that is n->stlen long.
732 	 * This is needed in the worst case anyway, where
733 	 * each input byte maps to one wchar_t.  The
734 	 * advantage is that we only have to convert the string
735 	 * once, instead of twice, once to find out how many
736 	 * wide characters, and then again to actually fill in
737 	 * the info.  If there's a lot left over, we can
738 	 * realloc the wide string down in size.
739 	 */
740 
741 	emalloc(n->wstptr, wchar_t *, sizeof(wchar_t) * (n->stlen + 1), "str2wstr");
742 	wsp = n->wstptr;
743 
744 	/*
745 	 * For use by do_match, create and fill in an array.
746 	 * For each byte `i' in n->stptr (the original string),
747 	 * a[i] is equal to `j', where `j' is the corresponding wchar_t
748 	 * in the converted wide string.
749 	 *
750 	 * Create the array.
751 	 */
752 	if (ptr != NULL) {
753 		ezalloc(*ptr, size_t *, sizeof(size_t) * n->stlen, "str2wstr");
754 	}
755 
756 	sp = n->stptr;
757 	src_count = n->stlen;
758 	memset(& mbs, 0, sizeof(mbs));
759 	for (i = 0; src_count > 0; i++) {
760 		/*
761 		 * 9/2010: Check the current byte; if it's a valid character,
762 		 * then it doesn't start a multibyte sequence. This brings a
763 		 * big speed up. Thanks to Ulrich Drepper for the tip.
764 		 * 11/2010: Thanks to Paolo Bonzini for some even faster code.
765 		 */
766 		if (is_valid_character(*sp)) {
767 			count = 1;
768 			wc = btowc_cache(*sp);
769 		} else
770 			count = mbrtowc(& wc, sp, src_count, & mbs);
771 		switch (count) {
772 		case (size_t) -2:
773 		case (size_t) -1:
774 			/*
775 			 * mbrtowc(3) says the state of mbs becomes undefined
776 			 * after a bad character, so reset it.
777 			 */
778 			memset(& mbs, 0, sizeof(mbs));
779 
780 			/* Warn the user something's wrong */
781 			if (! warned) {
782 				warned = true;
783 				warning(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale"));
784 			}
785 
786 			/*
787 			 * 8/2015: If we're using UTF, then instead of just
788 			 * skipping the character, plug in the Unicode
789 			 * replacement character. In most cases this gives
790 			 * us "better" results, in that character counts
791 			 * and string lengths tend to make more sense.
792 			 *
793 			 * Otherwise, just skip the bad byte and keep going,
794 			 * so that we get a more-or-less full string, instead of
795 			 * stopping early. This is particularly important
796 			 * for match() where we need to build the indices.
797 			 */
798 			if (using_utf8()) {
799 				count = 1;
800 				wc = 0xFFFD;	/* unicode replacement character */
801 				goto set_wc;
802 			} else {
803 				/* skip it and keep going */
804 				sp++;
805 				src_count--;
806 			}
807 			break;
808 
809 		case 0:
810 			count = 1;
811 			/* fall through */
812 		default:
813 		set_wc:
814 			*wsp++ = wc;
815 			src_count -= count;
816 			while (count--)  {
817 				if (ptr != NULL)
818 					(*ptr)[sp - n->stptr] = i;
819 				sp++;
820 			}
821 			break;
822 		}
823 	}
824 
825 	*wsp = L'\0';
826 	n->wstlen = wsp - n->wstptr;
827 	n->flags |= WSTRCUR;
828 #define ARBITRARY_AMOUNT_TO_GIVE_BACK 100
829 	if (n->stlen - n->wstlen > ARBITRARY_AMOUNT_TO_GIVE_BACK)
830 		erealloc(n->wstptr, wchar_t *, sizeof(wchar_t) * (n->wstlen + 1), "str2wstr");
831 
832 	return n;
833 }
834 
835 /* wstr2str --- convert a wide string back into multibyte one */
836 
837 NODE *
wstr2str(NODE * n)838 wstr2str(NODE *n)
839 {
840 	size_t result;
841 	size_t length;
842 	wchar_t *wp;
843 	mbstate_t mbs;
844 	char *newval, *cp;
845 
846 	assert(n->valref == 1);
847 	assert((n->flags & WSTRCUR) != 0);
848 
849 	/*
850 	 * Convert the wide chars in t1->wstptr back into m.b. chars.
851 	 * This is pretty grotty, but it's the most straightforward
852 	 * way to do things.
853 	 */
854 	memset(& mbs, 0, sizeof(mbs));
855 
856 	length = n->wstlen;
857 	emalloc(newval, char *, (length * gawk_mb_cur_max) + 1, "wstr2str");
858 
859 	wp = n->wstptr;
860 	for (cp = newval; length > 0; length--) {
861 		result = wcrtomb(cp, *wp, & mbs);
862 		if (result == (size_t) -1)	/* what to do? break seems best */
863 			break;
864 		cp += result;
865 		wp++;
866 	}
867 	*cp = '\0';
868 
869 	/* N.B. caller just created n with make_string, so this free is safe */
870 	efree(n->stptr);
871 	n->stptr = newval;
872 	n->stlen = cp - newval;
873 
874 	return n;
875 }
876 
877 /* free_wstr --- release the wide string part of a node */
878 
879 void
r_free_wstr(NODE * n)880 r_free_wstr(NODE *n)
881 {
882 	assert(n->type == Node_val);
883 
884 	if ((n->flags & WSTRCUR) != 0) {
885 		assert(n->wstptr != NULL);
886 		efree(n->wstptr);
887 	}
888 	n->wstptr = NULL;
889 	n->wstlen = 0;
890 	n->flags &= ~WSTRCUR;
891 }
892 
/*
 * dump_wstr --- write the len wide characters at str to fp
 *
 * Does nothing for a NULL or empty string.  Marked unused since nothing
 * in this file calls it.
 */

static void __attribute__ ((unused))
dump_wstr(FILE *fp, const wchar_t *str, size_t len)
{
	if (str == NULL || len == 0)
		return;

	for (; len--; str++)
		putwc(*str, fp);
}

/*
 * wstrstr --- walk haystack, looking for needle, wide char version
 *
 * Returns a pointer to the first occurrence of the needle_len wide
 * characters at needle within the first hs_len wide characters of
 * haystack, or NULL if there is none.  Both strings are length-counted,
 * so embedded L'\0' characters are ordinary data.
 *
 * NULL is also returned if either pointer is NULL, if the needle is
 * longer than the haystack, or if the needle is empty.
 */

const wchar_t *
wstrstr(const wchar_t *haystack, size_t hs_len,
	const wchar_t *needle, size_t needle_len)
{
	size_t i, last_start;

	/*
	 * An empty needle is rejected up front: the first/last character
	 * test below would otherwise index needle[needle_len-1], which is
	 * before the start of the needle.
	 */
	if (haystack == NULL || needle == NULL
	    || needle_len == 0 || needle_len > hs_len)
		return NULL;

	/* no match can start beyond this offset */
	last_start = hs_len - needle_len;

	for (i = 0; i <= last_start; i++) {
		/* cheap test of the first & last chars before the full compare */
		if (haystack[i] == needle[0]
		    && haystack[i+needle_len-1] == needle[needle_len-1]
		    && memcmp(haystack+i, needle, sizeof(wchar_t) * needle_len) == 0)
			return haystack + i;
	}

	return NULL;
}

/*
 * wcasestrstr --- walk haystack, nocase look for needle, wide char version
 *
 * Like wstrstr(), but characters are compared after mapping both sides
 * through towlower().  Returns a pointer to the first match within the
 * first hs_len wide characters of haystack, or NULL if there is none.
 *
 * NULL is also returned if either pointer is NULL, if the needle is
 * longer than the haystack, or if the needle is empty.
 */

const wchar_t *
wcasestrstr(const wchar_t *haystack, size_t hs_len,
	const wchar_t *needle, size_t needle_len)
{
	size_t i, j, last_start;

	/*
	 * An empty needle is rejected up front: the first/last character
	 * test below would otherwise index needle[needle_len-1], which is
	 * before the start of the needle.
	 */
	if (haystack == NULL || needle == NULL
	    || needle_len == 0 || needle_len > hs_len)
		return NULL;

	/* no match can start beyond this offset */
	last_start = hs_len - needle_len;

	for (i = 0; i <= last_start; i++) {
		/* cheap test of the first & last chars before the full compare */
		if (towlower(haystack[i]) != towlower(needle[0])
		    || towlower(haystack[i+needle_len-1]) != towlower(needle[needle_len-1]))
			continue;

		for (j = 0; j < needle_len; j++) {
			wchar_t h = towlower(haystack[i+j]);
			wchar_t n = towlower(needle[j]);

			if (h != n)
				break;
		}
		if (j == needle_len)	/* the whole needle matched */
			return haystack + i;
	}

	return NULL;
}

/*
 * is_ieee_magic_val --- return true for +inf, -inf, +nan, -nan
 *
 * Letters are matched in either case.  Only val[0] through val[3] are
 * looked at, and a byte is read only if all the bytes before it matched.
 * Nothing beyond val[3] is examined, so the caller still has to make
 * sure that the token is exactly four characters long.
 */

bool
is_ieee_magic_val(const char *val)
{
	/*
	 * Avoid strncasecmp: it mishandles ASCII bytes in some locales.
	 */
	if (val[0] != '+' && val[0] != '-')
		return false;

	if (val[1] == 'i' || val[1] == 'I')
		return (val[2] == 'n' || val[2] == 'N')
			&& (val[3] == 'f' || val[3] == 'F');

	if (val[1] == 'n' || val[1] == 'N')
		return (val[2] == 'a' || val[2] == 'A')
			&& (val[3] == 'n' || val[3] == 'N');

	return false;
}

981 /* get_ieee_magic_val --- return magic value for string */
982 
983 static AWKNUM
get_ieee_magic_val(char * val)984 get_ieee_magic_val(char *val)
985 {
986 	static bool first = true;
987 	static AWKNUM inf;
988 	static AWKNUM nan;
989 	char save;
990 
991 	char *ptr;
992 	save = val[4];
993 	val[4] = '\0';
994 	AWKNUM v = strtod(val, &ptr);
995 	val[4] = save;
996 
997 	if (val == ptr) { /* Older strtod implementations don't support inf or nan. */
998 		if (first) {
999 			first = false;
1000 			nan = sqrt(-1.0);
1001 			inf = -log(0.0);
1002 		}
1003 
1004 		v = ((val[1] == 'i' || val[1] == 'I') ? inf : nan);
1005 		if (val[0] == '-')
1006 			v = -v;
1007 	}
1008 
1009 	return v;
1010 }
1011 
/* btowc() result for every possible byte value, filled in by init_btowc_cache() */
wint_t btowc_cache[256];

/*
 * init_btowc_cache --- initialize the cache
 *
 * Stores btowc(i) in btowc_cache[i] for each byte value 0 through 255.
 * The results reflect the locale in effect when this is called.
 */

void
init_btowc_cache(void)
{
	int i;

	for (i = 0; i <= 255; i++) {
		btowc_cache[i] = btowc(i);
	}
}

1025 #define BLOCKCHUNK 100
1026 
1027 struct block_header nextfree[BLOCK_MAX] = {
1028 	{ NULL, sizeof(NODE), "node" },
1029 	{ NULL, sizeof(BUCKET), "bucket" },
1030 };
1031 
1032 #ifdef MEMDEBUG
1033 
1034 void *
r_getblock(int id)1035 r_getblock(int id)
1036 {
1037 	void *res;
1038 	emalloc(res, void *, nextfree[id].size, "getblock");
1039 	nextfree[id].active++;
1040 	if (nextfree[id].highwater < nextfree[id].active)
1041 		nextfree[id].highwater = nextfree[id].active;
1042 	return res;
1043 }
1044 
1045 void
r_freeblock(void * p,int id)1046 r_freeblock(void *p, int id)
1047 {
1048 	nextfree[id].active--;
1049 	free(p);
1050 }
1051 
1052 #else
1053 
1054 /* more_blocks --- get more blocks of memory and add to the free list;
1055 	size of a block must be >= sizeof(struct block_item)
1056  */
1057 
1058 void *
more_blocks(int id)1059 more_blocks(int id)
1060 {
1061 	struct block_item *freep, *np, *next;
1062 	char *p, *endp;
1063 	size_t size;
1064 
1065 	size = nextfree[id].size;
1066 
1067 	assert(size >= sizeof(struct block_item));
1068 	emalloc(freep, struct block_item *, BLOCKCHUNK * size, "more_blocks");
1069 	p = (char *) freep;
1070 	endp = p + BLOCKCHUNK * size;
1071 
1072 	for (np = freep; ; np = next) {
1073 		next = (struct block_item *) (p += size);
1074 		if (p >= endp) {
1075 			np->freep = NULL;
1076 			break;
1077 		}
1078 		np->freep = next;
1079 	}
1080 	nextfree[id].freep = freep->freep;
1081 	nextfree[id].highwater += BLOCKCHUNK;
1082 	return freep;
1083 }
1084 
1085 #endif
1086