1 /*
2 * node.c -- routines for node management
3 */
4
5 /*
6 * Copyright (C) 1986, 1988, 1989, 1991-2001, 2003-2015, 2017-2019, 2021,
7 * the Free Software Foundation, Inc.
8 *
9 * This file is part of GAWK, the GNU implementation of the
10 * AWK Programming Language.
11 *
12 * GAWK is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 3 of the License, or
15 * (at your option) any later version.
16 *
17 * GAWK is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
25 */
26
27 #include "awk.h"
28 #include "floatmagic.h" /* definition of isnan */
29
/* Forward declarations for static helpers that are defined further down. */
30 static NODE *r_make_number(double x);
31 static AWKNUM get_ieee_magic_val(char *val);
32 extern NODE **fmt_list; /* declared in eval.c */
33
/*
 * Function-pointer hooks for the basic numeric operations.  They are
 * initialised here to the double-precision implementations; presumably
 * other code repoints them at alternative implementations (TODO confirm
 * against the rest of the project).
 */
34 NODE *(*make_number)(double) = r_make_number;
35 NODE *(*str2number)(NODE *) = r_force_number;
36 NODE *(*format_val)(const char *, int, NODE *) = r_format_val;
37 int (*cmp_numbers)(const NODE *, const NODE *) = cmp_awknums;
38
/* is_hex --- does the text begin with an optionally signed 0x / 0X prefix? */

static bool
is_hex(const char *str, const char *cpend)
{
	const char *p = str;

	/* the caller guarantees at least one character, so *p is readable */
	if (*p == '+' || *p == '-')
		p++;

	/* both prefix characters must lie before cpend */
	if (p + 1 >= cpend)
		return false;

	if (p[0] != '0')
		return false;

	return (p[1] == 'x' || p[1] == 'X');
}
53
54 /* force_number --- force a value to be numeric */
55
56 NODE *
r_force_number(NODE * n)57 r_force_number(NODE *n)
58 {
59 char *cp;
60 char *cpend;
61 char save;
62 char *ptr;
63
64 if ((n->flags & NUMCUR) != 0)
65 return n;
66
67 /*
68 * We should always set NUMCUR. If USER_INPUT is set and it's a
69 * numeric string, we clear STRING and enable NUMBER, but if it's not
70 * numeric, we disable USER_INPUT.
71 */
72
73 /* All the conditionals are an attempt to avoid the expensive strtod */
74
75 n->flags |= NUMCUR;
76 n->numbr = 0.0;
77
78 /* Trim leading white space, bailing out if there's nothing else */
79 for (cp = n->stptr, cpend = cp + n->stlen;
80 cp < cpend && isspace((unsigned char) *cp); cp++)
81 continue;
82
83 if (cp == cpend)
84 goto badnum;
85
86 /* At this point, we know the string is not entirely white space */
87 /* Trim trailing white space */
88 while (isspace((unsigned char) cpend[-1]))
89 cpend--;
90
91 /*
92 * 2/2007:
93 * POSIX, by way of severe language lawyering, seems to
94 * allow things like "inf" and "nan" to mean something.
95 * So if do_posix, the user gets what he deserves.
96 * This also allows hexadecimal floating point. Ugh.
97 */
98 if (! do_posix) {
99 if (is_alpha((unsigned char) *cp))
100 goto badnum;
101 else if (is_ieee_magic_val(cp)) {
102 if (cpend == cp + 4) {
103 n->numbr = get_ieee_magic_val(cp);
104 goto goodnum;
105 } else
106 goto badnum;
107 }
108 /* else
109 fall through */
110 }
111 /* else POSIX, so
112 fall through */
113
114 if ( (! do_posix /* not POSIXLY paranoid and */
115 && (is_alpha((unsigned char) *cp) /* letter, or */
116 /* CANNOT do non-decimal and saw 0x */
117 || (! do_non_decimal_data && is_hex(cp, cpend))))) {
118 goto badnum;
119 }
120
121 if (cpend - cp == 1) { /* only one character */
122 if (isdigit((unsigned char) *cp)) { /* it's a digit! */
123 n->numbr = (AWKNUM)(*cp - '0');
124 if (n->stlen == 1) /* no white space */
125 n->flags |= NUMINT;
126 goto goodnum;
127 }
128 goto badnum;
129 }
130
131 errno = 0;
132 if (do_non_decimal_data /* main.c assures false if do_posix */
133 && ! do_traditional && get_numbase(cp, cpend - cp, true) != 10) {
134 /* nondec2awknum() saves and restores the byte after the string itself */
135 n->numbr = nondec2awknum(cp, cpend - cp, &ptr);
136 } else {
137 save = *cpend;
138 *cpend = '\0';
139 n->numbr = (AWKNUM) strtod((const char *) cp, &ptr);
140 *cpend = save;
141 }
142
143 if (errno == 0 || errno == ERANGE) {
144 errno = 0; /* reset in case of ERANGE */
145 if (ptr == cpend)
146 goto goodnum;
147 /* else keep the leading numeric value without updating flags */
148 /* fall through to badnum */
149 } else {
150 errno = 0;
151 /*
152 * N.B. For subnormal values, strtod may return the
153 * floating-point representation while setting errno to ERANGE.
154 * We force the numeric value to 0 in such cases.
155 */
156 n->numbr = 0;
157 /*
158 * Or should we accept it as a NUMBER even though strtod
159 * threw an error?
160 */
161 /* fall through to badnum */
162 }
163 badnum:
164 n->flags &= ~USER_INPUT;
165 return n;
166
167 goodnum:
168 if (isnan(n->numbr) && *cp == '-' && signbit(n->numbr) == 0)
169 n->numbr = -(n->numbr);
170
171 if ((n->flags & USER_INPUT) != 0) {
172 /* leave USER_INPUT enabled to indicate that this is a strnum */
173 n->flags &= ~STRING;
174 n->flags |= NUMBER;
175 }
176 return n;
177 }
178
179
180 /*
181 * The following lookup table is used as an optimization in force_string;
182 * (more complicated) variations on this theme didn't seem to pay off, but
183 * systematic testing might be in order at some point.
184 */
185 static const char *values[] = {
186 "0",
187 "1",
188 "2",
189 "3",
190 "4",
191 "5",
192 "6",
193 "7",
194 "8",
195 "9",
196 };
197 #define NVAL (sizeof(values)/sizeof(values[0]))
198
199 /* r_format_val --- format a numeric value based on format */
200
201 NODE *
r_format_val(const char * format,int index,NODE * s)202 r_format_val(const char *format, int index, NODE *s)
203 {
204 char buf[BUFSIZ];
205 char *sp = buf;
206 double val;
207
208 /*
209 * 2/2007: Simplify our lives here. Instead of worrying about
210 * whether or not the value will fit into a long just so we
211 * can use sprintf("%ld", val) on it, always format it ourselves.
212 * The only thing to worry about is that integral values always
213 * format as integers. %.0f does that very well.
214 *
215 * 6/2008: Would that things were so simple. Always using %.0f
216 * imposes a notable performance penalty for applications that
217 * do a lot of conversion of integers to strings. So, we reinstate
218 * the old code, but use %.0f for integral values that are outside
219 * the range of a long. This seems a reasonable compromise.
220 *
221 * 12/2009: Use <= and >= in the comparisons with LONG_xxx instead of
222 * < and > so that things work correctly on systems with 64 bit integers.
223 */
224
225 if (out_of_range(s)) {
226 const char *result = format_nan_inf(s, 'g');
227 return make_string(result, strlen(result));
228 } else if ((val = double_to_int(s->numbr)) != s->numbr
229 || val <= LONG_MIN || val >= LONG_MAX
230 ) {
231 /* not an integral value, or out of integer range */
232 /*
233 * Once upon a time, we just blindly did this:
234 * sprintf(sp, format, s->numbr);
235 * s->stlen = strlen(sp);
236 * s->stfmt = index;
237 * but that's no good if, e.g., OFMT is %s. So we punt,
238 * and just always format the value ourselves.
239 */
240
241 NODE *dummy[2], *r;
242 unsigned int oflags;
243
244 /* create dummy node for a sole use of format_tree */
245 dummy[1] = s;
246 oflags = s->flags;
247
248 if (val == s->numbr) {
249 /* integral value, but outside range of %ld, use %.0f */
250 r = format_tree("%.0f", 4, dummy, 2);
251 s->stfmt = STFMT_UNUSED;
252 } else {
253 r = format_tree(format, fmt_list[index]->stlen, dummy, 2);
254 assert(r != NULL);
255 s->stfmt = index;
256 }
257 s->flags = oflags;
258 s->stlen = r->stlen;
259 if ((s->flags & (MALLOC|STRCUR)) == (MALLOC|STRCUR))
260 efree(s->stptr);
261 s->stptr = r->stptr;
262 #ifdef HAVE_MPFR
263 s->strndmode = MPFR_round_mode;
264 #endif
265 freenode(r); /* Do not unref(r)! We want to keep s->stptr == r->stpr. */
266
267 goto no_malloc;
268 } else {
269 /*
270 * integral value; force conversion to long only once.
271 */
272 long num = (long) val;
273
274 if (num < NVAL && num >= 0) {
275 sp = (char *) values[num];
276 s->stlen = 1;
277 } else {
278 (void) sprintf(sp, "%ld", num);
279 s->stlen = strlen(sp);
280 }
281 s->stfmt = STFMT_UNUSED;
282 if ((s->flags & INTIND) != 0) {
283 s->flags &= ~(INTIND|NUMBER);
284 s->flags |= STRING;
285 }
286 #ifdef HAVE_MPFR
287 s->strndmode = MPFR_round_mode;
288 #endif
289 }
290 if ((s->flags & (MALLOC|STRCUR)) == (MALLOC|STRCUR))
291 efree(s->stptr);
292 emalloc(s->stptr, char *, s->stlen + 1, "format_val");
293 memcpy(s->stptr, sp, s->stlen + 1);
294 no_malloc:
295 s->flags |= STRCUR;
296 free_wstr(s);
297 return s;
298 }
299
300 /* r_dupnode --- duplicate a node */
301
302 NODE *
r_dupnode(NODE * n)303 r_dupnode(NODE *n)
304 {
305 NODE *r;
306
307 assert(n->type == Node_val);
308
309 #ifdef GAWKDEBUG
310 /* Do the same as in awk.h:dupnode(). */
311 if ((n->flags & MALLOC) != 0) {
312 n->valref++;
313 return n;
314 }
315 #endif
316 getnode(r);
317 *r = *n;
318
319 #ifdef HAVE_MPFR
320 if ((n->flags & MPZN) != 0) {
321 mpz_init(r->mpg_i);
322 mpz_set(r->mpg_i, n->mpg_i);
323 } else if ((n->flags & MPFN) != 0) {
324 mpfr_init(r->mpg_numbr);
325 int tval = mpfr_set(r->mpg_numbr, n->mpg_numbr, ROUND_MODE);
326 IEEE_FMT(r->mpg_numbr, tval);
327 }
328 #endif
329
330 r->flags |= MALLOC;
331 r->valref = 1;
332 /*
333 * DON'T call free_wstr(r) here!
334 * r->wstptr still points at n->wstptr's value, and we
335 * don't want to free it!
336 */
337 r->wstptr = NULL;
338 r->wstlen = 0;
339
340 if ((n->flags & STRCUR) != 0) {
341 emalloc(r->stptr, char *, n->stlen + 1, "r_dupnode");
342 memcpy(r->stptr, n->stptr, n->stlen);
343 r->stptr[n->stlen] = '\0';
344 r->stlen = n->stlen;
345 if ((n->flags & WSTRCUR) != 0) {
346 r->wstlen = n->wstlen;
347 emalloc(r->wstptr, wchar_t *, sizeof(wchar_t) * (n->wstlen + 1), "r_dupnode");
348 memcpy(r->wstptr, n->wstptr, n->wstlen * sizeof(wchar_t));
349 r->wstptr[n->wstlen] = L'\0';
350 r->flags |= WSTRCUR;
351 }
352 }
353
354 return r;
355 }
356
357 /* r_make_number --- allocate a node with defined number */
358
359 static NODE *
r_make_number(double x)360 r_make_number(double x)
361 {
362 NODE *r = make_number_node(0);
363 r->numbr = x;
364 return r;
365 }
366
367 /* cmp_awknums --- compare two AWKNUMs */
368
369 int
cmp_awknums(const NODE * t1,const NODE * t2)370 cmp_awknums(const NODE *t1, const NODE *t2)
371 {
372 /*
373 * This routine is also used to sort numeric array indices or values.
374 * For the purposes of sorting, NaN is considered greater than
375 * any other value, and all NaN values are considered equivalent and equal.
376 * This isn't in compliance with IEEE standard, but compliance w.r.t. NaN
377 * comparison at the awk level is a different issue, and needs to be dealt
378 * with in the interpreter for each opcode seperately.
379 */
380
381 if (isnan(t1->numbr))
382 return ! isnan(t2->numbr);
383 if (isnan(t2->numbr))
384 return -1;
385 /* don't subtract, in case one or both are infinite */
386 if (t1->numbr == t2->numbr)
387 return 0;
388 if (t1->numbr < t2->numbr)
389 return -1;
390 return 1;
391 }
392
393
394 /* make_str_node --- make a string node */
395
396 NODE *
make_str_node(const char * s,size_t len,int flags)397 make_str_node(const char *s, size_t len, int flags)
398 {
399 NODE *r;
400 getnode(r);
401 r->type = Node_val;
402 r->numbr = 0;
403 r->flags = (MALLOC|STRING|STRCUR);
404 r->valref = 1;
405 r->stfmt = STFMT_UNUSED;
406 #ifdef HAVE_MPFR
407 r->strndmode = MPFR_round_mode;
408 #endif
409 r->wstptr = NULL;
410 r->wstlen = 0;
411
412 if ((flags & ALREADY_MALLOCED) != 0)
413 r->stptr = (char *) s;
414 else {
415 emalloc(r->stptr, char *, len + 1, "make_str_node");
416 memcpy(r->stptr, s, len);
417 }
418 r->stptr[len] = '\0';
419
420 if ((flags & SCAN) != 0) { /* scan for escape sequences */
421 const char *pf;
422 char *ptm;
423 int c;
424 const char *end;
425 mbstate_t cur_state;
426
427 memset(& cur_state, 0, sizeof(cur_state));
428
429 end = &(r->stptr[len]);
430 for (pf = ptm = r->stptr; pf < end;) {
431 /*
432 * Keep multibyte characters together. This avoids
433 * problems if a subsequent byte of a multibyte
434 * character happens to be a backslash.
435 */
436 if (gawk_mb_cur_max > 1) {
437 int mblen = mbrlen(pf, end-pf, &cur_state);
438
439 if (mblen > 1) {
440 int i;
441
442 for (i = 0; i < mblen; i++)
443 *ptm++ = *pf++;
444 continue;
445 }
446 }
447
448 c = *pf++;
449 if (c == '\\') {
450 c = parse_escape(&pf);
451 if (c < 0) {
452 if (do_lint)
453 lintwarn(_("backslash string continuation is not portable"));
454 if ((flags & ELIDE_BACK_NL) != 0)
455 continue;
456 c = '\\';
457 }
458 *ptm++ = c;
459 } else
460 *ptm++ = c;
461 }
462 len = ptm - r->stptr;
463 erealloc(r->stptr, char *, len + 1, "make_str_node");
464 r->stptr[len] = '\0';
465 }
466 r->stlen = len;
467
468 return r;
469 }
470
471 /* make_typed_regex --- make a typed regex node */
472
473 NODE *
make_typed_regex(const char * re,size_t len)474 make_typed_regex(const char *re, size_t len)
475 {
476 NODE *n, *exp, *n2;
477
478 exp = make_str_node(re, len, ALREADY_MALLOCED);
479 n = make_regnode(Node_regex, exp);
480 if (n == NULL)
481 fatal(_("could not make typed regex"));
482
483 n2 = make_string(re, len);
484 n2->typed_re = n;
485 #if HAVE_MPFR
486 if (do_mpfr)
487 mpg_zero(n2);
488 else
489 #endif
490 n2->numbr = 0;
491 n2->flags |= NUMCUR|STRCUR|REGEX;
492 n2->flags &= ~(STRING|NUMBER);
493
494 return n2;
495 }
496
497
498 /* unref --- remove reference to a particular node */
499
500 void
r_unref(NODE * tmp)501 r_unref(NODE *tmp)
502 {
503 #ifdef GAWKDEBUG
504 /* Do the same as in awk.h:unref(). */
505 assert(tmp == NULL || tmp->valref > 0);
506 if (tmp == NULL || --tmp->valref > 0)
507 return;
508 #endif
509
510 if ((tmp->flags & (MALLOC|STRCUR)) == (MALLOC|STRCUR))
511 efree(tmp->stptr);
512
513 mpfr_unset(tmp);
514
515 free_wstr(tmp);
516 freenode(tmp);
517 }
518
519
520 /*
521 * parse_escape:
522 *
523 * Parse a C escape sequence. STRING_PTR points to a variable containing a
524 * pointer to the string to parse. That pointer is updated past the
525 * characters we use. The value of the escape sequence is returned.
526 *
527 * A negative value means the sequence \ newline was seen, which is supposed to
528 * be equivalent to nothing at all.
529 *
530 * If \ is followed by a null character, we return a negative value and leave
531 * the string pointer pointing at the null character.
532 *
533 * If \ is followed by 000, we return 0 and leave the string pointer after the
534 * zeros. A value of 0 does not mean end of string.
535 *
536 * POSIX doesn't allow \x.
537 */
538
539 int
parse_escape(const char ** string_ptr)540 parse_escape(const char **string_ptr)
541 {
542 int c = *(*string_ptr)++;
543 int i;
544 int count;
545 int j;
546 const char *start;
547
548 if (do_lint_old) {
549 switch (c) {
550 case 'a':
551 case 'b':
552 case 'f':
553 case 'r':
554 lintwarn(_("old awk does not support the `\\%c' escape sequence"), c);
555 break;
556 }
557 }
558
559 switch (c) {
560 case 'a':
561 return '\a';
562 case 'b':
563 return '\b';
564 case 'f':
565 return '\f';
566 case 'n':
567 return '\n';
568 case 'r':
569 return '\r';
570 case 't':
571 return '\t';
572 case 'v':
573 return '\v';
574 case '\n':
575 return -2;
576 case 0:
577 (*string_ptr)--;
578 return -1;
579 case '0':
580 case '1':
581 case '2':
582 case '3':
583 case '4':
584 case '5':
585 case '6':
586 case '7':
587 i = c - '0';
588 count = 0;
589 while (++count < 3) {
590 if ((c = *(*string_ptr)++) >= '0' && c <= '7') {
591 i *= 8;
592 i += c - '0';
593 } else {
594 (*string_ptr)--;
595 break;
596 }
597 }
598 return i;
599 case 'x':
600 if (do_lint) {
601 static bool warned = false;
602
603 if (! warned) {
604 warned = true;
605 lintwarn(_("POSIX does not allow `\\x' escapes"));
606 }
607 }
608 if (do_posix)
609 return ('x');
610 if (! isxdigit((unsigned char) (*string_ptr)[0])) {
611 warning(_("no hex digits in `\\x' escape sequence"));
612 return ('x');
613 }
614 start = *string_ptr;
615 for (i = j = 0; j < 2; j++) {
616 /* do outside test to avoid multiple side effects */
617 c = (unsigned char) *(*string_ptr)++;
618 if (isxdigit(c)) {
619 i *= 16;
620 if (isdigit(c))
621 i += c - '0';
622 else if (isupper(c))
623 i += c - 'A' + 10;
624 else
625 i += c - 'a' + 10;
626 } else {
627 (*string_ptr)--;
628 break;
629 }
630 }
631 if (do_lint && j == 2 && isxdigit((unsigned char)*(*string_ptr)))
632 lintwarn(_("hex escape \\x%.*s of %d characters probably not interpreted the way you expect"), 3, start, 3);
633 return i;
634 case '\\':
635 case '"':
636 return c;
637 default:
638 {
639 static bool warned[256];
640 unsigned char uc = (unsigned char) c;
641
642 /* N.B.: use unsigned char here to avoid Latin-1 problems */
643
644 if (! warned[uc]) {
645 warned[uc] = true;
646
647 warning(_("escape sequence `\\%c' treated as plain `%c'"), uc, uc);
648 }
649 }
650 return c;
651 }
652 }
653
/* get_numbase --- decide whether the number text in 's' is base 8, 10 or 16 */

int
get_numbase(const char *s, size_t len, bool use_locale)
{
	int radix_char = '.';
	size_t k;

#if defined(HAVE_LOCALE_H)
	/*
	 * loc.decimal_point may not have been initialized yet,
	 * so make sure it is usable before relying on it.
	 */
	if (use_locale && loc.decimal_point != NULL && loc.decimal_point[0] != '\0')
		radix_char = loc.decimal_point[0];	/* XXX --- assumes one char */
#endif

	/* anything that does not start with 0 plus one more character is decimal */
	if (len < 2 || s[0] != '0')
		return 10;

	/* leading 0x or 0X */
	switch (s[1]) {
	case 'x':
	case 'X':
		return 16;
	default:
		break;
	}

	/*
	 * A decimal point or an exponent letter within the leading run of
	 * digits makes the number decimal, so that things like 00.34 are
	 * handled right.  The scan stops at the first character that is
	 * none of these, which also copes with trailing white space.
	 */
	for (k = 0; k < len; k++) {
		const char ch = s[k];

		if (ch == 'e' || ch == 'E' || ch == radix_char)
			return 10;
		if (! isdigit((unsigned char) ch))
			break;
	}

	/* octal needs a second character that is a digit other than 8 or 9 */
	if (! isdigit((unsigned char) s[1]))
		return 10;
	if (s[1] == '8' || s[1] == '9')
		return 10;

	return 8;
}
697
698 /* str2wstr --- convert a multibyte string to a wide string */
699
700 NODE *
str2wstr(NODE * n,size_t ** ptr)701 str2wstr(NODE *n, size_t **ptr)
702 {
703 size_t i, count, src_count;
704 char *sp;
705 mbstate_t mbs;
706 wchar_t wc, *wsp;
707 static bool warned = false;
708
709 assert((n->flags & (STRING|STRCUR)) != 0);
710
711 /*
712 * Don't convert global null string or global null field
713 * variables to a wide string. They are both zero-length anyway.
714 * This also avoids future double-free errors while releasing
715 * shallow copies, eg. *tmp = *Null_field; free_wstr(tmp);
716 */
717 if (n == Nnull_string || n == Null_field)
718 return n;
719
720 if ((n->flags & WSTRCUR) != 0) {
721 if (ptr == NULL)
722 return n;
723 /* otherwise
724 fall through and recompute to fill in the array */
725 free_wstr(n);
726 }
727
728 /*
729 * After consideration and consultation, this
730 * code trades space for time. We allocate
731 * an array of wchar_t that is n->stlen long.
732 * This is needed in the worst case anyway, where
733 * each input byte maps to one wchar_t. The
734 * advantage is that we only have to convert the string
735 * once, instead of twice, once to find out how many
736 * wide characters, and then again to actually fill in
737 * the info. If there's a lot left over, we can
738 * realloc the wide string down in size.
739 */
740
741 emalloc(n->wstptr, wchar_t *, sizeof(wchar_t) * (n->stlen + 1), "str2wstr");
742 wsp = n->wstptr;
743
744 /*
745 * For use by do_match, create and fill in an array.
746 * For each byte `i' in n->stptr (the original string),
747 * a[i] is equal to `j', where `j' is the corresponding wchar_t
748 * in the converted wide string.
749 *
750 * Create the array.
751 */
752 if (ptr != NULL) {
753 ezalloc(*ptr, size_t *, sizeof(size_t) * n->stlen, "str2wstr");
754 }
755
756 sp = n->stptr;
757 src_count = n->stlen;
758 memset(& mbs, 0, sizeof(mbs));
759 for (i = 0; src_count > 0; i++) {
760 /*
761 * 9/2010: Check the current byte; if it's a valid character,
762 * then it doesn't start a multibyte sequence. This brings a
763 * big speed up. Thanks to Ulrich Drepper for the tip.
764 * 11/2010: Thanks to Paolo Bonzini for some even faster code.
765 */
766 if (is_valid_character(*sp)) {
767 count = 1;
768 wc = btowc_cache(*sp);
769 } else
770 count = mbrtowc(& wc, sp, src_count, & mbs);
771 switch (count) {
772 case (size_t) -2:
773 case (size_t) -1:
774 /*
775 * mbrtowc(3) says the state of mbs becomes undefined
776 * after a bad character, so reset it.
777 */
778 memset(& mbs, 0, sizeof(mbs));
779
780 /* Warn the user something's wrong */
781 if (! warned) {
782 warned = true;
783 warning(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale"));
784 }
785
786 /*
787 * 8/2015: If we're using UTF, then instead of just
788 * skipping the character, plug in the Unicode
789 * replacement character. In most cases this gives
790 * us "better" results, in that character counts
791 * and string lengths tend to make more sense.
792 *
793 * Otherwise, just skip the bad byte and keep going,
794 * so that we get a more-or-less full string, instead of
795 * stopping early. This is particularly important
796 * for match() where we need to build the indices.
797 */
798 if (using_utf8()) {
799 count = 1;
800 wc = 0xFFFD; /* unicode replacement character */
801 goto set_wc;
802 } else {
803 /* skip it and keep going */
804 sp++;
805 src_count--;
806 }
807 break;
808
809 case 0:
810 count = 1;
811 /* fall through */
812 default:
813 set_wc:
814 *wsp++ = wc;
815 src_count -= count;
816 while (count--) {
817 if (ptr != NULL)
818 (*ptr)[sp - n->stptr] = i;
819 sp++;
820 }
821 break;
822 }
823 }
824
825 *wsp = L'\0';
826 n->wstlen = wsp - n->wstptr;
827 n->flags |= WSTRCUR;
828 #define ARBITRARY_AMOUNT_TO_GIVE_BACK 100
829 if (n->stlen - n->wstlen > ARBITRARY_AMOUNT_TO_GIVE_BACK)
830 erealloc(n->wstptr, wchar_t *, sizeof(wchar_t) * (n->wstlen + 1), "str2wstr");
831
832 return n;
833 }
834
835 /* wstr2str --- convert a wide string back into multibyte one */
836
837 NODE *
wstr2str(NODE * n)838 wstr2str(NODE *n)
839 {
840 size_t result;
841 size_t length;
842 wchar_t *wp;
843 mbstate_t mbs;
844 char *newval, *cp;
845
846 assert(n->valref == 1);
847 assert((n->flags & WSTRCUR) != 0);
848
849 /*
850 * Convert the wide chars in t1->wstptr back into m.b. chars.
851 * This is pretty grotty, but it's the most straightforward
852 * way to do things.
853 */
854 memset(& mbs, 0, sizeof(mbs));
855
856 length = n->wstlen;
857 emalloc(newval, char *, (length * gawk_mb_cur_max) + 1, "wstr2str");
858
859 wp = n->wstptr;
860 for (cp = newval; length > 0; length--) {
861 result = wcrtomb(cp, *wp, & mbs);
862 if (result == (size_t) -1) /* what to do? break seems best */
863 break;
864 cp += result;
865 wp++;
866 }
867 *cp = '\0';
868
869 /* N.B. caller just created n with make_string, so this free is safe */
870 efree(n->stptr);
871 n->stptr = newval;
872 n->stlen = cp - newval;
873
874 return n;
875 }
876
877 /* free_wstr --- release the wide string part of a node */
878
879 void
r_free_wstr(NODE * n)880 r_free_wstr(NODE *n)
881 {
882 assert(n->type == Node_val);
883
884 if ((n->flags & WSTRCUR) != 0) {
885 assert(n->wstptr != NULL);
886 efree(n->wstptr);
887 }
888 n->wstptr = NULL;
889 n->wstlen = 0;
890 n->flags &= ~WSTRCUR;
891 }
892
/* dump_wstr --- write 'len' wide characters of 'str' to 'fp' */

static void __attribute__ ((unused))
dump_wstr(FILE *fp, const wchar_t *str, size_t len)
{
	size_t k;

	/* nothing to print */
	if (str == NULL || len == 0)
		return;

	for (k = 0; k < len; k++)
		putwc(str[k], fp);
}
902
/*
 * wstrstr --- walk haystack, looking for needle, wide char version
 *
 * Returns a pointer to the first occurrence of the needle_len wide
 * characters at needle within the first hs_len wide characters of
 * haystack, or NULL when there is none.
 *
 * NULL is also returned when either pointer is NULL, when the needle is
 * longer than the haystack, and when the needle is empty.  An empty
 * needle has no first or last character to compare against, so it is
 * rejected up front instead of indexing needle[needle_len - 1] with a
 * wrapped-around index.
 */

const wchar_t *
wstrstr(const wchar_t *haystack, size_t hs_len,
	const wchar_t *needle, size_t needle_len)
{
	size_t i;
	size_t last_start;

	if (haystack == NULL || needle == NULL)
		return NULL;
	if (needle_len == 0 || needle_len > hs_len)
		return NULL;

	/* highest index at which a full-length match can still begin */
	last_start = hs_len - needle_len;

	for (i = 0; i <= last_start; i++) {
		/* cheap first & last character test before the full compare */
		if (haystack[i] == needle[0]
		    && haystack[i + needle_len - 1] == needle[needle_len - 1]
		    && memcmp(haystack + i, needle, sizeof(wchar_t) * needle_len) == 0)
			return haystack + i;
	}

	return NULL;
}
927
/*
 * wcasestrstr --- walk haystack, nocase look for needle, wide char version
 *
 * Like a substring search over the first hs_len wide characters of
 * haystack, except that characters are compared after passing both
 * sides through towlower().  Returns a pointer to the first match, or
 * NULL when there is none.
 *
 * NULL is also returned when either pointer is NULL, when the needle is
 * longer than the haystack, and when the needle is empty.  An empty
 * needle has no first or last character to compare against, so it is
 * rejected up front instead of indexing needle[needle_len - 1] with a
 * wrapped-around index.
 */

const wchar_t *
wcasestrstr(const wchar_t *haystack, size_t hs_len,
	const wchar_t *needle, size_t needle_len)
{
	size_t i, j;
	size_t last_start;
	wint_t first_ch, last_ch;

	if (haystack == NULL || needle == NULL)
		return NULL;
	if (needle_len == 0 || needle_len > hs_len)
		return NULL;

	/* the needle's end characters do not change; fold them once */
	first_ch = towlower(needle[0]);
	last_ch = towlower(needle[needle_len - 1]);

	/* highest index at which a full-length match can still begin */
	last_start = hs_len - needle_len;

	for (i = 0; i <= last_start; i++) {
		/* cheap first & last character test before the full compare */
		if (towlower(haystack[i]) != first_ch
		    || towlower(haystack[i + needle_len - 1]) != last_ch)
			continue;

		for (j = 0; j < needle_len; j++) {
			if (towlower(haystack[i + j]) != towlower(needle[j]))
				break;
		}
		if (j == needle_len)
			return haystack + i;
	}

	return NULL;
}
962
/* is_ieee_magic_val --- true if the text starts with +inf, -inf, +nan or -nan */

bool
is_ieee_magic_val(const char *val)
{
	/*
	 * Letters are compared by hand, in either case, rather than with
	 * strncasecmp, which mishandles ASCII bytes in some locales.
	 *
	 * Only val[0] through val[3] are looked at, and each character
	 * is examined only after all the earlier ones have matched.
	 * Nothing beyond those four characters is checked.
	 */
	if (val[0] != '+' && val[0] != '-')
		return false;

	if (val[1] == 'i' || val[1] == 'I')
		return (val[2] == 'n' || val[2] == 'N')
			&& (val[3] == 'f' || val[3] == 'F');

	if (val[1] == 'n' || val[1] == 'N')
		return (val[2] == 'a' || val[2] == 'A')
			&& (val[3] == 'n' || val[3] == 'N');

	return false;
}
980
981 /* get_ieee_magic_val --- return magic value for string */
982
983 static AWKNUM
get_ieee_magic_val(char * val)984 get_ieee_magic_val(char *val)
985 {
986 static bool first = true;
987 static AWKNUM inf;
988 static AWKNUM nan;
989 char save;
990
991 char *ptr;
992 save = val[4];
993 val[4] = '\0';
994 AWKNUM v = strtod(val, &ptr);
995 val[4] = save;
996
997 if (val == ptr) { /* Older strtod implementations don't support inf or nan. */
998 if (first) {
999 first = false;
1000 nan = sqrt(-1.0);
1001 inf = -log(0.0);
1002 }
1003
1004 v = ((val[1] == 'i' || val[1] == 'I') ? inf : nan);
1005 if (val[0] == '-')
1006 v = -v;
1007 }
1008
1009 return v;
1010 }
1011
/* one entry per possible byte value: the result of btowc() for that byte */
wint_t btowc_cache[256];

/* init_btowc_cache --- fill btowc_cache[] with btowc() of every byte value */

void
init_btowc_cache(void)
{
	int i;
	const int nbytes = (int) (sizeof(btowc_cache) / sizeof(btowc_cache[0]));

	/*
	 * btowc() is called once per byte value, so the results reflect
	 * whatever locale is in effect when this function runs.
	 */
	for (i = 0; i < nbytes; i++)
		btowc_cache[i] = btowc(i);
}
1024
/* number of items that more_blocks() obtains with each allocation */
1025 #define BLOCKCHUNK 100
1026
/*
 * Per-block-type allocator state, indexed by block id.  Each entry is
 * initialised with an empty free list, the size of one item, and a name.
 * The functions below also read and update the entries' size, freep,
 * active and highwater fields.
 */
1027 struct block_header nextfree[BLOCK_MAX] = {
1028 { NULL, sizeof(NODE), "node" },
1029 { NULL, sizeof(BUCKET), "bucket" },
1030 };
1031
1032 #ifdef MEMDEBUG
1033
1034 void *
r_getblock(int id)1035 r_getblock(int id)
1036 {
1037 void *res;
1038 emalloc(res, void *, nextfree[id].size, "getblock");
1039 nextfree[id].active++;
1040 if (nextfree[id].highwater < nextfree[id].active)
1041 nextfree[id].highwater = nextfree[id].active;
1042 return res;
1043 }
1044
1045 void
r_freeblock(void * p,int id)1046 r_freeblock(void *p, int id)
1047 {
1048 nextfree[id].active--;
1049 free(p);
1050 }
1051
1052 #else
1053
1054 /* more_blocks --- get more blocks of memory and add to the free list;
1055 size of a block must be >= sizeof(struct block_item)
1056 */
1057
1058 void *
more_blocks(int id)1059 more_blocks(int id)
1060 {
1061 struct block_item *freep, *np, *next;
1062 char *p, *endp;
1063 size_t size;
1064
1065 size = nextfree[id].size;
1066
1067 assert(size >= sizeof(struct block_item));
1068 emalloc(freep, struct block_item *, BLOCKCHUNK * size, "more_blocks");
1069 p = (char *) freep;
1070 endp = p + BLOCKCHUNK * size;
1071
1072 for (np = freep; ; np = next) {
1073 next = (struct block_item *) (p += size);
1074 if (p >= endp) {
1075 np->freep = NULL;
1076 break;
1077 }
1078 np->freep = next;
1079 }
1080 nextfree[id].freep = freep->freep;
1081 nextfree[id].highwater += BLOCKCHUNK;
1082 return freep;
1083 }
1084
1085 #endif
1086