1 /*
2 * field.c - routines for dealing with fields and record parsing
3 */
4
5 /*
6 * Copyright (C) 1986, 1988, 1989, 1991-2021 the Free Software Foundation, Inc.
7 *
8 * This file is part of GAWK, the GNU implementation of the
9 * AWK Programming Language.
10 *
11 * GAWK is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 3 of the License, or
14 * (at your option) any later version.
15 *
16 * GAWK is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 */
25
26 #include "awk.h"
27
/*
 * Private replacement for isblank(), used unconditionally so that no
 * autoconf test is needed for systems lacking it.
 * See dfa.c and regex_internal.h and regcomp.c. Bleah.
 *
 * Returns 1 when c is a space or a horizontal tab, 0 otherwise.
 */
static int
is_blank(int c)
{
	switch (c) {
	case ' ':
	case '\t':
		return 1;
	default:
		return 0;
	}
}
38
/*
 * Setfunc --- callback through which the parsers store one parsed piece.
 * set_field() and set_element() both have this shape:
 * (index, start of text, length of text, target node).
 */
typedef void (* Setfunc)(long, char *, long, NODE *);

/* is the API currently overriding the default parsing mechanism? */
static bool api_parser_override = false;
/*
 * Common signature of the field parsers. In their definitions the
 * arguments are named, in order: up_to, buf, len, fs, rp, set, n,
 * sep_arr, in_middle. The return value is a field count.
 */
typedef long (*parse_field_func_t)(long, char **, int, NODE *,
			     Regexp *, Setfunc, NODE *, NODE *, bool);
/* the parser currently in effect; get_field() calls through this pointer */
static parse_field_func_t parse_field;
/*
 * N.B. The normal_parse_field function pointer contains the parse_field value
 * that should be used except when API field parsing is overriding the default
 * field parsing mechanism.
 */
static parse_field_func_t normal_parse_field;
static long re_parse_field(long, char **, int, NODE *,
			     Regexp *, Setfunc, NODE *, NODE *, bool);
static long def_parse_field(long, char **, int, NODE *,
			      Regexp *, Setfunc, NODE *, NODE *, bool);
static long null_parse_field(long, char **, int, NODE *,
			     Regexp *, Setfunc, NODE *, NODE *, bool);
static long sc_parse_field(long, char **, int, NODE *,
			     Regexp *, Setfunc, NODE *, NODE *, bool);
static long fw_parse_field(long, char **, int, NODE *,
			     Regexp *, Setfunc, NODE *, NODE *, bool);
/* field-width info handed to the most recent set_record() call, or NULL */
static const awk_fieldwidth_info_t *api_fw = NULL;
static long fpat_parse_field(long, char **, int, NODE *,
			     Regexp *, Setfunc, NODE *, NODE *, bool);
static void set_element(long num, char * str, long len, NODE *arr);
static void grow_fields_arr(long num);
static void set_field(long num, char *str, long len, NODE *dummy);
static void purge_record(void);

static char *parse_extent;	/* marks where to restart parse of record */
static long parse_high_water = 0; /* field number that we have parsed so far */
static long nf_high_water = 0;	/* size of fields_arr */
static bool resave_fs;		/* when set, purge_record() refreshes save_FS from FS */
static NODE *save_FS;		/* save current value of FS when line is read,
				 * to be used in deferred parsing
				 */
static NODE *save_FPAT;		/* save current value of FPAT when line is read,
				 * to be used in deferred parsing
				 */
/* widths used by fw_parse_field() when the API is not overriding parsing */
static awk_fieldwidth_info_t *FIELDWIDTHS = NULL;

NODE **fields_arr;		/* array of pointers to the field nodes */
bool field0_valid;		/* $(>0) has not been changed yet */
int default_FS;			/* true when FS == " " */
Regexp *FS_re_yes_case = NULL;
Regexp *FS_re_no_case = NULL;
Regexp *FS_regexp = NULL;
Regexp *FPAT_re_yes_case = NULL;
Regexp *FPAT_re_no_case = NULL;
Regexp *FPAT_regexp = NULL;
NODE *Null_field = NULL;	/* template node copied into every unset field slot */

/* clear_mpfr --- drop the MPFN, MPZN and NUMCUR flag bits from node n */
#define clear_mpfr(n) ((n)->flags &= ~(MPFN | MPZN | NUMCUR))
94
95 /* init_fields --- set up the fields array to start with */
96
97 void
init_fields()98 init_fields()
99 {
100 emalloc(fields_arr, NODE **, sizeof(NODE *), "init_fields");
101
102 fields_arr[0] = make_string("", 0);
103 fields_arr[0]->flags |= NULL_FIELD;
104
105 parse_extent = fields_arr[0]->stptr;
106 save_FS = dupnode(FS_node->var_value);
107
108 Null_field = make_string("", 0);
109 Null_field->flags = (STRCUR|STRING|NULL_FIELD); /* do not set MALLOC */
110
111 field0_valid = true;
112 }
113
114 /* grow_fields --- acquire new fields as needed */
115
116 static void
grow_fields_arr(long num)117 grow_fields_arr(long num)
118 {
119 int t;
120 NODE *n;
121
122 erealloc(fields_arr, NODE **, (num + 1) * sizeof(NODE *), "grow_fields_arr");
123 for (t = nf_high_water + 1; t <= num; t++) {
124 getnode(n);
125 *n = *Null_field;
126 fields_arr[t] = n;
127 }
128 nf_high_water = num;
129 }
130
131 /* set_field --- set the value of a particular field */
132
133 /*ARGSUSED*/
134 static void
set_field(long num,char * str,long len,NODE * dummy ATTRIBUTE_UNUSED)135 set_field(long num,
136 char *str,
137 long len,
138 NODE *dummy ATTRIBUTE_UNUSED) /* just to make interface same as set_element */
139 {
140 NODE *n;
141
142 if (num > nf_high_water)
143 grow_fields_arr(num);
144 n = fields_arr[num];
145 n->stptr = str;
146 n->stlen = len;
147 n->flags = (STRCUR|STRING|USER_INPUT); /* do not set MALLOC */
148 }
149
/*
 * rebuild_record --- Someone assigned a value to $(something).
 *			Fix up $0 to be right
 *
 * Builds a new $0 by joining $1..$NF with OFS, re-homes every field
 * that was pointing into the old $0 text so that it points into the
 * new text instead, and then replaces fields_arr[0]. NF must already
 * be known (asserted below). Sets field0_valid on exit.
 */

void
rebuild_record()
{
	/*
	 * use explicit unsigned longs for lengths, in case
	 * a size_t isn't big enough.
	 */
	unsigned long tlen;
	NODE *tmp;
	char *ops;
	char *cops;
	long i;

	assert(NF != -1);

	/* pass 1: make every field a string and total up their lengths */
	tlen = 0;
	for (i = NF; i > 0; i--) {
		tmp = fields_arr[i];
		tmp = force_string(tmp);
		tlen += tmp->stlen;
	}
	/* room for the NF - 1 separators; goes "negative" when NF is 0 */
	tlen += (NF - 1) * OFSlen;
	if ((long) tlen < 0)
		tlen = 0;
	emalloc(ops, char *, tlen + 1, "rebuild_record");
	cops = ops;
	ops[0] = '\0';
	/* pass 2: copy fields and separators into the new buffer */
	for (i = 1;  i <= NF; i++) {
		free_wstr(fields_arr[i]);
		tmp = fields_arr[i];
		/* copy field */
		if (tmp->stlen == 1)
			*cops++ = tmp->stptr[0];
		else if (tmp->stlen != 0) {
			memcpy(cops, tmp->stptr, tmp->stlen);
			cops += tmp->stlen;
		}
		/* copy OFS */
		if (i != NF) {
			if (OFSlen == 1)
				*cops++ = *OFS;
			else if (OFSlen != 0) {
				memcpy(cops, OFS, OFSlen);
				cops += OFSlen;
			}
		}
	}
	/* the new $0 node takes over the buffer (ALREADY_MALLOCED) */
	tmp = make_str_node(ops, tlen, ALREADY_MALLOCED);

	/*
	 * Since we are about to unref fields_arr[0], we want to find
	 * any fields that still point into it, and have them point
	 * into the new field zero.  This has to be done intelligently,
	 * so that unrefing a field doesn't try to unref into the old $0.
	 */
	/* pass 3: cops tracks where field i starts inside the new buffer */
	for (cops = ops, i = 1; i <= NF; i++) {
		NODE *r = fields_arr[i];
		/*
		 * There is no reason to copy malloc'ed fields to point into
		 * the new $0 buffer, although that's how previous versions did
		 * it. It seems faster to leave the malloc'ed fields in place.
		 */
		if (r->stlen > 0 && (r->flags & MALLOC) == 0) {
			NODE *n;
			getnode(n);

			/* n starts as a copy of r and replaces it in fields_arr */
			*n = *r;
			if (r->valref > 1) {
				/*
				 * This can and does happen. It seems clear that
				 * we can't leave r's stptr pointing into the
				 * old $0 buffer that we are about to unref.
				 */
				/* give r its own private copy of the text */
				emalloc(r->stptr, char *, r->stlen + 1, "rebuild_record");
				memcpy(r->stptr, cops, r->stlen);
				r->stptr[r->stlen] = '\0';
				r->flags |= MALLOC;

				n->valref = 1;	// reset in the new field to start it off correctly!
			}

			n->stptr = cops;
			clear_mpfr(n);
			unref(r);
			fields_arr[i] = n;
			assert((n->flags & WSTRCUR) == 0);
		}
		/* step past this field and the separator that follows it */
		cops += fields_arr[i]->stlen + OFSlen;
	}

	/* a $0 that does not own its text must not be referenced elsewhere */
	assert((fields_arr[0]->flags & MALLOC) == 0
		? fields_arr[0]->valref == 1
		: true);

	unref(fields_arr[0]);

	fields_arr[0] = tmp;
	field0_valid = true;
}
252
253 /*
254 * set_record:
255 * setup $0, but defer parsing rest of line until reference is made to $(>0)
256 * or to NF. At that point, parse only as much as necessary.
257 *
258 * Manage a private buffer for the contents of $0. Doing so keeps us safe
259 * if `getline var' decides to rearrange the contents of the IOBUF that
260 * $0 might have been pointing into. The cost is the copying of the buffer;
261 * but better correct than fast.
262 */
263 void
set_record(const char * buf,int cnt,const awk_fieldwidth_info_t * fw)264 set_record(const char *buf, int cnt, const awk_fieldwidth_info_t *fw)
265 {
266 NODE *n;
267 static char *databuf;
268 static unsigned long databuf_size;
269 #define INITIAL_SIZE 512
270 #define MAX_SIZE ((unsigned long) ~0) /* maximally portable ... */
271
272 purge_record();
273
274 /* buffer management: */
275 if (databuf_size == 0) { /* first time */
276 ezalloc(databuf, char *, INITIAL_SIZE, "set_record");
277 databuf_size = INITIAL_SIZE;
278 }
279 /*
280 * Make sure there's enough room. Since we sometimes need
281 * to place a sentinel at the end, we make sure
282 * databuf_size is > cnt after allocation.
283 */
284 if (cnt >= databuf_size) {
285 do {
286 if (databuf_size > MAX_SIZE/2)
287 fatal(_("input record too large"));
288 databuf_size *= 2;
289 } while (cnt >= databuf_size);
290 erealloc(databuf, char *, databuf_size, "set_record");
291 memset(databuf, '\0', databuf_size);
292 }
293 /* copy the data */
294 if (cnt != 0) {
295 memcpy(databuf, buf, cnt);
296 }
297
298 /*
299 * Add terminating '\0' so that C library routines
300 * will know when to stop.
301 */
302 databuf[cnt] = '\0';
303
304 /* manage field 0: */
305 assert((fields_arr[0]->flags & MALLOC) == 0
306 ? fields_arr[0]->valref == 1
307 : true);
308
309 unref(fields_arr[0]);
310 getnode(n);
311 n->stptr = databuf;
312 n->stlen = cnt;
313 n->valref = 1;
314 n->type = Node_val;
315 n->stfmt = STFMT_UNUSED;
316 #ifdef HAVE_MPFR
317 n->strndmode = MPFR_round_mode;
318 #endif
319 n->flags = (STRING|STRCUR|USER_INPUT); /* do not set MALLOC */
320 fields_arr[0] = n;
321 if (fw != api_fw) {
322 if ((api_fw = fw) != NULL) {
323 if (! api_parser_override) {
324 api_parser_override = true;
325 parse_field = fw_parse_field;
326 update_PROCINFO_str("FS", "API");
327 }
328 } else if (api_parser_override) {
329 api_parser_override = false;
330 parse_field = normal_parse_field;
331 update_PROCINFO_str("FS", current_field_sep_str());
332 }
333 }
334
335 #undef INITIAL_SIZE
336 #undef MAX_SIZE
337 }
338
339 /* reset_record --- start over again with current $0 */
340
341 void
reset_record()342 reset_record()
343 {
344 fields_arr[0] = force_string(fields_arr[0]);
345 purge_record();
346 if (api_parser_override) {
347 api_parser_override = false;
348 parse_field = normal_parse_field;
349 update_PROCINFO_str("FS", current_field_sep_str());
350 }
351 }
352
353 /*
354 * purge_record --- throw away the fields, make sure that
355 * individual nodes remain valid.
356 */
357
358 static void
purge_record()359 purge_record()
360 {
361 int i;
362
363 NF = -1;
364 for (i = 1; i <= parse_high_water; i++) {
365 NODE *n;
366 NODE *r = fields_arr[i];
367 if ((r->flags & MALLOC) == 0 && r->valref > 1) {
368 /* This can and does happen. We must copy the string! */
369 const char *save = r->stptr;
370 emalloc(r->stptr, char *, r->stlen + 1, "purge_record");
371 memcpy(r->stptr, save, r->stlen);
372 r->stptr[r->stlen] = '\0';
373 r->flags |= MALLOC;
374 }
375 unref(r);
376 getnode(n);
377 *n = *Null_field;
378 fields_arr[i] = n;
379 }
380
381 parse_high_water = 0;
382 /*
383 * $0 = $0 should resplit using the current value of FS.
384 */
385 if (resave_fs) {
386 resave_fs = false;
387 unref(save_FS);
388 save_FS = dupnode(FS_node->var_value);
389 }
390
391 field0_valid = true;
392 }
393
394 /* set_NF --- handle what happens to $0 and fields when NF is changed */
395
396 void
set_NF()397 set_NF()
398 {
399 int i;
400 long nf;
401 NODE *n;
402
403 assert(NF != -1);
404
405 (void) force_number(NF_node->var_value);
406 nf = get_number_si(NF_node->var_value);
407 if (nf < 0)
408 fatal(_("NF set to negative value"));
409
410 static bool warned = false;
411 if (do_lint && NF > nf && ! warned) {
412 warned = true;
413 lintwarn(_("decrementing NF is not portable to many awk versions"));
414 }
415
416 NF = nf;
417
418 if (NF > nf_high_water)
419 grow_fields_arr(NF);
420 if (parse_high_water < NF) {
421 for (i = parse_high_water + 1; i >= 0 && i <= NF; i++) {
422 unref(fields_arr[i]);
423 getnode(n);
424 *n = *Null_field;
425 fields_arr[i] = n;
426 }
427 parse_high_water = NF;
428 } else if (parse_high_water > 0) {
429 for (i = NF + 1; i >= 0 && i <= parse_high_water; i++) {
430 unref(fields_arr[i]);
431 getnode(n);
432 *n = *Null_field;
433 fields_arr[i] = n;
434 }
435 parse_high_water = NF;
436 }
437 field0_valid = false;
438 }
439
/*
 * re_parse_field --- parse fields using a regexp.
 *
 * This is called both from get_field() and from do_split()
 * via (*parse_field)().  This variation is for when FS is a regular
 * expression -- either user-defined or because RS=="" and FS==" "
 *
 * Returns the number of the last field set. On return *buf points at
 * the place where a later call should resume scanning.
 */
static long
re_parse_field(long up_to,	/* parse only up to this field number */
	char **buf,	/* on input: string to parse; on output: point to start next */
	int len,
	NODE *fs ATTRIBUTE_UNUSED,
	Regexp *rp,
	Setfunc set,	/* routine to set the value of the parsed field */
	NODE *n,
	NODE *sep_arr,  /* array of field separators (maybe NULL) */
	bool in_middle)
{
	char *scan = *buf;
	long nf = parse_high_water;
	char *field;
	char *end = scan + len;
	int regex_flags = RE_NEED_START;
	char *sep;
	size_t mbclen = 0;
	mbstate_t mbs;

	memset(&mbs, 0, sizeof(mbstate_t));

	/* resuming part-way through a record: scan does not start at its beginning */
	if (in_middle)
		regex_flags |= RE_NO_BOL;

	/* UNLIMITED is what do_split() passes: number the pieces from 1 */
	if (up_to == UNLIMITED)
		nf = 0;
	if (len == 0)
		return nf;

	bool default_field_splitting = (RS_is_null && default_FS);

	if (default_field_splitting) {
		/* skip leading whitespace, recording it as separator number nf */
		sep = scan;
		while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
			scan++;
		if (sep_arr != NULL && sep < scan)
			set_element(nf, sep, (long)(scan - sep), sep_arr);
	}

	if (rp == NULL) /* use FS */
		rp = FS_regexp;

	/*
	 * field marks the start of the field being collected; scan is where
	 * the next search begins. RESTART/REEND are used below as the match's
	 * start and end offsets relative to scan.
	 */
	field = scan;
	while (scan < end
	       && research(rp, scan, 0, (end - scan), regex_flags) != -1
	       && nf < up_to) {
		regex_flags |= RE_NO_BOL;
		if (REEND(rp, scan) == RESTART(rp, scan)) {   /* null match */
			/* step over one character and retry; field stays put */
			if (gawk_mb_cur_max > 1)	{
				mbclen = mbrlen(scan, end-scan, &mbs);
				if ((mbclen == 1) || (mbclen == (size_t) -1)
				    || (mbclen == (size_t) -2) || (mbclen == 0)) {
					/* We treat it as a singlebyte character.  */
					mbclen = 1;
				}
				scan += mbclen;
			} else
				scan++;
			if (scan == end) {
				/* ran out of text: the remainder is the last field */
				(*set)(++nf, field, (long)(scan - field), n);
				up_to = nf;	/* suppresses the tail handling after the loop */
				break;
			}
			continue;
		}
		/* the field runs from `field' up to the start of the match */
		(*set)(++nf, field,
		       (long)(scan + RESTART(rp, scan) - field), n);
		if (sep_arr != NULL)
	    		set_element(nf, scan + RESTART(rp, scan),
           			(long) (REEND(rp, scan) - RESTART(rp, scan)), sep_arr);
		scan += REEND(rp, scan);
		field = scan;
		if (scan == end && ! default_field_splitting)	/* FS at end of record */
			(*set)(++nf, field, 0L, n);
	}
	/* no further separator found: whatever is left is one more field */
	if (nf != up_to && scan < end) {
		(*set)(++nf, scan, (long)(end - scan), n);
		scan = end;
	}
	*buf = scan;
	return nf;
}
530
/*
 * def_parse_field --- default field parsing.
 *
 * This is called both from get_field() and from do_split()
 * via (*parse_field)().  This variation is for when FS is a single space
 * character.
 *
 * Fields are maximal runs of characters other than space, tab and
 * newline. Returns the number of the last field set; on return *buf
 * points at the place where a later call should resume scanning.
 *
 * The byte at (*buf)[len] is overwritten temporarily, so it must be
 * writable.
 */

static long
def_parse_field(long up_to,	/* parse only up to this field number */
	char **buf,	/* on input: string to parse; on output: point to start next */
	int len,
	NODE *fs,
	Regexp *rp ATTRIBUTE_UNUSED,
	Setfunc set,	/* routine to set the value of the parsed field */
	NODE *n,
	NODE *sep_arr,  /* array of field separators (maybe NULL) */
	bool in_middle ATTRIBUTE_UNUSED)
{
	char *scan = *buf;
	long nf = parse_high_water;
	char *field;
	char *end = scan + len;
	char sav;
	char *sep;

	/* UNLIMITED is what do_split() passes: number the pieces from 1 */
	if (up_to == UNLIMITED)
		nf = 0;
	if (len == 0)
		return nf;

	/*
	 * Nasty special case. If FS set to "", return whole record
	 * as first field. This is not worth a separate function.
	 */
	if (fs->stlen == 0) {
		(*set)(++nf, *buf, len, n);
		*buf += len;
		return nf;
	}

	/* before doing anything save the char at *end */
	sav = *end;
	/* because it will be destroyed now: */

	/*
	 * With a blank stored at *end, the field-scanning loop below
	 * can run without checking for the end of the buffer.
	 */
	*end = ' ';	/* sentinel character */
	sep = scan;
	/* the scan++ in the loop header steps over the character that ended the field */
	for (; nf < up_to; scan++) {
		/*
		 * special case:  fs is single space, strip leading whitespace
		 */
		while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
			scan++;

		/* record the whitespace run just skipped as separator number nf */
		if (sep_arr != NULL && scan > sep)
			set_element(nf, sep, (long) (scan - sep), sep_arr);

		if (scan >= end)
			break;

		field = scan;

		/* stops at the sentinel at the latest */
		while (*scan != ' ' && *scan != '\t' && *scan != '\n')
			scan++;

		(*set)(++nf, field, (long)(scan - field), n);

		if (scan == end)
			break;

		sep = scan;
	}

	/* everything done, restore original char at *end */
	*end = sav;

	*buf = scan;
	return nf;
}
610
611 /*
612 * null_parse_field --- each character is a separate field
613 *
614 * This is called both from get_field() and from do_split()
615 * via (*parse_field)(). This variation is for when FS is the null string.
616 */
617 static long
null_parse_field(long up_to,char ** buf,int len,NODE * fs ATTRIBUTE_UNUSED,Regexp * rp ATTRIBUTE_UNUSED,Setfunc set,NODE * n,NODE * sep_arr,bool in_middle ATTRIBUTE_UNUSED)618 null_parse_field(long up_to, /* parse only up to this field number */
619 char **buf, /* on input: string to parse; on output: point to start next */
620 int len,
621 NODE *fs ATTRIBUTE_UNUSED,
622 Regexp *rp ATTRIBUTE_UNUSED,
623 Setfunc set, /* routine to set the value of the parsed field */
624 NODE *n,
625 NODE *sep_arr, /* array of field separators (maybe NULL) */
626 bool in_middle ATTRIBUTE_UNUSED)
627 {
628 char *scan = *buf;
629 long nf = parse_high_water;
630 char *end = scan + len;
631
632 if (up_to == UNLIMITED)
633 nf = 0;
634 if (len == 0)
635 return nf;
636
637 if (gawk_mb_cur_max > 1) {
638 mbstate_t mbs;
639 memset(&mbs, 0, sizeof(mbstate_t));
640 for (; nf < up_to && scan < end;) {
641 size_t mbclen = mbrlen(scan, end-scan, &mbs);
642 if ((mbclen == 1) || (mbclen == (size_t) -1)
643 || (mbclen == (size_t) -2) || (mbclen == 0)) {
644 /* We treat it as a singlebyte character. */
645 mbclen = 1;
646 }
647 if (sep_arr != NULL && nf > 0)
648 set_element(nf, scan, 0L, sep_arr);
649 (*set)(++nf, scan, mbclen, n);
650 scan += mbclen;
651 }
652 } else {
653 for (; nf < up_to && scan < end; scan++) {
654 if (sep_arr != NULL && nf > 0)
655 set_element(nf, scan, 0L, sep_arr);
656 (*set)(++nf, scan, 1L, n);
657 }
658 }
659
660 *buf = scan;
661 return nf;
662 }
663
/*
 * sc_parse_field --- single character field separator
 *
 * This is called both from get_field() and from do_split()
 * via (*parse_field)().  This variation is for when FS is a single character
 * other than space.
 *
 * Returns the number of the last field set; on return *buf points at
 * the place where a later call should resume scanning.
 *
 * The byte at (*buf)[len] is overwritten temporarily, so it must be
 * writable.
 */
static long
sc_parse_field(long up_to,	/* parse only up to this field number */
	char **buf,	/* on input: string to parse; on output: point to start next */
	int len,
	NODE *fs,
	Regexp *rp ATTRIBUTE_UNUSED,
	Setfunc set,	/* routine to set the value of the parsed field */
	NODE *n,
	NODE *sep_arr,  /* array of field separators (maybe NULL) */
	bool in_middle ATTRIBUTE_UNUSED)
{
	char *scan = *buf;
	char fschar;
	long nf = parse_high_water;
	char *field;
	char *end = scan + len;
	char sav;
	size_t mbclen = 0;
	mbstate_t mbs;

	memset(&mbs, 0, sizeof(mbstate_t));

	/* UNLIMITED is what do_split() passes: number the pieces from 1 */
	if (up_to == UNLIMITED)
		nf = 0;
	if (len == 0)
		return nf;

	/* an empty fs while RS is null means: split on newline */
	if (RS_is_null && fs->stlen == 0)
		fschar = '\n';
	else
		fschar = fs->stptr[0];

	/* before doing anything save the char at *end */
	sav = *end;
	/* because it will be destroyed now: */
	/*
	 * With the separator stored at *end, the scanning loops below
	 * can look for fschar without checking for the end of the buffer.
	 */
	*end = fschar;	/* sentinel character */

	for (; nf < up_to;) {
		field = scan;
		if (gawk_mb_cur_max > 1) {
			/* advance a whole character at a time */
			while (*scan != fschar) {
				mbclen = mbrlen(scan, end-scan, &mbs);
				if ((mbclen == 1) || (mbclen == (size_t) -1)
				    || (mbclen == (size_t) -2) || (mbclen == 0)) {
					/* We treat it as a singlebyte character.  */
					mbclen = 1;
				}
				scan += mbclen;
			}
		} else {
			while (*scan != fschar)
				scan++;
		}
		(*set)(++nf, field, (long)(scan - field), n);
		/* stopped on the sentinel: that was the last field */
		if (scan == end)
			break;
		if (sep_arr != NULL)
			set_element(nf, scan, 1L, sep_arr);
		scan++;
		if (scan == end) {	/* FS at end of record */
			/*
			 * A trailing separator yields one more, empty, field.
			 * `field' still points at the previous field's start;
			 * the length passed is 0.
			 */
			(*set)(++nf, field, 0L, n);
			break;
		}
	}

	/* everything done, restore original char at *end */
	*end = sav;

	*buf = scan;
	return nf;
}
742
/*
 * calc_mbslen --- number of bytes taken up by the first `len' characters
 * of the multibyte text in [scan, end).
 *
 * Stops early at `end', so the result never exceeds end - scan.
 * `mbs' is the conversion state handed to mbrlen() and is updated by it.
 */

static size_t
calc_mbslen(char *scan, char *end, size_t len, mbstate_t *mbs)
{
	char *cursor;
	size_t chars_left;

	for (cursor = scan, chars_left = len;
	     chars_left > 0 && cursor < end;
	     chars_left--) {
		size_t avail = (size_t) (end - cursor);
		size_t width = mbrlen(cursor, avail, mbs);

		/*
		 * Anything that is not a usable length counts as one byte.
		 * This covers 0 as well as the error codes (size_t) -1 and
		 * (size_t) -2, both of which are larger than avail.
		 */
		if (width == 0 || width > avail)
			width = 1;
		cursor += width;
	}
	return cursor - scan;
}
767
768 /*
769 * fw_parse_field --- field parsing using FIELDWIDTHS spec
770 *
771 * This is called from get_field() via (*parse_field)().
772 * This variation is for fields are fixed widths.
773 */
774 static long
fw_parse_field(long up_to,char ** buf,int len,NODE * fs ATTRIBUTE_UNUSED,Regexp * rp ATTRIBUTE_UNUSED,Setfunc set,NODE * n,NODE * dummy ATTRIBUTE_UNUSED,bool in_middle ATTRIBUTE_UNUSED)775 fw_parse_field(long up_to, /* parse only up to this field number */
776 char **buf, /* on input: string to parse; on output: point to start next */
777 int len,
778 NODE *fs ATTRIBUTE_UNUSED,
779 Regexp *rp ATTRIBUTE_UNUSED,
780 Setfunc set, /* routine to set the value of the parsed field */
781 NODE *n,
782 NODE *dummy ATTRIBUTE_UNUSED, /* sep_arr not needed here: hence dummy */
783 bool in_middle ATTRIBUTE_UNUSED)
784 {
785 char *scan = *buf;
786 long nf = parse_high_water;
787 char *end = scan + len;
788 const awk_fieldwidth_info_t *fw;
789 mbstate_t mbs;
790 size_t skiplen;
791 size_t flen;
792
793 fw = (api_parser_override ? api_fw : FIELDWIDTHS);
794
795 if (up_to == UNLIMITED)
796 nf = 0;
797 if (len == 0)
798 return nf;
799 if (gawk_mb_cur_max > 1 && fw->use_chars) {
800 /*
801 * Reset the shift state. Arguably, the shift state should
802 * be part of the file state and carried forward at all times,
803 * but nobody has complained so far, so this may not matter
804 * in practice.
805 */
806 memset(&mbs, 0, sizeof(mbstate_t));
807 while (nf < up_to && scan < end) {
808 if (nf >= fw->nf) {
809 *buf = end;
810 return nf;
811 }
812 scan += calc_mbslen(scan, end, fw->fields[nf].skip, &mbs);
813 flen = calc_mbslen(scan, end, fw->fields[nf].len, &mbs);
814 (*set)(++nf, scan, (long) flen, n);
815 scan += flen;
816 }
817 } else {
818 while (nf < up_to && scan < end) {
819 if (nf >= fw->nf) {
820 *buf = end;
821 return nf;
822 }
823 skiplen = fw->fields[nf].skip;
824 if (skiplen > end - scan)
825 skiplen = end - scan;
826 scan += skiplen;
827 flen = fw->fields[nf].len;
828 if (flen > end - scan)
829 flen = end - scan;
830 (*set)(++nf, scan, (long) flen, n);
831 scan += flen;
832 }
833 }
834 *buf = scan;
835 return nf;
836 }
837
/*
 * invalidate_field0 --- $0 needs reconstruction
 *
 * get_field() hands this function back through its `assign' argument
 * when a field other than $0 is requested for assignment. It only
 * clears field0_valid; a later get_field(0, ...) then rebuilds $0.
 */

void
invalidate_field0()
{
	field0_valid = false;
}
845
/*
 * get_field --- return a particular $n
 *
 * Returns the address of the slot holding field `requested', parsing
 * as much more of the record as that requires. For a field beyond the
 * end of the record the result is &Null_field when only reading, or a
 * newly created slot (with NF raised to match) when assigning.
 */

/*
 * assign is not NULL if this field is on the LHS of an assign.
 * In that case *assign is set to a function: reset_record for $0,
 * invalidate_field0 for any other field.
 */

NODE **
get_field(long requested, Func_ptr *assign)
{
	bool in_middle = false;
	static bool warned = false;
	extern int currule;
	NODE *saved_fs;
	Regexp *fs_regexp;

	if (do_lint && currule == END && ! warned) {
		warned = true;
		lintwarn(_("accessing fields from an END rule may not be portable"));
	}

	/*
	 * if requesting whole line but some other field has been altered,
	 * then the whole line must be rebuilt
	 */
	if (requested == 0) {
		if (! field0_valid) {
			/* first, parse remainder of input record */
			if (NF == -1) {
				in_middle = (parse_high_water != 0);
				/* use the separator values saved for deferred parsing */
				if (current_field_sep() == Using_FPAT) {
					saved_fs = save_FPAT;
					fs_regexp = FPAT_regexp;
				} else {
					saved_fs = save_FS;
					fs_regexp = FS_regexp;
				}
				/* the length argument is what is left after parse_extent */
				NF = (*parse_field)(UNLIMITED - 1, &parse_extent,
		    			fields_arr[0]->stlen -
					(parse_extent - fields_arr[0]->stptr),
		    			saved_fs, fs_regexp, set_field,
					(NODE *) NULL,
					(NODE *) NULL,
					in_middle);
				parse_high_water = NF;
			}
			rebuild_record();
		}
		if (assign != NULL)
			*assign = reset_record;
		return &fields_arr[0];
	}

	/* assert(requested > 0); */

#if 0
	if (assign != NULL)
		field0_valid = false;		/* $0 needs reconstruction */
#else
	/*
	 * Keep things uniform. Also, mere intention of assigning something
	 * to $n should not make $0 invalid. Makes sense to invalidate $0
	 * after the actual assignment is performed. Not a real issue in
	 * the interpreter otherwise, but causes problem in the
	 * debugger when watching or printing fields.
	 */

	if (assign != NULL)
		*assign = invalidate_field0;	/* $0 needs reconstruction */
#endif

	if (requested <= parse_high_water)	/* already parsed this field */
		return &fields_arr[requested];

	if (NF == -1) {	/* have not yet parsed to end of record */
		/*
		 * parse up to requested fields, calling set_field() for each,
		 * saving in parse_extent the point where the parse left off
		 */
		if (parse_high_water == 0)	/* starting at the beginning */
			parse_extent = fields_arr[0]->stptr;
		else
			in_middle = true;
		/* a NULL regexp is passed here; re_parse_field() then falls back to FS_regexp */
		parse_high_water = (*parse_field)(requested, &parse_extent,
		     fields_arr[0]->stlen - (parse_extent - fields_arr[0]->stptr),
		     save_FS, NULL, set_field, (NODE *) NULL, (NODE *) NULL, in_middle);

		/*
		 * if we reached the end of the record, set NF to the number of
		 * fields so far.  Note that requested might actually refer to
		 * a field that is beyond the end of the record, but we won't
		 * set NF to that value at this point, since this is only a
		 * reference to the field and NF only gets set if the field
		 * is assigned to -- this case is handled below
		 */
		if (parse_extent == fields_arr[0]->stptr + fields_arr[0]->stlen)
			NF = parse_high_water;
		if (requested == UNLIMITED - 1)	/* UNLIMITED-1 means set NF */
			requested = parse_high_water;
	}
	if (parse_high_water < requested) { /* requested beyond end of record */
		if (assign != NULL) {	/* expand record */
			if (requested > nf_high_water)
				grow_fields_arr(requested);

			NF = requested;
			parse_high_water = requested;
		} else
			return &Null_field;	/* reading a nonexistent field yields the null field */
	}

	return &fields_arr[requested];
}
956
957 /* set_element --- set an array element, used by do_split() */
958
959 static void
set_element(long num,char * s,long len,NODE * n)960 set_element(long num, char *s, long len, NODE *n)
961 {
962 NODE *it;
963 NODE *sub;
964
965 it = make_string(s, len);
966 it->flags |= USER_INPUT;
967 sub = make_number((AWKNUM) (num));
968 assoc_set(n, sub, it);
969 }
970
/*
 * do_split --- implement split(), semantics are same as for field splitting
 *
 * nargs is the number of arguments on the stack. They are taken off in
 * reverse order: the optional separators array (only when nargs is 4),
 * the separator, the destination array, and finally the source string.
 * Returns a number node holding the value returned by the parser.
 */

NODE *
do_split(int nargs)
{
	NODE *src, *arr, *sep, *fs, *tmp, *sep_arr = NULL;
	char *s;
	long (*parseit)(long, char **, int, NODE *,
			 Regexp *, Setfunc, NODE *, NODE *, bool);
	Regexp *rp = NULL;

	if (nargs == 4) {
		static bool warned = false;

		if (do_traditional || do_posix) {
			fatal(_("split: fourth argument is a gawk extension"));
		}
		sep_arr = POP_PARAM();
		if (sep_arr->type != Node_var_array)
			fatal(_("split: fourth argument is not an array"));
		check_symtab_functab(sep_arr, "split",
				_("%s: cannot use %s as fourth argument"));
		if ((do_lint_extensions || do_lint_old) && ! warned) {
			warned = true;
			lintwarn(_("split: fourth argument is a gawk extension"));
		}
	}

	sep = POP();
	arr = POP_PARAM();
	if (arr->type != Node_var_array)
		fatal(_("split: second argument is not an array"));
	check_symtab_functab(arr, "split",
			_("%s: cannot use %s as second argument"));

	if (sep_arr != NULL) {
		if (sep_arr == arr)
			fatal(_("split: cannot use the same array for second and fourth args"));

		/* These checks need to be done before clearing any of the arrays */
		/* walk each array's chain of parents looking for the other array */
		for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
			if (tmp == arr)
				fatal(_("split: cannot use a subarray of second arg for fourth arg"));
		for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
			if (tmp == sep_arr)
				fatal(_("split: cannot use a subarray of fourth arg for second arg"));
		assoc_clear(sep_arr);
	}
	assoc_clear(arr);

	/* the source string stays on the stack until the parse is finished */
	src = TOP_STRING();
	if (src->stlen == 0) {
		/*
		 * Skip the work if first arg is the null string.
		 */
		tmp = POP_SCALAR();
		DEREF(tmp);
		return make_number((AWKNUM) 0);
	}

	if ((sep->flags & REGEX) != 0)
		sep = sep->typed_re;

	/*
	 * Choose the parser. When the separator is flagged FS_DFLT, FS is
	 * the active splitting mechanism and RS is not null, use the parser
	 * currently in effect for records; otherwise pick one according to
	 * the separator text.
	 */
	if (   (sep->re_flags & FS_DFLT) != 0
	    && current_field_sep() == Using_FS
	    && ! RS_is_null) {
		parseit = parse_field;
		fs = force_string(FS_node->var_value);
		rp = FS_regexp;
	} else {
		fs = sep->re_exp;

		if (fs->stlen == 0) {
			static bool warned = false;

			/* null separator: one element per character */
			parseit = null_parse_field;

			if (do_lint && ! warned) {
				warned = true;
				lintwarn(_("split: null string for third arg is a non-standard extension"));
			}
		} else if (fs->stlen == 1 && (sep->re_flags & CONSTANT) == 0) {
			/* one-character separator that is not flagged CONSTANT */
			if (fs->stptr[0] == ' ') {
				parseit = def_parse_field;
			} else
				parseit = sc_parse_field;
		} else {
			parseit = re_parse_field;
			rp = re_update(sep);
		}
	}

	/* UNLIMITED: split the whole string, numbering the pieces from 1 */
	s = src->stptr;
	tmp = make_number((AWKNUM) (*parseit)(UNLIMITED, &s, (int) src->stlen,
					     fs, rp, set_element, arr, sep_arr, false));

	src = POP_SCALAR();	/* really pop off stack */
	DEREF(src);
	return tmp;
}
1071
/*
 * do_patsplit --- implement patsplit(), semantics are same as for field
 *	splitting with FPAT.
 *
 * nargs is the number of arguments on the stack. They are taken off in
 * reverse order: the optional separators array (only when nargs is 4),
 * the pattern, the destination array, and finally the source string.
 * Returns a number node: 0 for an empty source string, otherwise the
 * value returned by fpat_parse_field().
 */

NODE *
do_patsplit(int nargs)
{
	NODE *src, *arr, *sep, *fpat, *tmp, *sep_arr = NULL;
	char *s;
	Regexp *rp = NULL;

	if (nargs == 4) {
		sep_arr = POP_PARAM();
		if (sep_arr->type != Node_var_array)
			fatal(_("patsplit: fourth argument is not an array"));
		check_symtab_functab(sep_arr, "patsplit",
				_("%s: cannot use %s as fourth argument"));
	}
	sep = POP();
	arr = POP_PARAM();
	if (arr->type != Node_var_array)
		fatal(_("patsplit: second argument is not an array"));
	check_symtab_functab(arr, "patsplit",
			_("%s: cannot use %s as second argument"));

	/* the source string stays on the stack until the parse is finished */
	src = TOP_STRING();

	if ((sep->flags & REGEX) != 0)
		sep = sep->typed_re;

	fpat = sep->re_exp;
	if (fpat->stlen == 0)
		fatal(_("patsplit: third argument must be non-null"));

	if (sep_arr != NULL) {
		if (sep_arr == arr)
			fatal(_("patsplit: cannot use the same array for second and fourth args"));

		/* These checks need to be done before clearing any of the arrays */
		/* walk each array's chain of parents looking for the other array */
		for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
			if (tmp == arr)
				fatal(_("patsplit: cannot use a subarray of second arg for fourth arg"));
		for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
			if (tmp == sep_arr)
				fatal(_("patsplit: cannot use a subarray of fourth arg for second arg"));
		assoc_clear(sep_arr);
	}
	assoc_clear(arr);

	if (src->stlen == 0) {
		/*
		 * Skip the work if first arg is the null string.
		 */
		tmp = make_number((AWKNUM) 0);
	} else {
		/* UNLIMITED: split the whole string, numbering the pieces from 1 */
		rp = re_update(sep);
		s = src->stptr;
		tmp = make_number((AWKNUM) fpat_parse_field(UNLIMITED, &s,
				(int) src->stlen, fpat, rp,
				set_element, arr, sep_arr, false));
	}

	src = POP_SCALAR();	/* really pop off stack */
	DEREF(src);
	return tmp;
}
1139
1140 /* set_parser --- update the current (non-API) parser */
1141
1142 static void
set_parser(parse_field_func_t func)1143 set_parser(parse_field_func_t func)
1144 {
1145 normal_parse_field = func;
1146 if (! api_parser_override && parse_field != func) {
1147 parse_field = func;
1148 update_PROCINFO_str("FS", current_field_sep_str());
1149 }
1150 }
1151
1152 /* set_FIELDWIDTHS --- handle an assignment to FIELDWIDTHS */
1153
1154 void
set_FIELDWIDTHS()1155 set_FIELDWIDTHS()
1156 {
1157 char *scan;
1158 char *end;
1159 int i;
1160 static int fw_alloc = 4;
1161 static bool warned = false;
1162 bool fatal_error = false;
1163 NODE *tmp;
1164
1165 if (do_lint_extensions && ! warned) {
1166 warned = true;
1167 lintwarn(_("`FIELDWIDTHS' is a gawk extension"));
1168 }
1169 if (do_traditional) /* quick and dirty, does the trick */
1170 return;
1171
1172 /*
1173 * If changing the way fields are split, obey least-surprise
1174 * semantics, and force $0 to be split totally.
1175 */
1176 if (fields_arr != NULL)
1177 (void) get_field(UNLIMITED - 1, 0);
1178
1179 set_parser(fw_parse_field);
1180 tmp = force_string(FIELDWIDTHS_node->var_value);
1181 scan = tmp->stptr;
1182
1183 if (FIELDWIDTHS == NULL) {
1184 emalloc(FIELDWIDTHS, awk_fieldwidth_info_t *, awk_fieldwidth_info_size(fw_alloc), "set_FIELDWIDTHS");
1185 FIELDWIDTHS->use_chars = awk_true;
1186 }
1187 FIELDWIDTHS->nf = 0;
1188 for (i = 0; ; i++) {
1189 unsigned long int tmp;
1190 if (i >= fw_alloc) {
1191 fw_alloc *= 2;
1192 erealloc(FIELDWIDTHS, awk_fieldwidth_info_t *, awk_fieldwidth_info_size(fw_alloc), "set_FIELDWIDTHS");
1193 }
1194 /* Ensure that there is no leading `-' sign. Otherwise,
1195 strtoul would accept it and return a bogus result. */
1196 while (is_blank(*scan)) {
1197 ++scan;
1198 }
1199 if (*scan == '-') {
1200 fatal_error = true;
1201 break;
1202 }
1203 if (*scan == '\0')
1204 break;
1205
1206 // Look for skip value. We allow N:M and N:*.
1207 /*
1208 * Detect an invalid base-10 integer, a valid value that
1209 * is followed by something other than a blank or '\0',
1210 * or a value that is not in the range [1..UINT_MAX].
1211 */
1212 errno = 0;
1213 tmp = strtoul(scan, &end, 10);
1214 if (errno == 0 && *end == ':' && (0 < tmp && tmp <= UINT_MAX)) {
1215 FIELDWIDTHS->fields[i].skip = tmp;
1216 scan = end + 1;
1217 if (*scan == '-' || is_blank(*scan)) {
1218 fatal_error = true;
1219 break;
1220 }
1221 // try scanning for field width
1222 tmp = strtoul(scan, &end, 10);
1223 }
1224 else
1225 FIELDWIDTHS->fields[i].skip = 0;
1226
1227 if (errno != 0
1228 || (*end != '\0' && ! is_blank(*end))
1229 || !(0 < tmp && tmp <= UINT_MAX)
1230 ) {
1231 if (*scan == '*') {
1232 for (scan++; is_blank(*scan); scan++)
1233 continue;
1234
1235 if (*scan != '\0')
1236 fatal(_("`*' must be the last designator in FIELDWIDTHS"));
1237
1238 FIELDWIDTHS->fields[i].len = UINT_MAX;
1239 FIELDWIDTHS->nf = i+1;
1240 }
1241 else
1242 fatal_error = true;
1243 break;
1244 }
1245 FIELDWIDTHS->fields[i].len = tmp;
1246 FIELDWIDTHS->nf = i+1;
1247 scan = end;
1248 /* Skip past any trailing blanks. */
1249 while (is_blank(*scan)) {
1250 ++scan;
1251 }
1252 if (*scan == '\0')
1253 break;
1254 }
1255
1256 if (fatal_error)
1257 fatal(_("invalid FIELDWIDTHS value, for field %d, near `%s'"),
1258 i + 1, scan);
1259 }
1260
1261 /* set_FS --- handle things when FS is assigned to */
1262
1263 void
set_FS()1264 set_FS()
1265 {
1266 char buf[10];
1267 NODE *fs;
1268 static NODE *save_fs = NULL;
1269 static NODE *save_rs = NULL;
1270 bool remake_re = true;
1271
1272 /*
1273 * If changing the way fields are split, obey least-surprise
1274 * semantics, and force $0 to be split totally.
1275 */
1276 if (fields_arr != NULL)
1277 (void) get_field(UNLIMITED - 1, 0);
1278
1279 /* It's possible that only IGNORECASE changed, or FS = FS */
1280 /*
1281 * This comparison can't use cmp_nodes(), which pays attention
1282 * to IGNORECASE, and that's not what we want.
1283 */
1284 if (save_fs
1285 && FS_node->var_value->stlen == save_fs->stlen
1286 && memcmp(FS_node->var_value->stptr, save_fs->stptr, save_fs->stlen) == 0
1287 && save_rs
1288 && RS_node->var_value->stlen == save_rs->stlen
1289 && memcmp(RS_node->var_value->stptr, save_rs->stptr, save_rs->stlen) == 0) {
1290 if (FS_regexp != NULL)
1291 FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
1292
1293 /* FS = FS */
1294 if (current_field_sep() == Using_FS) {
1295 return;
1296 } else {
1297 remake_re = false;
1298 goto choose_fs_function;
1299 }
1300 }
1301
1302 unref(save_fs);
1303 save_fs = dupnode(FS_node->var_value);
1304 unref(save_rs);
1305 save_rs = dupnode(RS_node->var_value);
1306 resave_fs = true;
1307
1308 /* If FS_re_no_case assignment is fatal (make_regexp in remake_re)
1309 * FS_regexp will be NULL with a non-null FS_re_yes_case.
1310 * refree() handles null argument; no need for `if (FS_regexp != NULL)' below.
1311 * Please do not remerge.
1312 */
1313 refree(FS_re_yes_case);
1314 refree(FS_re_no_case);
1315 FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
1316
1317
1318 choose_fs_function:
1319 buf[0] = '\0';
1320 default_FS = false;
1321 fs = force_string(FS_node->var_value);
1322
1323 if (! do_traditional && fs->stlen == 0) {
1324 static bool warned = false;
1325
1326 set_parser(null_parse_field);
1327
1328 if (do_lint_extensions && ! warned) {
1329 warned = true;
1330 lintwarn(_("null string for `FS' is a gawk extension"));
1331 }
1332 } else if (fs->stlen > 1 || (fs->flags & REGEX) != 0) {
1333 if (do_lint_old)
1334 lintwarn(_("old awk does not support regexps as value of `FS'"));
1335 set_parser(re_parse_field);
1336 } else if (RS_is_null) {
1337 /* we know that fs->stlen <= 1 */
1338 set_parser(sc_parse_field);
1339 if (fs->stlen == 1) {
1340 if (fs->stptr[0] == ' ') {
1341 default_FS = true;
1342 strcpy(buf, "[ \t\n]+");
1343 } else if (fs->stptr[0] == '\\') {
1344 /* yet another special case */
1345 strcpy(buf, "[\\\\\n]");
1346 } else if (fs->stptr[0] == '\0') {
1347 /* and yet another special case */
1348 strcpy(buf, "[\\000\n]");
1349 } else if (fs->stptr[0] != '\n') {
1350 sprintf(buf, "[%c\n]", fs->stptr[0]);
1351 }
1352 }
1353 } else {
1354 set_parser(def_parse_field);
1355
1356 if (fs->stlen == 1) {
1357 if (fs->stptr[0] == ' ')
1358 default_FS = true;
1359 else if (fs->stptr[0] == '\\')
1360 /* same special case */
1361 strcpy(buf, "[\\\\]");
1362 else
1363 set_parser(sc_parse_field);
1364 }
1365 }
1366 if (remake_re) {
1367 refree(FS_re_yes_case);
1368 refree(FS_re_no_case);
1369 FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
1370
1371 if (buf[0] != '\0') {
1372 FS_re_yes_case = make_regexp(buf, strlen(buf), false, true, true);
1373 FS_re_no_case = make_regexp(buf, strlen(buf), true, true, true);
1374 FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
1375 set_parser(re_parse_field);
1376 } else if (parse_field == re_parse_field) {
1377 FS_re_yes_case = make_regexp(fs->stptr, fs->stlen, false, true, true);
1378 FS_re_no_case = make_regexp(fs->stptr, fs->stlen, true, true, true);
1379 FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
1380 } else
1381 FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
1382 }
1383
1384 /*
1385 * For FS = "c", we don't use IGNORECASE. But we must use
1386 * re_parse_field to get the character and the newline as
1387 * field separators.
1388 */
1389 if (fs->stlen == 1 && parse_field == re_parse_field)
1390 FS_regexp = FS_re_yes_case;
1391 }
1392
1393 /* current_field_sep --- return the field separator type */
1394
1395 field_sep_type
current_field_sep()1396 current_field_sep()
1397 {
1398 if (api_parser_override)
1399 return Using_API;
1400 else if (parse_field == fw_parse_field)
1401 return Using_FIELDWIDTHS;
1402 else if (parse_field == fpat_parse_field)
1403 return Using_FPAT;
1404 else
1405 return Using_FS;
1406 }
1407
1408 /* current_field_sep_str --- return the field separator type as a string */
1409
1410 const char *
current_field_sep_str()1411 current_field_sep_str()
1412 {
1413 if (api_parser_override)
1414 return "API";
1415 else if (parse_field == fw_parse_field)
1416 return "FIELDWIDTHS";
1417 else if (parse_field == fpat_parse_field)
1418 return "FPAT";
1419 else
1420 return "FS";
1421 }
1422
1423 /* update_PROCINFO_str --- update PROCINFO[sub] with string value */
1424
1425 void
update_PROCINFO_str(const char * subscript,const char * str)1426 update_PROCINFO_str(const char *subscript, const char *str)
1427 {
1428 NODE *tmp;
1429
1430 if (PROCINFO_node == NULL)
1431 return;
1432 tmp = make_string(subscript, strlen(subscript));
1433 assoc_set(PROCINFO_node, tmp, make_string(str, strlen(str)));
1434 }
1435
1436 /* update_PROCINFO_num --- update PROCINFO[sub] with numeric value */
1437
1438 void
update_PROCINFO_num(const char * subscript,AWKNUM val)1439 update_PROCINFO_num(const char *subscript, AWKNUM val)
1440 {
1441 NODE *tmp;
1442
1443 if (PROCINFO_node == NULL)
1444 return;
1445 tmp = make_string(subscript, strlen(subscript));
1446 assoc_set(PROCINFO_node, tmp, make_number(val));
1447 }
1448
1449 /* set_FPAT --- handle an assignment to FPAT */
1450
1451 void
set_FPAT()1452 set_FPAT()
1453 {
1454 static bool warned = false;
1455 bool remake_re = true;
1456 NODE *fpat;
1457
1458 if (do_lint_extensions && ! warned) {
1459 warned = true;
1460 lintwarn(_("`FPAT' is a gawk extension"));
1461 }
1462 if (do_traditional) /* quick and dirty, does the trick */
1463 return;
1464
1465 /*
1466 * If changing the way fields are split, obey least-suprise
1467 * semantics, and force $0 to be split totally.
1468 */
1469 if (fields_arr != NULL)
1470 (void) get_field(UNLIMITED - 1, 0);
1471
1472 /* It's possible that only IGNORECASE changed, or FPAT = FPAT */
1473 /*
1474 * This comparison can't use cmp_nodes(), which pays attention
1475 * to IGNORECASE, and that's not what we want.
1476 */
1477 if (save_FPAT
1478 && FPAT_node->var_value->stlen == save_FPAT->stlen
1479 && memcmp(FPAT_node->var_value->stptr, save_FPAT->stptr, save_FPAT->stlen) == 0) {
1480 if (FPAT_regexp != NULL)
1481 FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case);
1482
1483 /* FPAT = FPAT */
1484 if (current_field_sep() == Using_FPAT) {
1485 return;
1486 } else {
1487 remake_re = false;
1488 goto set_fpat_function;
1489 }
1490 }
1491
1492 unref(save_FPAT);
1493 save_FPAT = dupnode(FPAT_node->var_value);
1494 refree(FPAT_re_yes_case);
1495 refree(FPAT_re_no_case);
1496 FPAT_re_yes_case = FPAT_re_no_case = FPAT_regexp = NULL;
1497
1498 set_fpat_function:
1499 fpat = force_string(FPAT_node->var_value);
1500 set_parser(fpat_parse_field);
1501
1502 if (remake_re) {
1503 refree(FPAT_re_yes_case);
1504 refree(FPAT_re_no_case);
1505 FPAT_re_yes_case = FPAT_re_no_case = FPAT_regexp = NULL;
1506
1507 FPAT_re_yes_case = make_regexp(fpat->stptr, fpat->stlen, false, true, true);
1508 FPAT_re_no_case = make_regexp(fpat->stptr, fpat->stlen, true, true, true);
1509 FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case);
1510 }
1511 }
1512
1513 /*
1514 * increment_scan --- macro to move scan pointer ahead by one character.
1515 * Implementation varies if doing MBS or not.
1516 */
1517
1518 #define increment_scan(scanp, len) incr_scan(scanp, len, & mbs)
1519
1520 /* incr_scan --- MBS version of increment_scan() */
1521
1522 static void
incr_scan(char ** scanp,size_t len,mbstate_t * mbs)1523 incr_scan(char **scanp, size_t len, mbstate_t *mbs)
1524 {
1525 size_t mbclen = 0;
1526
1527 if (gawk_mb_cur_max > 1) {
1528 mbclen = mbrlen(*scanp, len, mbs);
1529 if ( (mbclen == 1)
1530 || (mbclen == (size_t) -1)
1531 || (mbclen == (size_t) -2)
1532 || (mbclen == 0)) {
1533 /* We treat it as a singlebyte character. */
1534 mbclen = 1;
1535 }
1536 *scanp += mbclen;
1537 } else
1538 (*scanp)++;
1539 }
1540
1541 /*
1542 * fpat_parse_field --- parse fields using a regexp.
1543 *
1544 * This is called both from get_field() and from do_patsplit()
1545 * via (*parse_field)(). This variation is for when FPAT is a regular
1546 * expression -- use the value to find field contents.
1547 *
1548 * The FPAT parsing logic is a bit difficult to specify. In particular
1549 * to allow null fields at certain locations. To make the code as robust
1550 * as possible, an awk reference implementation was written and tested
1551 * as a first step, and later recoded in C, preserving its structure as
1552 * much as possible.
1553 *
1554 * # Reference implementation of the FPAT record parsing.
1555 * #
1556 * # Each loop iteration identifies a (separator[n-1],field[n]) pair.
1557 * # Each loop iteration must consume some characters, except for the first field.
1558 * # So a null field is only valid as a first field or after a non-null separator.
1559 * # A null record has no fields (not a single null field).
1560 *
1561 * function refpatsplit(string, fields, pattern, seps,
1562 * parse_start, sep_start, field_start, field_length, field_found, nf) # locals
1563 * {
1564 * # Local state variables:
1565 * # - parse_start: pointer to the first not yet consumed character
1566 * # - sep_start: pointer to the beginning of the parsed separator
1567 * # - field start: pointer to the beginning of the parsed field
1568 * # - field length: length of the parsed field
1569 * # - field_found: flag for succesful field match
1570 * # - nf: Number of fields found so far
1571 *
1572 * # Prepare for parsing
1573 * parse_start = 1 # first not yet parsed char
1574 * nf = 0 # fields found so far
1575 * delete fields
1576 * delete seps
1577 *
1578 * # Loop that consumes the whole record
1579 * while (parse_start <= length(string)) { # still something to parse
1580 *
1581 * # first attempt to match the next field
1582 * sep_start = parse_start
1583 * field_found = match(substr(string, parse_start), pattern)
1584 *
1585 * # check for an invalid null field and retry one character away
1586 * if (nf > 0 && field_found && RSTART == 1 && RLENGTH == 0) {
1587 * parse_start++
1588 * field_found = match(substr(string, parse_start), pattern)
1589 * }
1590 *
1591 * # store the (sep[n-1],field[n]) pair
1592 * if (field_found) {
1593 * field_start = parse_start + RSTART - 1
1594 * field_length = RLENGTH
1595 * seps[nf] = substr(string, sep_start, field_start-sep_start)
1596 * fields[++nf] = substr(string, field_start, field_length)
1597 * parse_start = field_start + field_length
1598 *
1599 * # store the final extra sep after the last field
1600 * } else {
1601 * seps[nf] = substr(string, sep_start)
1602 * parse_start = length(string) + 1
1603 * }
1604 * }
1605 *
1606 * return nf
1607 * }
1608 */
1609 static long
fpat_parse_field(long up_to,char ** buf,int len,NODE * fs ATTRIBUTE_UNUSED,Regexp * rp,Setfunc set,NODE * n,NODE * sep_arr,bool in_middle)1610 fpat_parse_field(long up_to, /* parse only up to this field number */
1611 char **buf, /* on input: string to parse; on output: point to start next */
1612 int len,
1613 NODE *fs ATTRIBUTE_UNUSED,
1614 Regexp *rp,
1615 Setfunc set, /* routine to set the value of the parsed field */
1616 NODE *n,
1617 NODE *sep_arr, /* array of field separators (may be NULL) */
1618 bool in_middle)
1619 {
1620 char *scan = *buf;
1621 long nf = parse_high_water;
1622 char *start;
1623 char *end = scan + len;
1624 int regex_flags = RE_NEED_START;
1625 mbstate_t mbs;
1626 char* field_start;
1627 bool field_found = false;
1628
1629 memset(&mbs, 0, sizeof(mbstate_t));
1630
1631 if (up_to == UNLIMITED)
1632 nf = 0;
1633
1634 if (len == 0)
1635 return nf;
1636
1637 if (rp == NULL) /* use FPAT */
1638 rp = FPAT_regexp;
1639
1640 while (scan < end && nf < up_to) { /* still something to parse */
1641
1642 /* first attempt to match the next field */
1643 start = scan;
1644 field_found = research(rp, scan, 0, (end - scan), regex_flags) != -1;
1645
1646 /* check for an invalid null field and retry one character away */
1647 if (nf > 0 && field_found && REEND(rp, scan) == 0) { /* invalid null field */
1648 increment_scan(& scan, end - scan);
1649 field_found = research(rp, scan, 0, (end - scan), regex_flags) != -1;
1650 }
1651
1652 /* store the (sep[n-1],field[n]) pair */
1653 if (field_found) {
1654 field_start = scan + RESTART(rp, scan);
1655 if (sep_arr != NULL) { /* store the separator */
1656 if (field_start == start) /* match at front */
1657 set_element(nf, start, 0L, sep_arr);
1658 else
1659 set_element(nf,
1660 start,
1661 (long) (field_start - start),
1662 sep_arr);
1663 }
1664 /* field is text that matched */
1665 (*set)(++nf,
1666 field_start,
1667 (long)(REEND(rp, scan) - RESTART(rp, scan)),
1668 n);
1669 scan += REEND(rp, scan);
1670
1671 } else {
1672 /*
1673 * No match, store the final extra separator after
1674 * the last field.
1675 */
1676 if (sep_arr != NULL)
1677 set_element(nf, start, (long) (end - start), sep_arr);
1678 scan = end;
1679 }
1680 }
1681
1682 /*
1683 * If the last field extends up to the end of the record, generate
1684 * a null trailing separator
1685 */
1686 if (sep_arr != NULL && scan == end && field_found)
1687 set_element(nf, scan, 0L, sep_arr);
1688
1689 *buf = scan;
1690 return nf;
1691 }
1692