1 /*
2  * field.c - routines for dealing with fields and record parsing
3  */
4 
5 /*
6  * Copyright (C) 1986, 1988, 1989, 1991-2021 the Free Software Foundation, Inc.
7  *
8  * This file is part of GAWK, the GNU implementation of the
9  * AWK Programming Language.
10  *
11  * GAWK is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 3 of the License, or
14  * (at your option) any later version.
15  *
16  * GAWK is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, write to the Free Software
23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
24  */
25 
26 #include "awk.h"
27 
/*
 * is_blank --- private replacement for isblank().
 *
 * Not every system supplies isblank(), so instead of probing for it
 * with autoconf we always use this local version.
 * See dfa.c and regex_internal.h and regcomp.c. Bleah.
 *
 * Returns 1 if c is a space or a horizontal tab, 0 for anything else.
 */
static int
is_blank(int c)
{
	switch (c) {
	case ' ':
	case '\t':
		return 1;
	default:
		return 0;
	}
}
38 
/*
 * Setfunc --- callback through which the parsing routines hand back each
 * piece of text they find.  set_field() and set_element() both have this
 * signature: (number, start of text, length of text, destination node).
 */
typedef void (* Setfunc)(long, char *, long, NODE *);

/* is the API currently overriding the default parsing mechanism? */
static bool api_parser_override = false;
/*
 * Signature shared by every *_parse_field routine in this file.  In the
 * definitions below the arguments are named, in order:
 * up_to, buf, len, fs, rp, set, n, sep_arr, in_middle.
 */
typedef long (*parse_field_func_t)(long, char **, int, NODE *,
			     Regexp *, Setfunc, NODE *, NODE *, bool);
/* the parsing routine currently in effect; invoked by get_field() */
static parse_field_func_t parse_field;
/*
 * N.B. The normal_parse_field function pointer contains the parse_field value
 * that should be used except when API field parsing is overriding the default
 * field parsing mechanism.
 */
static parse_field_func_t normal_parse_field;
/* split on a regular expression */
static long re_parse_field(long, char **, int, NODE *,
			     Regexp *, Setfunc, NODE *, NODE *, bool);
/* split on runs of space, tab and newline (FS is a single space) */
static long def_parse_field(long, char **, int, NODE *,
			      Regexp *, Setfunc, NODE *, NODE *, bool);
/* every character is its own field (FS is the null string) */
static long null_parse_field(long, char **, int, NODE *,
			     Regexp *, Setfunc, NODE *, NODE *, bool);
/* split on a single character other than space */
static long sc_parse_field(long, char **, int, NODE *,
			     Regexp *, Setfunc, NODE *, NODE *, bool);
/* split using fixed field widths */
static long fw_parse_field(long, char **, int, NODE *,
			     Regexp *, Setfunc, NODE *, NODE *, bool);
/*
 * Field width description most recently passed to set_record();
 * fw_parse_field() uses it while api_parser_override is true.
 */
static const awk_fieldwidth_info_t *api_fw = NULL;
/* FPAT-based splitting; defined outside the portion of the file shown here */
static long fpat_parse_field(long, char **, int, NODE *,
			     Regexp *, Setfunc, NODE *, NODE *, bool);
/* Setfunc that stores a piece of text as an array element */
static void set_element(long num, char * str, long len, NODE *arr);
/* enlarge fields_arr so that index num is usable */
static void grow_fields_arr(long num);
/* Setfunc that stores a piece of text as field number num */
static void set_field(long num, char *str, long len, NODE *dummy);
/* discard all parsed fields of the current record */
static void purge_record(void);
69 
static char *parse_extent;	/* marks where to restart parse of record */
static long parse_high_water = 0; /* field number that we have parsed so far */
static long nf_high_water = 0;	/* size of fields_arr */
/*
 * When true, purge_record() replaces save_FS with the current value of
 * FS and clears the flag again.  It is set outside the code shown here.
 */
static bool resave_fs;
static NODE *save_FS;		/* save current value of FS when line is read,
				 * to be used in deferred parsing
				 */
static NODE *save_FPAT;		/* save current value of FPAT when line is read,
				 * to be used in deferred parsing
				 */
/*
 * Field width description used by fw_parse_field() whenever the API is
 * not overriding the parser.
 */
static awk_fieldwidth_info_t *FIELDWIDTHS = NULL;

NODE **fields_arr;		/* array of pointers to the field nodes */
bool field0_valid;		/* $(>0) has not been changed yet */
int default_FS;			/* true when FS == " " */
/*
 * NOTE(review): judging by the names, the *_yes_case / *_no_case pairs
 * are the case-sensitive and case-insensitive compiled forms of FS and
 * FPAT --- confirm against the code that sets them.
 */
Regexp *FS_re_yes_case = NULL;
Regexp *FS_re_no_case = NULL;
/* regexp that re_parse_field() falls back on when it is passed rp == NULL */
Regexp *FS_regexp = NULL;
Regexp *FPAT_re_yes_case = NULL;
Regexp *FPAT_re_no_case = NULL;
/* regexp handed to the parser by get_field() when FPAT splitting is in use */
Regexp *FPAT_regexp = NULL;
/*
 * Template for an empty field: new field nodes are created by copying
 * *Null_field, and get_field() returns its address for a field that lies
 * beyond the end of the record.
 */
NODE *Null_field = NULL;

/* clear_mpfr --- turn off the MPFN, MPZN and NUMCUR flags of node n */
#define clear_mpfr(n) ((n)->flags &= ~(MPFN | MPZN | NUMCUR))
94 
95 /* init_fields --- set up the fields array to start with */
96 
97 void
init_fields()98 init_fields()
99 {
100 	emalloc(fields_arr, NODE **, sizeof(NODE *), "init_fields");
101 
102 	fields_arr[0] = make_string("", 0);
103 	fields_arr[0]->flags |= NULL_FIELD;
104 
105 	parse_extent = fields_arr[0]->stptr;
106 	save_FS = dupnode(FS_node->var_value);
107 
108 	Null_field = make_string("", 0);
109 	Null_field->flags = (STRCUR|STRING|NULL_FIELD); /* do not set MALLOC */
110 
111 	field0_valid = true;
112 }
113 
114 /* grow_fields --- acquire new fields as needed */
115 
116 static void
grow_fields_arr(long num)117 grow_fields_arr(long num)
118 {
119 	int t;
120 	NODE *n;
121 
122 	erealloc(fields_arr, NODE **, (num + 1) * sizeof(NODE *), "grow_fields_arr");
123 	for (t = nf_high_water + 1; t <= num; t++) {
124 		getnode(n);
125 		*n = *Null_field;
126 		fields_arr[t] = n;
127 	}
128 	nf_high_water = num;
129 }
130 
131 /* set_field --- set the value of a particular field */
132 
133 /*ARGSUSED*/
134 static void
set_field(long num,char * str,long len,NODE * dummy ATTRIBUTE_UNUSED)135 set_field(long num,
136 	char *str,
137 	long len,
138 	NODE *dummy ATTRIBUTE_UNUSED)	/* just to make interface same as set_element */
139 {
140 	NODE *n;
141 
142 	if (num > nf_high_water)
143 		grow_fields_arr(num);
144 	n = fields_arr[num];
145 	n->stptr = str;
146 	n->stlen = len;
147 	n->flags = (STRCUR|STRING|USER_INPUT);	/* do not set MALLOC */
148 }
149 
/*
 * rebuild_record --- Someone assigned a value to $(something).
 *			Fix up $0 to be right
 *
 * Builds a new $0 by joining fields 1 through NF with OFS, re-points
 * the fields that are not separately malloc'ed into the new buffer,
 * releases the old $0 and marks $0 as valid again.  NF must already
 * be known (not -1).
 */

void
rebuild_record()
{
	/*
	 * use explicit unsigned longs for lengths, in case
	 * a size_t isn't big enough.
	 */
	unsigned long tlen;
	NODE *tmp;
	char *ops;
	char *cops;
	long i;

	assert(NF != -1);

	/* pass 1: give every field a string value and add up the lengths */
	tlen = 0;
	for (i = NF; i > 0; i--) {
		tmp = fields_arr[i];
		tmp = force_string(tmp);
		tlen += tmp->stlen;
	}
	/* one OFS between each pair of adjacent fields */
	tlen += (NF - 1) * OFSlen;
	/* with NF == 0 the line above can make the total negative; clamp it */
	if ((long) tlen < 0)
		tlen = 0;
	emalloc(ops, char *, tlen + 1, "rebuild_record");
	cops = ops;
	ops[0] = '\0';
	/* pass 2: copy field text and separators into the new buffer */
	for (i = 1;  i <= NF; i++) {
		free_wstr(fields_arr[i]);
		tmp = fields_arr[i];
		/* copy field */
		if (tmp->stlen == 1)
			*cops++ = tmp->stptr[0];
		else if (tmp->stlen != 0) {
			memcpy(cops, tmp->stptr, tmp->stlen);
			cops += tmp->stlen;
		}
		/* copy OFS */
		if (i != NF) {
			if (OFSlen == 1)
				*cops++ = *OFS;
			else if (OFSlen != 0) {
				memcpy(cops, OFS, OFSlen);
				cops += OFSlen;
			}
		}
	}
	/* the new $0 takes over the buffer just built */
	tmp = make_str_node(ops, tlen, ALREADY_MALLOCED);

	/*
	 * Since we are about to unref fields_arr[0], we want to find
	 * any fields that still point into it, and have them point
	 * into the new field zero.  This has to be done intelligently,
	 * so that unrefing a field doesn't try to unref into the old $0.
	 */
	for (cops = ops, i = 1; i <= NF; i++) {
		NODE *r = fields_arr[i];
		/*
		 * There is no reason to copy malloc'ed fields to point into
		 * the new $0 buffer, although that's how previous versions did
		 * it. It seems faster to leave the malloc'ed fields in place.
		 */
		if (r->stlen > 0 && (r->flags & MALLOC) == 0) {
			NODE *n;
			getnode(n);

			/* n starts out as a copy of r and replaces it in fields_arr */
			*n = *r;
			if (r->valref > 1) {
				/*
				 * This can and does happen.  It seems clear that
				 * we can't leave r's stptr pointing into the
				 * old $0 buffer that we are about to unref.
				 */
				emalloc(r->stptr, char *, r->stlen + 1, "rebuild_record");
				memcpy(r->stptr, cops, r->stlen);
				r->stptr[r->stlen] = '\0';
				r->flags |= MALLOC;

				n->valref = 1;	// reset in the new field to start it off correctly!
			}

			/* cops is where this field's text sits in the new $0 */
			n->stptr = cops;
			clear_mpfr(n);
			unref(r);
			fields_arr[i] = n;
			assert((n->flags & WSTRCUR) == 0);
		}
		/* step over this field and the separator that follows it */
		cops += fields_arr[i]->stlen + OFSlen;
	}

	/* a $0 that is not malloc'ed must have exactly one reference */
	assert((fields_arr[0]->flags & MALLOC) == 0
		? fields_arr[0]->valref == 1
		: true);

	unref(fields_arr[0]);

	fields_arr[0] = tmp;
	field0_valid = true;
}
252 
253 /*
254  * set_record:
255  * setup $0, but defer parsing rest of line until reference is made to $(>0)
256  * or to NF.  At that point, parse only as much as necessary.
257  *
258  * Manage a private buffer for the contents of $0.  Doing so keeps us safe
259  * if `getline var' decides to rearrange the contents of the IOBUF that
260  * $0 might have been pointing into.  The cost is the copying of the buffer;
261  * but better correct than fast.
262  */
263 void
set_record(const char * buf,int cnt,const awk_fieldwidth_info_t * fw)264 set_record(const char *buf, int cnt, const awk_fieldwidth_info_t *fw)
265 {
266 	NODE *n;
267 	static char *databuf;
268 	static unsigned long databuf_size;
269 #define INITIAL_SIZE	512
270 #define MAX_SIZE	((unsigned long) ~0)	/* maximally portable ... */
271 
272 	purge_record();
273 
274 	/* buffer management: */
275 	if (databuf_size == 0) {	/* first time */
276 		ezalloc(databuf, char *, INITIAL_SIZE, "set_record");
277 		databuf_size = INITIAL_SIZE;
278 	}
279 	/*
280 	 * Make sure there's enough room. Since we sometimes need
281 	 * to place a sentinel at the end, we make sure
282 	 * databuf_size is > cnt after allocation.
283 	 */
284 	if (cnt >= databuf_size) {
285 		do {
286 			if (databuf_size > MAX_SIZE/2)
287 				fatal(_("input record too large"));
288 			databuf_size *= 2;
289 		} while (cnt >= databuf_size);
290 		erealloc(databuf, char *, databuf_size, "set_record");
291 		memset(databuf, '\0', databuf_size);
292 	}
293 	/* copy the data */
294 	if (cnt != 0) {
295 		memcpy(databuf, buf, cnt);
296 	}
297 
298 	/*
299 	 * Add terminating '\0' so that C library routines
300 	 * will know when to stop.
301 	 */
302 	databuf[cnt] = '\0';
303 
304 	/* manage field 0: */
305 	assert((fields_arr[0]->flags & MALLOC) == 0
306 		? fields_arr[0]->valref == 1
307 		: true);
308 
309 	unref(fields_arr[0]);
310 	getnode(n);
311 	n->stptr = databuf;
312 	n->stlen = cnt;
313 	n->valref = 1;
314 	n->type = Node_val;
315 	n->stfmt = STFMT_UNUSED;
316 #ifdef HAVE_MPFR
317 	n->strndmode = MPFR_round_mode;
318 #endif
319 	n->flags = (STRING|STRCUR|USER_INPUT);	/* do not set MALLOC */
320 	fields_arr[0] = n;
321 	if (fw != api_fw) {
322 		if ((api_fw = fw) != NULL) {
323 			if (! api_parser_override) {
324 				api_parser_override = true;
325 				parse_field = fw_parse_field;
326 				update_PROCINFO_str("FS", "API");
327 			}
328 		} else if (api_parser_override) {
329 			api_parser_override = false;
330 			parse_field = normal_parse_field;
331 			update_PROCINFO_str("FS", current_field_sep_str());
332 		}
333 	}
334 
335 #undef INITIAL_SIZE
336 #undef MAX_SIZE
337 }
338 
339 /* reset_record --- start over again with current $0 */
340 
341 void
reset_record()342 reset_record()
343 {
344 	fields_arr[0] = force_string(fields_arr[0]);
345 	purge_record();
346 	if (api_parser_override) {
347 		api_parser_override = false;
348 		parse_field = normal_parse_field;
349 		update_PROCINFO_str("FS", current_field_sep_str());
350 	}
351 }
352 
353 /*
354  * purge_record --- throw away the fields, make sure that
355  * 	individual nodes remain valid.
356  */
357 
358 static void
purge_record()359 purge_record()
360 {
361 	int i;
362 
363 	NF = -1;
364 	for (i = 1; i <= parse_high_water; i++) {
365 		NODE *n;
366 		NODE *r = fields_arr[i];
367 		if ((r->flags & MALLOC) == 0 && r->valref > 1) {
368 			/* This can and does happen. We must copy the string! */
369 			const char *save = r->stptr;
370 			emalloc(r->stptr, char *, r->stlen + 1, "purge_record");
371 			memcpy(r->stptr, save, r->stlen);
372 			r->stptr[r->stlen] = '\0';
373 			r->flags |= MALLOC;
374 		}
375 		unref(r);
376 		getnode(n);
377 		*n = *Null_field;
378 		fields_arr[i] = n;
379 	}
380 
381 	parse_high_water = 0;
382 	/*
383 	 * $0 = $0 should resplit using the current value of FS.
384 	 */
385 	if (resave_fs) {
386 		resave_fs = false;
387 		unref(save_FS);
388 		save_FS = dupnode(FS_node->var_value);
389 	}
390 
391 	field0_valid = true;
392 }
393 
394 /* set_NF --- handle what happens to $0 and fields when NF is changed */
395 
396 void
set_NF()397 set_NF()
398 {
399 	int i;
400 	long nf;
401 	NODE *n;
402 
403 	assert(NF != -1);
404 
405 	(void) force_number(NF_node->var_value);
406 	nf = get_number_si(NF_node->var_value);
407 	if (nf < 0)
408 		fatal(_("NF set to negative value"));
409 
410 	static bool warned = false;
411 	if (do_lint && NF > nf && ! warned) {
412 		warned = true;
413 		lintwarn(_("decrementing NF is not portable to many awk versions"));
414 	}
415 
416 	NF = nf;
417 
418 	if (NF > nf_high_water)
419 		grow_fields_arr(NF);
420 	if (parse_high_water < NF) {
421 		for (i = parse_high_water + 1; i >= 0 && i <= NF; i++) {
422 			unref(fields_arr[i]);
423 			getnode(n);
424 			*n = *Null_field;
425 			fields_arr[i] = n;
426 		}
427 		parse_high_water = NF;
428 	} else if (parse_high_water > 0) {
429 		for (i = NF + 1; i >= 0 && i <= parse_high_water; i++) {
430 			unref(fields_arr[i]);
431 			getnode(n);
432 			*n = *Null_field;
433 			fields_arr[i] = n;
434 		}
435 		parse_high_water = NF;
436 	}
437 	field0_valid = false;
438 }
439 
/*
 * re_parse_field --- parse fields using a regexp.
 *
 * This is called both from get_field() and from do_split()
 * via (*parse_field)().  This variation is for when FS is a regular
 * expression -- either user-defined or because RS=="" and FS==" "
 *
 * Returns the number of the last field that was set.  On return *buf
 * points to where parsing stopped.
 */
static long
re_parse_field(long up_to,	/* parse only up to this field number */
	char **buf,	/* on input: string to parse; on output: point to start next */
	int len,
	NODE *fs ATTRIBUTE_UNUSED,
	Regexp *rp,
	Setfunc set,	/* routine to set the value of the parsed field */
	NODE *n,
	NODE *sep_arr,  /* array of field separators (maybe NULL) */
	bool in_middle)
{
	char *scan = *buf;
	long nf = parse_high_water;
	char *field;
	char *end = scan + len;
	int regex_flags = RE_NEED_START;
	char *sep;
	size_t mbclen = 0;
	mbstate_t mbs;

	memset(&mbs, 0, sizeof(mbstate_t));

	/* when resuming part way through a record, scan is not at its start */
	if (in_middle)
		regex_flags |= RE_NO_BOL;

	/* UNLIMITED means a fresh split, so field numbering starts from zero */
	if (up_to == UNLIMITED)
		nf = 0;
	if (len == 0)
		return nf;

	bool default_field_splitting = (RS_is_null && default_FS);

	if (default_field_splitting) {
		/* skip leading white space, saving it as separator number nf */
		sep = scan;
		while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
			scan++;
		if (sep_arr != NULL && sep < scan)
			set_element(nf, sep, (long)(scan - sep), sep_arr);
	}

	if (rp == NULL) /* use FS */
		rp = FS_regexp;

	/*
	 * field is where the current field begins; scan is where the next
	 * search starts.  They differ only after a null match.
	 */
	field = scan;
	while (scan < end
	       && research(rp, scan, 0, (end - scan), regex_flags) != -1
	       && nf < up_to) {
		regex_flags |= RE_NO_BOL;
		if (REEND(rp, scan) == RESTART(rp, scan)) {   /* null match */
			/* move ahead one character and search again */
			if (gawk_mb_cur_max > 1)	{
				mbclen = mbrlen(scan, end-scan, &mbs);
				if ((mbclen == 1) || (mbclen == (size_t) -1)
					|| (mbclen == (size_t) -2) || (mbclen == 0)) {
					/* We treat it as a singlebyte character.  */
					mbclen = 1;
				}
				scan += mbclen;
			} else
				scan++;
			if (scan == end) {
				/* ran off the end: what is left is the last field */
				(*set)(++nf, field, (long)(scan - field), n);
				up_to = nf;
				break;
			}
			continue;
		}
		/* the field runs from field up to the start of the match */
		(*set)(++nf, field,
		       (long)(scan + RESTART(rp, scan) - field), n);
		/* the matched text is the separator that follows field nf */
		if (sep_arr != NULL)
	    		set_element(nf, scan + RESTART(rp, scan),
           			(long) (REEND(rp, scan) - RESTART(rp, scan)), sep_arr);
		scan += REEND(rp, scan);
		field = scan;
		if (scan == end && ! default_field_splitting)	/* FS at end of record */
			(*set)(++nf, field, 0L, n);
	}
	/* no more separators: whatever remains is one final field */
	if (nf != up_to && scan < end) {
		(*set)(++nf, scan, (long)(end - scan), n);
		scan = end;
	}
	*buf = scan;
	return nf;
}
530 
/*
 * def_parse_field --- default field parsing.
 *
 * This is called both from get_field() and from do_split()
 * via (*parse_field)().  This variation is for when FS is a single space
 * character.
 *
 * Fields are separated by runs of space, tab and newline; such
 * characters at the start of the text are skipped.  Returns the number
 * of the last field that was set.  On return *buf points to where
 * parsing stopped.
 *
 * The byte at (*buf)[len] is overwritten for the duration of the scan
 * and put back before returning, so it must be writable.
 */

static long
def_parse_field(long up_to,	/* parse only up to this field number */
	char **buf,	/* on input: string to parse; on output: point to start next */
	int len,
	NODE *fs,
	Regexp *rp ATTRIBUTE_UNUSED,
	Setfunc set,	/* routine to set the value of the parsed field */
	NODE *n,
	NODE *sep_arr,  /* array of field separators (maybe NULL) */
	bool in_middle ATTRIBUTE_UNUSED)
{
	char *scan = *buf;
	long nf = parse_high_water;
	char *field;
	char *end = scan + len;
	char sav;
	char *sep;

	/* UNLIMITED means a fresh split, so field numbering starts from zero */
	if (up_to == UNLIMITED)
		nf = 0;
	if (len == 0)
		return nf;

	/*
	 * Nasty special case. If FS set to "", return whole record
	 * as first field. This is not worth a separate function.
	 */
	if (fs->stlen == 0) {
		(*set)(++nf, *buf, len, n);
		*buf += len;
		return nf;
	}

	/* before doing anything save the char at *end */
	sav = *end;
	/* because it will be destroyed now: */

	/*
	 * With a blank stored at *end, the inner loop that looks for the
	 * end of a field stops there without needing its own bounds test.
	 */
	*end = ' ';	/* sentinel character */
	sep = scan;
	/* the scan++ here steps over the one character that ended a field */
	for (; nf < up_to; scan++) {
		/*
		 * special case:  fs is single space, strip leading whitespace
		 */
		while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
			scan++;

		/* everything from sep up to here is the separator after field nf */
		if (sep_arr != NULL && scan > sep)
			set_element(nf, sep, (long) (scan - sep), sep_arr);

		if (scan >= end)
			break;

		field = scan;

		/* find the end of the field; the sentinel guarantees a stop */
		while (*scan != ' ' && *scan != '\t' && *scan != '\n')
			scan++;

		(*set)(++nf, field, (long)(scan - field), n);

		if (scan == end)
			break;

		sep = scan;
	}

	/* everything done, restore original char at *end */
	*end = sav;

	*buf = scan;
	return nf;
}
610 
611 /*
612  * null_parse_field --- each character is a separate field
613  *
614  * This is called both from get_field() and from do_split()
615  * via (*parse_field)().  This variation is for when FS is the null string.
616  */
617 static long
null_parse_field(long up_to,char ** buf,int len,NODE * fs ATTRIBUTE_UNUSED,Regexp * rp ATTRIBUTE_UNUSED,Setfunc set,NODE * n,NODE * sep_arr,bool in_middle ATTRIBUTE_UNUSED)618 null_parse_field(long up_to,	/* parse only up to this field number */
619 	char **buf,	/* on input: string to parse; on output: point to start next */
620 	int len,
621 	NODE *fs ATTRIBUTE_UNUSED,
622 	Regexp *rp ATTRIBUTE_UNUSED,
623 	Setfunc set,	/* routine to set the value of the parsed field */
624 	NODE *n,
625 	NODE *sep_arr,  /* array of field separators (maybe NULL) */
626 	bool in_middle ATTRIBUTE_UNUSED)
627 {
628 	char *scan = *buf;
629 	long nf = parse_high_water;
630 	char *end = scan + len;
631 
632 	if (up_to == UNLIMITED)
633 		nf = 0;
634 	if (len == 0)
635 		return nf;
636 
637 	if (gawk_mb_cur_max > 1) {
638 		mbstate_t mbs;
639 		memset(&mbs, 0, sizeof(mbstate_t));
640 		for (; nf < up_to && scan < end;) {
641 			size_t mbclen = mbrlen(scan, end-scan, &mbs);
642 			if ((mbclen == 1) || (mbclen == (size_t) -1)
643 				|| (mbclen == (size_t) -2) || (mbclen == 0)) {
644 				/* We treat it as a singlebyte character.  */
645 				mbclen = 1;
646 			}
647 			if (sep_arr != NULL && nf > 0)
648 				set_element(nf, scan, 0L, sep_arr);
649 			(*set)(++nf, scan, mbclen, n);
650 			scan += mbclen;
651 		}
652 	} else {
653 		for (; nf < up_to && scan < end; scan++) {
654 			if (sep_arr != NULL && nf > 0)
655 				set_element(nf, scan, 0L, sep_arr);
656 			(*set)(++nf, scan, 1L, n);
657 		}
658 	}
659 
660 	*buf = scan;
661 	return nf;
662 }
663 
/*
 * sc_parse_field --- single character field separator
 *
 * This is called both from get_field() and from do_split()
 * via (*parse_field)().  This variation is for when FS is a single character
 * other than space.
 *
 * Returns the number of the last field that was set.  On return *buf
 * points to where parsing stopped.
 *
 * The byte at (*buf)[len] is overwritten for the duration of the scan
 * and put back before returning, so it must be writable.
 */
static long
sc_parse_field(long up_to,	/* parse only up to this field number */
	char **buf,	/* on input: string to parse; on output: point to start next */
	int len,
	NODE *fs,
	Regexp *rp ATTRIBUTE_UNUSED,
	Setfunc set,	/* routine to set the value of the parsed field */
	NODE *n,
	NODE *sep_arr,  /* array of field separators (maybe NULL) */
	bool in_middle ATTRIBUTE_UNUSED)
{
	char *scan = *buf;
	char fschar;
	long nf = parse_high_water;
	char *field;
	char *end = scan + len;
	char sav;
	size_t mbclen = 0;
	mbstate_t mbs;

	memset(&mbs, 0, sizeof(mbstate_t));

	/* UNLIMITED means a fresh split, so field numbering starts from zero */
	if (up_to == UNLIMITED)
		nf = 0;
	if (len == 0)
		return nf;

	/* with RS null and an empty fs, newline is the separator */
	if (RS_is_null && fs->stlen == 0)
		fschar = '\n';
	else
		fschar = fs->stptr[0];

	/* before doing anything save the char at *end */
	sav = *end;
	/* because it will be destroyed now: */
	/*
	 * With the separator stored at *end, the loops below that look for
	 * the next separator stop there without needing a bounds test.
	 */
	*end = fschar;	/* sentinel character */

	for (; nf < up_to;) {
		field = scan;
		if (gawk_mb_cur_max > 1) {
			/*
			 * Step a whole character at a time, so that a byte
			 * inside a multibyte character is not compared
			 * against the separator.
			 */
			while (*scan != fschar) {
				mbclen = mbrlen(scan, end-scan, &mbs);
				if ((mbclen == 1) || (mbclen == (size_t) -1)
					|| (mbclen == (size_t) -2) || (mbclen == 0)) {
					/* We treat it as a singlebyte character.  */
					mbclen = 1;
				}
				scan += mbclen;
			}
		} else {
			while (*scan != fschar)
				scan++;
		}
		(*set)(++nf, field, (long)(scan - field), n);
		/* stopped by the sentinel: that was the last field */
		if (scan == end)
			break;
		/* the separator after field nf is the single character at scan */
		if (sep_arr != NULL)
			set_element(nf, scan, 1L, sep_arr);
		scan++;
		if (scan == end) {	/* FS at end of record */
			/* add an empty trailing field (length 0, so field is only a placeholder) */
			(*set)(++nf, field, 0L, n);
			break;
		}
	}

	/* everything done, restore original char at *end */
	*end = sav;

	*buf = scan;
	return nf;
}
742 
/*
 * calc_mbslen --- how many bytes do the first len characters occupy?
 *
 * Walks forward from scan, one multibyte character at a time, for at
 * most len characters and never past end.  mbs is the conversion state
 * handed to mbrlen().  Returns the number of bytes covered.
 */

static size_t
calc_mbslen(char *scan, char *end, size_t len, mbstate_t *mbs)
{
	char *p;
	size_t left;

	for (p = scan, left = len; left > 0 && p < end; left--) {
		size_t avail = (size_t) (end - p);
		size_t step = mbrlen(p, avail, mbs);

		/*
		 * Anything that is not a usable length counts as a single
		 * byte.  This covers 0 as well as the error returns
		 * (size_t) -1 and (size_t) -2, both of which exceed avail.
		 */
		if (step == 0 || step > avail)
			step = 1;
		p += step;
	}
	return p - scan;
}
767 
768 /*
769  * fw_parse_field --- field parsing using FIELDWIDTHS spec
770  *
771  * This is called from get_field() via (*parse_field)().
772  * This variation is for fields are fixed widths.
773  */
774 static long
fw_parse_field(long up_to,char ** buf,int len,NODE * fs ATTRIBUTE_UNUSED,Regexp * rp ATTRIBUTE_UNUSED,Setfunc set,NODE * n,NODE * dummy ATTRIBUTE_UNUSED,bool in_middle ATTRIBUTE_UNUSED)775 fw_parse_field(long up_to,	/* parse only up to this field number */
776 	char **buf,	/* on input: string to parse; on output: point to start next */
777 	int len,
778 	NODE *fs ATTRIBUTE_UNUSED,
779 	Regexp *rp ATTRIBUTE_UNUSED,
780 	Setfunc set,	/* routine to set the value of the parsed field */
781 	NODE *n,
782 	NODE *dummy ATTRIBUTE_UNUSED, /* sep_arr not needed here: hence dummy */
783 	bool in_middle ATTRIBUTE_UNUSED)
784 {
785 	char *scan = *buf;
786 	long nf = parse_high_water;
787 	char *end = scan + len;
788 	const awk_fieldwidth_info_t *fw;
789 	mbstate_t mbs;
790 	size_t skiplen;
791 	size_t flen;
792 
793 	fw = (api_parser_override ? api_fw : FIELDWIDTHS);
794 
795 	if (up_to == UNLIMITED)
796 		nf = 0;
797 	if (len == 0)
798 		return nf;
799 	if (gawk_mb_cur_max > 1 && fw->use_chars) {
800 		/*
801 		 * Reset the shift state. Arguably, the shift state should
802 		 * be part of the file state and carried forward at all times,
803 		 * but nobody has complained so far, so this may not matter
804 		 * in practice.
805 		 */
806 		memset(&mbs, 0, sizeof(mbstate_t));
807 		while (nf < up_to && scan < end) {
808 			if (nf >= fw->nf) {
809 				*buf = end;
810 				return nf;
811 			}
812 			scan += calc_mbslen(scan, end, fw->fields[nf].skip, &mbs);
813 			flen = calc_mbslen(scan, end, fw->fields[nf].len, &mbs);
814 			(*set)(++nf, scan, (long) flen, n);
815 			scan += flen;
816 		}
817 	} else {
818 		while (nf < up_to && scan < end) {
819 			if (nf >= fw->nf) {
820 				*buf = end;
821 				return nf;
822 			}
823 			skiplen = fw->fields[nf].skip;
824 			if (skiplen > end - scan)
825 				skiplen = end - scan;
826 			scan += skiplen;
827 			flen = fw->fields[nf].len;
828 			if (flen > end - scan)
829 				flen = end - scan;
830 			(*set)(++nf, scan, (long) flen, n);
831 			scan += flen;
832 		}
833 	}
834 	*buf = scan;
835 	return nf;
836 }
837 
/*
 * invalidate_field0 --- $0 needs reconstruction
 *
 * Clears field0_valid, so that the next get_field(0, ...) calls
 * rebuild_record().  get_field() hands this function back through its
 * assign argument when a field other than $0 is about to be assigned to.
 */

void
invalidate_field0()
{
	field0_valid = false;
}
845 
/*
 * get_field --- return a particular $n
 *
 * Returns the address of the slot holding field number requested,
 * parsing more of the record first if that field has not been reached
 * yet.  A reference to a field beyond the end of the record yields
 * &Null_field, unless the field is being assigned to, in which case
 * the record is extended.
 */

/*
 * assign is not NULL if this field is on the LHS of an assign.
 * In that case *assign is set to the routine recorded for the caller
 * to run after the assignment: reset_record for $0, invalidate_field0
 * for any other field.
 */

NODE **
get_field(long requested, Func_ptr *assign)
{
	bool in_middle = false;
	static bool warned = false;
	extern int currule;
	NODE *saved_fs;
	Regexp *fs_regexp;

	if (do_lint && currule == END && ! warned) {
		warned = true;
		lintwarn(_("accessing fields from an END rule may not be portable"));
	}

	/*
	 * if requesting whole line but some other field has been altered,
	 * then the whole line must be rebuilt
	 */
	if (requested == 0) {
		if (! field0_valid) {
			/* first, parse remainder of input record */
			if (NF == -1) {
				in_middle = (parse_high_water != 0);
				/* use the separator that was saved when the line was read */
				if (current_field_sep() == Using_FPAT) {
					saved_fs = save_FPAT;
					fs_regexp = FPAT_regexp;
				} else {
					saved_fs = save_FS;
					fs_regexp = FS_regexp;
				}
				/* the length argument is what is left after parse_extent */
				NF = (*parse_field)(UNLIMITED - 1, &parse_extent,
		    			fields_arr[0]->stlen -
					(parse_extent - fields_arr[0]->stptr),
					saved_fs, fs_regexp, set_field,
					(NODE *) NULL,
					(NODE *) NULL,
					in_middle);
				parse_high_water = NF;
			}
			rebuild_record();
		}
		if (assign != NULL)
			*assign = reset_record;
		return &fields_arr[0];
	}

	/* assert(requested > 0); */

#if 0
	if (assign != NULL)
		field0_valid = false;		/* $0 needs reconstruction */
#else
	/*
	 * Keep things uniform. Also, mere intention of assigning something
	 * to $n should not make $0 invalid. Makes sense to invalidate $0
	 * after the actual assignment is performed. Not a real issue in
	 * the interpreter otherwise, but causes problem in the
	 * debugger when watching or printing fields.
	 */

	if (assign != NULL)
		*assign = invalidate_field0;	/* $0 needs reconstruction */
#endif

	if (requested <= parse_high_water)	/* already parsed this field */
		return &fields_arr[requested];

	if (NF == -1) {	/* have not yet parsed to end of record */
		/*
		 * parse up to requested fields, calling set_field() for each,
		 * saving in parse_extent the point where the parse left off
		 */
		if (parse_high_water == 0)	/* starting at the beginning */
			parse_extent = fields_arr[0]->stptr;
		else
			in_middle = true;
		parse_high_water = (*parse_field)(requested, &parse_extent,
		     fields_arr[0]->stlen - (parse_extent - fields_arr[0]->stptr),
		     save_FS, NULL, set_field, (NODE *) NULL, (NODE *) NULL, in_middle);

		/*
		 * if we reached the end of the record, set NF to the number of
		 * fields so far.  Note that requested might actually refer to
		 * a field that is beyond the end of the record, but we won't
		 * set NF to that value at this point, since this is only a
		 * reference to the field and NF only gets set if the field
		 * is assigned to -- this case is handled below
		 */
		if (parse_extent == fields_arr[0]->stptr + fields_arr[0]->stlen)
			NF = parse_high_water;
		if (requested == UNLIMITED - 1)	/* UNLIMITED-1 means set NF */
			requested = parse_high_water;
	}
	if (parse_high_water < requested) { /* requested beyond end of record */
		if (assign != NULL) {	/* expand record */
			if (requested > nf_high_water)
				grow_fields_arr(requested);

			/* the record now has this many fields */
			NF = requested;
			parse_high_water = requested;
		} else
			return &Null_field;
	}

	return &fields_arr[requested];
}
956 
957 /* set_element --- set an array element, used by do_split() */
958 
959 static void
set_element(long num,char * s,long len,NODE * n)960 set_element(long num, char *s, long len, NODE *n)
961 {
962 	NODE *it;
963 	NODE *sub;
964 
965 	it = make_string(s, len);
966 	it->flags |= USER_INPUT;
967 	sub = make_number((AWKNUM) (num));
968 	assoc_set(n, sub, it);
969 }
970 
/*
 * do_split --- implement split(), semantics are same as for field splitting
 *
 * nargs is the number of arguments in the call; 4 means that the
 * optional array of separators was supplied.  The arguments are taken
 * off the stack last one first.  Both arrays are cleared before the
 * split.  Returns a number node holding the count reported by the
 * parsing routine, or 0 when the string to split is empty.
 */

NODE *
do_split(int nargs)
{
	NODE *src, *arr, *sep, *fs, *tmp, *sep_arr = NULL;
	char *s;
	long (*parseit)(long, char **, int, NODE *,
			 Regexp *, Setfunc, NODE *, NODE *, bool);
	Regexp *rp = NULL;

	/* fourth argument: the array that receives the separators */
	if (nargs == 4) {
		static bool warned = false;

		if (do_traditional || do_posix) {
			fatal(_("split: fourth argument is a gawk extension"));
		}
		sep_arr = POP_PARAM();
		if (sep_arr->type != Node_var_array)
			fatal(_("split: fourth argument is not an array"));
		check_symtab_functab(sep_arr, "split",
				_("%s: cannot use %s as fourth argument"));
		if ((do_lint_extensions || do_lint_old) && ! warned) {
			warned = true;
			lintwarn(_("split: fourth argument is a gawk extension"));
		}
	}

	/* third argument (the separator), then second (the target array) */
	sep = POP();
	arr = POP_PARAM();
	if (arr->type != Node_var_array)
		fatal(_("split: second argument is not an array"));
	check_symtab_functab(arr, "split",
			_("%s: cannot use %s as second argument"));

	if (sep_arr != NULL) {
		if (sep_arr == arr)
			fatal(_("split: cannot use the same array for second and fourth args"));

		/* This checks need to be done before clearing any of the arrays */
		for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
			if (tmp == arr)
				fatal(_("split: cannot use a subarray of second arg for fourth arg"));
		for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
			if (tmp == sep_arr)
				fatal(_("split: cannot use a subarray of fourth arg for second arg"));
		assoc_clear(sep_arr);
	}
	assoc_clear(arr);

	/* first argument: left on the stack until the split is finished */
	src = TOP_STRING();
	if (src->stlen == 0) {
		/*
		 * Skip the work if first arg is the null string.
		 */
		tmp = POP_SCALAR();
		DEREF(tmp);
		return make_number((AWKNUM) 0);
	}

	if ((sep->flags & REGEX) != 0)
		sep = sep->typed_re;

	/* choose the parsing routine that suits the separator */
	if (   (sep->re_flags & FS_DFLT) != 0
	    && current_field_sep() == Using_FS
	    && ! RS_is_null) {
		/* split the same way records are currently being split */
		parseit = parse_field;
		fs = force_string(FS_node->var_value);
		rp = FS_regexp;
	} else {
		fs = sep->re_exp;

		if (fs->stlen == 0) {
			static bool warned = false;

			/* null string: each character becomes an element */
			parseit = null_parse_field;

			if (do_lint && ! warned) {
				warned = true;
				lintwarn(_("split: null string for third arg is a non-standard extension"));
			}
		} else if (fs->stlen == 1 && (sep->re_flags & CONSTANT) == 0) {
			/* one character, and CONSTANT is not set in re_flags */
			if (fs->stptr[0] == ' ') {
				parseit = def_parse_field;
			} else
				parseit = sc_parse_field;
		} else {
			/* anything else is treated as a regular expression */
			parseit = re_parse_field;
			rp = re_update(sep);
		}
	}

	/* UNLIMITED: split the entire string, numbering elements from 1 */
	s = src->stptr;
	tmp = make_number((AWKNUM) (*parseit)(UNLIMITED, &s, (int) src->stlen,
					     fs, rp, set_element, arr, sep_arr, false));

	src = POP_SCALAR();	/* really pop off stack */
	DEREF(src);
	return tmp;
}
1071 
1072 /*
1073  * do_patsplit --- implement patsplit(), semantics are same as for field
1074  *		   splitting with FPAT.
1075  */
1076 
1077 NODE *
do_patsplit(int nargs)1078 do_patsplit(int nargs)
1079 {
1080 	NODE *src, *arr, *sep, *fpat, *tmp, *sep_arr = NULL;
1081 	char *s;
1082 	Regexp *rp = NULL;
1083 
1084 	if (nargs == 4) {
1085 		sep_arr = POP_PARAM();
1086 		if (sep_arr->type != Node_var_array)
1087 			fatal(_("patsplit: fourth argument is not an array"));
1088 		check_symtab_functab(sep_arr, "patsplit",
1089 				_("%s: cannot use %s as fourth argument"));
1090 	}
1091 	sep = POP();
1092 	arr = POP_PARAM();
1093 	if (arr->type != Node_var_array)
1094 		fatal(_("patsplit: second argument is not an array"));
1095 	check_symtab_functab(arr, "patsplit",
1096 			_("%s: cannot use %s as second argument"));
1097 
1098 	src = TOP_STRING();
1099 
1100 	if ((sep->flags & REGEX) != 0)
1101 		sep = sep->typed_re;
1102 
1103 	fpat = sep->re_exp;
1104 	if (fpat->stlen == 0)
1105 		fatal(_("patsplit: third argument must be non-null"));
1106 
1107 	if (sep_arr != NULL) {
1108 		if (sep_arr == arr)
1109 			fatal(_("patsplit: cannot use the same array for second and fourth args"));
1110 
1111 		/* These checks need to be done before clearing any of the arrays */
1112 		for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
1113 			if (tmp == arr)
1114 				fatal(_("patsplit: cannot use a subarray of second arg for fourth arg"));
1115 		for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
1116 			if (tmp == sep_arr)
1117 				fatal(_("patsplit: cannot use a subarray of fourth arg for second arg"));
1118 		assoc_clear(sep_arr);
1119 	}
1120 	assoc_clear(arr);
1121 
1122 	if (src->stlen == 0) {
1123 		/*
1124 		 * Skip the work if first arg is the null string.
1125 		 */
1126 		tmp =  make_number((AWKNUM) 0);
1127 	} else {
1128 		rp = re_update(sep);
1129 		s = src->stptr;
1130 		tmp = make_number((AWKNUM) fpat_parse_field(UNLIMITED, &s,
1131 				(int) src->stlen, fpat, rp,
1132 				set_element, arr, sep_arr, false));
1133 	}
1134 
1135 	src = POP_SCALAR();	/* really pop off stack */
1136 	DEREF(src);
1137 	return tmp;
1138 }
1139 
1140 /* set_parser --- update the current (non-API) parser */
1141 
1142 static void
set_parser(parse_field_func_t func)1143 set_parser(parse_field_func_t func)
1144 {
1145 	normal_parse_field = func;
1146 	if (! api_parser_override && parse_field != func) {
1147 		parse_field = func;
1148 	        update_PROCINFO_str("FS", current_field_sep_str());
1149 	}
1150 }
1151 
1152 /* set_FIELDWIDTHS --- handle an assignment to FIELDWIDTHS */
1153 
1154 void
set_FIELDWIDTHS()1155 set_FIELDWIDTHS()
1156 {
1157 	char *scan;
1158 	char *end;
1159 	int i;
1160 	static int fw_alloc = 4;
1161 	static bool warned = false;
1162 	bool fatal_error = false;
1163 	NODE *tmp;
1164 
1165 	if (do_lint_extensions && ! warned) {
1166 		warned = true;
1167 		lintwarn(_("`FIELDWIDTHS' is a gawk extension"));
1168 	}
1169 	if (do_traditional)	/* quick and dirty, does the trick */
1170 		return;
1171 
1172 	/*
1173 	 * If changing the way fields are split, obey least-surprise
1174 	 * semantics, and force $0 to be split totally.
1175 	 */
1176 	if (fields_arr != NULL)
1177 		(void) get_field(UNLIMITED - 1, 0);
1178 
1179 	set_parser(fw_parse_field);
1180 	tmp = force_string(FIELDWIDTHS_node->var_value);
1181 	scan = tmp->stptr;
1182 
1183 	if (FIELDWIDTHS == NULL) {
1184 		emalloc(FIELDWIDTHS, awk_fieldwidth_info_t *, awk_fieldwidth_info_size(fw_alloc), "set_FIELDWIDTHS");
1185 		FIELDWIDTHS->use_chars = awk_true;
1186 	}
1187 	FIELDWIDTHS->nf = 0;
1188 	for (i = 0; ; i++) {
1189 		unsigned long int tmp;
1190 		if (i >= fw_alloc) {
1191 			fw_alloc *= 2;
1192 			erealloc(FIELDWIDTHS, awk_fieldwidth_info_t *, awk_fieldwidth_info_size(fw_alloc), "set_FIELDWIDTHS");
1193 		}
1194 		/* Ensure that there is no leading `-' sign.  Otherwise,
1195 		   strtoul would accept it and return a bogus result.  */
1196 		while (is_blank(*scan)) {
1197 			++scan;
1198 		}
1199 		if (*scan == '-') {
1200 			fatal_error = true;
1201 			break;
1202 		}
1203 		if (*scan == '\0')
1204 			break;
1205 
1206 		// Look for skip value. We allow N:M and N:*.
1207 		/*
1208 		 * Detect an invalid base-10 integer, a valid value that
1209 		 * is followed by something other than a blank or '\0',
1210 		 * or a value that is not in the range [1..UINT_MAX].
1211 		 */
1212 		errno = 0;
1213 		tmp = strtoul(scan, &end, 10);
1214 		if (errno == 0 && *end == ':' && (0 < tmp && tmp <= UINT_MAX)) {
1215 			FIELDWIDTHS->fields[i].skip = tmp;
1216 			scan = end + 1;
1217 			if (*scan == '-' || is_blank(*scan)) {
1218 				fatal_error = true;
1219 				break;
1220 			}
1221 			// try scanning for field width
1222 			tmp = strtoul(scan, &end, 10);
1223 		}
1224 		else
1225 			FIELDWIDTHS->fields[i].skip = 0;
1226 
1227 		if (errno != 0
1228 		    	|| (*end != '\0' && ! is_blank(*end))
1229 				|| !(0 < tmp && tmp <= UINT_MAX)
1230 		) {
1231 			if (*scan == '*') {
1232 				for (scan++; is_blank(*scan); scan++)
1233 					continue;
1234 
1235 				if (*scan != '\0')
1236 					fatal(_("`*' must be the last designator in FIELDWIDTHS"));
1237 
1238 				FIELDWIDTHS->fields[i].len = UINT_MAX;
1239 				FIELDWIDTHS->nf = i+1;
1240 			}
1241 			else
1242 				fatal_error = true;
1243 			break;
1244 		}
1245 		FIELDWIDTHS->fields[i].len = tmp;
1246 		FIELDWIDTHS->nf = i+1;
1247 		scan = end;
1248 		/* Skip past any trailing blanks.  */
1249 		while (is_blank(*scan)) {
1250 			++scan;
1251 		}
1252 		if (*scan == '\0')
1253 			break;
1254 	}
1255 
1256 	if (fatal_error)
1257 		fatal(_("invalid FIELDWIDTHS value, for field %d, near `%s'"),
1258 			      i + 1, scan);
1259 }
1260 
/*
 * set_FS --- handle things when FS is assigned to
 *
 * Chooses the field parsing routine that matches the current values of
 * FS and RS and, when needed, (re)builds the case-sensitive and
 * case-insensitive regexps FS_re_yes_case / FS_re_no_case, pointing
 * FS_regexp at the one selected by IGNORECASE.
 *
 * Copies of the last FS and RS values seen are kept in function-local
 * statics so that an unchanged FS (e.g. FS = FS, or a call made only
 * because IGNORECASE changed) does not recompile the regexps.
 */

void
set_FS()
{
	/* scratch regexp text; the longest literal stored below is 7 bytes plus NUL */
	char buf[10];
	NODE *fs;
	static NODE *save_fs = NULL;	/* FS value seen by the previous call */
	static NODE *save_rs = NULL;	/* RS value seen by the previous call */
	bool remake_re = true;

	/*
	 * If changing the way fields are split, obey least-surprise
	 * semantics, and force $0 to be split totally.
	 */
	if (fields_arr != NULL)
		(void) get_field(UNLIMITED - 1, 0);

	/* It's possible that only IGNORECASE changed, or FS = FS */
	/*
	 * This comparison can't use cmp_nodes(), which pays attention
	 * to IGNORECASE, and that's not what we want.
	 */
	if (save_fs
		&& FS_node->var_value->stlen == save_fs->stlen
		&& memcmp(FS_node->var_value->stptr, save_fs->stptr, save_fs->stlen) == 0
		&& save_rs
		&& RS_node->var_value->stlen == save_rs->stlen
		&& memcmp(RS_node->var_value->stptr, save_rs->stptr, save_rs->stlen) == 0) {
		/* same FS and RS as last time: just pick the regexp for the current IGNORECASE */
		if (FS_regexp != NULL)
			FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);

		/* FS = FS */
		if (current_field_sep() == Using_FS) {
			return;
		} else {
			/*
			 * Some other mechanism is active; reselect the FS
			 * parser below but keep the existing regexps.
			 */
			remake_re = false;
			goto choose_fs_function;
		}
	}

	/* FS or RS changed: remember the new values for the next call */
	unref(save_fs);
	save_fs = dupnode(FS_node->var_value);
	unref(save_rs);
	save_rs = dupnode(RS_node->var_value);
	resave_fs = true;

	/* If FS_re_no_case assignment is fatal (make_regexp in remake_re)
	 * FS_regexp will be NULL with a non-null FS_re_yes_case.
	 * refree() handles null argument; no need for `if (FS_regexp != NULL)' below.
	 * Please do not remerge.
	 */
	refree(FS_re_yes_case);
	refree(FS_re_no_case);
	FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;


choose_fs_function:
	/* an empty buf means "no synthesized regexp text" */
	buf[0] = '\0';
	default_FS = false;
	fs = force_string(FS_node->var_value);

	if (! do_traditional && fs->stlen == 0) {
		static bool warned = false;

		set_parser(null_parse_field);

		if (do_lint_extensions && ! warned) {
			warned = true;
			lintwarn(_("null string for `FS' is a gawk extension"));
		}
	} else if (fs->stlen > 1 || (fs->flags & REGEX) != 0) {
		/* more than one character, or a regexp-typed value */
		if (do_lint_old)
			lintwarn(_("old awk does not support regexps as value of `FS'"));
		set_parser(re_parse_field);
	} else if (RS_is_null) {
		/* we know that fs->stlen <= 1 */
		set_parser(sc_parse_field);
		if (fs->stlen == 1) {
			/*
			 * Build a bracket expression in buf that matches the
			 * FS character or a newline.  Nothing is built when
			 * FS is itself a newline.
			 */
			if (fs->stptr[0] == ' ') {
				default_FS = true;
				strcpy(buf, "[ \t\n]+");
			} else if (fs->stptr[0] == '\\') {
				/* yet another special case */
				strcpy(buf, "[\\\\\n]");
			} else if (fs->stptr[0] == '\0') {
				/* and yet another special case */
				strcpy(buf, "[\\000\n]");
			} else if (fs->stptr[0] != '\n') {
				sprintf(buf, "[%c\n]", fs->stptr[0]);
			}
		}
	} else {
		set_parser(def_parse_field);

		if (fs->stlen == 1) {
			if (fs->stptr[0] == ' ')
				default_FS = true;
			else if (fs->stptr[0] == '\\')
				/* same special case */
				strcpy(buf, "[\\\\]");
			else
				set_parser(sc_parse_field);
		}
	}
	if (remake_re) {
		refree(FS_re_yes_case);
		refree(FS_re_no_case);
		FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;

		if (buf[0] != '\0') {
			/* compile the synthesized regexp and switch to regexp parsing */
			FS_re_yes_case = make_regexp(buf, strlen(buf), false, true, true);
			FS_re_no_case = make_regexp(buf, strlen(buf), true, true, true);
			FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
			set_parser(re_parse_field);
		} else if (parse_field == re_parse_field) {
			/* regexp parsing was selected above: compile FS itself */
			FS_re_yes_case = make_regexp(fs->stptr, fs->stlen, false, true, true);
			FS_re_no_case = make_regexp(fs->stptr, fs->stlen, true, true, true);
			FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
		} else
			FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
	}

	/*
	 * For FS = "c", we don't use IGNORECASE. But we must use
	 * re_parse_field to get the character and the newline as
	 * field separators.
	 */
	if (fs->stlen == 1 && parse_field == re_parse_field)
		FS_regexp = FS_re_yes_case;
}
1392 
1393 /* current_field_sep --- return the field separator type */
1394 
1395 field_sep_type
current_field_sep()1396 current_field_sep()
1397 {
1398 	if (api_parser_override)
1399 		return Using_API;
1400 	else if (parse_field == fw_parse_field)
1401 		return Using_FIELDWIDTHS;
1402 	else if (parse_field == fpat_parse_field)
1403 		return Using_FPAT;
1404 	else
1405 		return Using_FS;
1406 }
1407 
1408 /* current_field_sep_str --- return the field separator type as a string */
1409 
1410 const char *
current_field_sep_str()1411 current_field_sep_str()
1412 {
1413 	if (api_parser_override)
1414 		return "API";
1415 	else if (parse_field == fw_parse_field)
1416 		return "FIELDWIDTHS";
1417 	else if (parse_field == fpat_parse_field)
1418 		return "FPAT";
1419 	else
1420 		return "FS";
1421 }
1422 
1423 /* update_PROCINFO_str --- update PROCINFO[sub] with string value */
1424 
1425 void
update_PROCINFO_str(const char * subscript,const char * str)1426 update_PROCINFO_str(const char *subscript, const char *str)
1427 {
1428 	NODE *tmp;
1429 
1430 	if (PROCINFO_node == NULL)
1431 		return;
1432 	tmp = make_string(subscript, strlen(subscript));
1433 	assoc_set(PROCINFO_node, tmp, make_string(str, strlen(str)));
1434 }
1435 
1436 /* update_PROCINFO_num --- update PROCINFO[sub] with numeric value */
1437 
1438 void
update_PROCINFO_num(const char * subscript,AWKNUM val)1439 update_PROCINFO_num(const char *subscript, AWKNUM val)
1440 {
1441 	NODE *tmp;
1442 
1443 	if (PROCINFO_node == NULL)
1444 		return;
1445 	tmp = make_string(subscript, strlen(subscript));
1446 	assoc_set(PROCINFO_node, tmp, make_number(val));
1447 }
1448 
1449 /* set_FPAT --- handle an assignment to FPAT */
1450 
1451 void
set_FPAT()1452 set_FPAT()
1453 {
1454 	static bool warned = false;
1455 	bool remake_re = true;
1456 	NODE *fpat;
1457 
1458 	if (do_lint_extensions && ! warned) {
1459 		warned = true;
1460 		lintwarn(_("`FPAT' is a gawk extension"));
1461 	}
1462 	if (do_traditional)	/* quick and dirty, does the trick */
1463 		return;
1464 
1465 	/*
1466 	 * If changing the way fields are split, obey least-suprise
1467 	 * semantics, and force $0 to be split totally.
1468 	 */
1469 	if (fields_arr != NULL)
1470 		(void) get_field(UNLIMITED - 1, 0);
1471 
1472 	/* It's possible that only IGNORECASE changed, or FPAT = FPAT */
1473 	/*
1474 	 * This comparison can't use cmp_nodes(), which pays attention
1475 	 * to IGNORECASE, and that's not what we want.
1476 	 */
1477 	if (save_FPAT
1478 		&& FPAT_node->var_value->stlen == save_FPAT->stlen
1479 		&& memcmp(FPAT_node->var_value->stptr, save_FPAT->stptr, save_FPAT->stlen) == 0) {
1480 		if (FPAT_regexp != NULL)
1481 			FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case);
1482 
1483 		/* FPAT = FPAT */
1484 		if (current_field_sep() == Using_FPAT) {
1485 			return;
1486 		} else {
1487 			remake_re = false;
1488 			goto set_fpat_function;
1489 		}
1490 	}
1491 
1492 	unref(save_FPAT);
1493 	save_FPAT = dupnode(FPAT_node->var_value);
1494 	refree(FPAT_re_yes_case);
1495 	refree(FPAT_re_no_case);
1496 	FPAT_re_yes_case = FPAT_re_no_case = FPAT_regexp = NULL;
1497 
1498 set_fpat_function:
1499 	fpat = force_string(FPAT_node->var_value);
1500 	set_parser(fpat_parse_field);
1501 
1502 	if (remake_re) {
1503 		refree(FPAT_re_yes_case);
1504 		refree(FPAT_re_no_case);
1505 		FPAT_re_yes_case = FPAT_re_no_case = FPAT_regexp = NULL;
1506 
1507 		FPAT_re_yes_case = make_regexp(fpat->stptr, fpat->stlen, false, true, true);
1508 		FPAT_re_no_case = make_regexp(fpat->stptr, fpat->stlen, true, true, true);
1509 		FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case);
1510 	}
1511 }
1512 
1513 /*
1514  * increment_scan --- macro to move scan pointer ahead by one character.
1515  * 			Implementation varies if doing MBS or not.
1516  */
1517 
1518 #define increment_scan(scanp, len) incr_scan(scanp, len, & mbs)
1519 
1520 /* incr_scan --- MBS version of increment_scan() */
1521 
1522 static void
incr_scan(char ** scanp,size_t len,mbstate_t * mbs)1523 incr_scan(char **scanp, size_t len, mbstate_t *mbs)
1524 {
1525 	size_t mbclen = 0;
1526 
1527 	if (gawk_mb_cur_max > 1) {
1528 		mbclen = mbrlen(*scanp, len, mbs);
1529 		if (   (mbclen == 1)
1530 		    || (mbclen == (size_t) -1)
1531 		    || (mbclen == (size_t) -2)
1532 		    || (mbclen == 0)) {
1533 			/* We treat it as a singlebyte character.  */
1534 			mbclen = 1;
1535 		}
1536 		*scanp += mbclen;
1537 	} else
1538 		(*scanp)++;
1539 }
1540 
1541 /*
1542  * fpat_parse_field --- parse fields using a regexp.
1543  *
1544  * This is called both from get_field() and from do_patsplit()
1545  * via (*parse_field)().  This variation is for when FPAT is a regular
1546  * expression -- use the value to find field contents.
1547  *
1548  * The FPAT parsing logic is a bit difficult to specify. In particular
1549  * to allow null fields at certain locations. To make the code as robust
1550  * as possible, an awk reference implementation was written and tested
1551  * as a first step, and later recoded in C, preserving its structure as
1552  * much as possible.
1553  *
1554  * # Reference implementation of the FPAT record parsing.
1555  * #
1556  * # Each loop iteration identifies a (separator[n-1],field[n]) pair.
1557  * # Each loop iteration must consume some characters, except for the first field.
1558  * # So a null field is only valid as a first field or after a non-null separator.
1559  * # A null record has no fields (not a single null field).
1560  *
1561  * function refpatsplit(string, fields, pattern, seps,
1562  *         parse_start, sep_start, field_start, field_length, field_found, nf) # locals
1563  * {
1564  *     # Local state variables:
1565  *     # - parse_start: pointer to the first not yet consumed character
1566  *     # - sep_start: pointer to the beginning of the parsed separator
1567  *     # - field start: pointer to the beginning of the parsed field
1568  *     # - field length: length of the parsed field
1569  *     # - field_found: flag for succesful field match
1570  *     # - nf: Number of fields found so far
1571  *
1572  *     # Prepare for parsing
1573  *     parse_start = 1   # first not yet parsed char
1574  *     nf = 0            # fields found so far
1575  *     delete fields
1576  *     delete seps
1577  *
1578  *     # Loop that consumes the whole record
1579  *     while (parse_start <= length(string)) {  # still something to parse
1580  *
1581  *         # first attempt to match the next field
1582  *         sep_start = parse_start
1583  *         field_found = match(substr(string, parse_start), pattern)
1584  *
1585  *         # check for an invalid null field and retry one character away
1586  *         if (nf > 0 && field_found && RSTART == 1 && RLENGTH == 0) {
1587  *             parse_start++
1588  *             field_found = match(substr(string, parse_start), pattern)
1589  *         }
1590  *
1591  *         # store the (sep[n-1],field[n]) pair
1592  *         if (field_found) {
1593  *             field_start = parse_start + RSTART - 1
1594  *             field_length = RLENGTH
1595  *             seps[nf] = substr(string, sep_start, field_start-sep_start)
1596  *             fields[++nf] = substr(string, field_start, field_length)
1597  *             parse_start = field_start + field_length
1598  *
1599  *         # store the final extra sep after the last field
1600  *         } else {
1601  *             seps[nf] = substr(string, sep_start)
1602  *             parse_start = length(string) + 1
1603  *         }
1604  *     }
1605  *
1606  *     return nf
1607  * }
1608  */
1609 static long
fpat_parse_field(long up_to,char ** buf,int len,NODE * fs ATTRIBUTE_UNUSED,Regexp * rp,Setfunc set,NODE * n,NODE * sep_arr,bool in_middle)1610 fpat_parse_field(long up_to,	/* parse only up to this field number */
1611 	char **buf,	/* on input: string to parse; on output: point to start next */
1612 	int len,
1613 	NODE *fs ATTRIBUTE_UNUSED,
1614 	Regexp *rp,
1615 	Setfunc set,	/* routine to set the value of the parsed field */
1616 	NODE *n,
1617 	NODE *sep_arr,  /* array of field separators (may be NULL) */
1618 	bool in_middle)
1619 {
1620 	char *scan = *buf;
1621 	long nf = parse_high_water;
1622 	char *start;
1623 	char *end = scan + len;
1624 	int regex_flags = RE_NEED_START;
1625 	mbstate_t mbs;
1626 	char* field_start;
1627 	bool field_found = false;
1628 
1629 	memset(&mbs, 0, sizeof(mbstate_t));
1630 
1631 	if (up_to == UNLIMITED)
1632 		nf = 0;
1633 
1634 	if (len == 0)
1635 		return nf;
1636 
1637 	if (rp == NULL) /* use FPAT */
1638 		rp = FPAT_regexp;
1639 
1640 	while (scan < end && nf < up_to) {  /* still something to parse */
1641 
1642 		/* first attempt to match the next field */
1643 		start = scan;
1644 		field_found = research(rp, scan, 0, (end - scan), regex_flags) != -1;
1645 
1646 		/* check for an invalid null field and retry one character away */
1647 		if (nf > 0 && field_found && REEND(rp, scan) == 0) { /* invalid null field */
1648 			increment_scan(& scan, end - scan);
1649 			field_found = research(rp, scan, 0, (end - scan), regex_flags) != -1;
1650 		}
1651 
1652 		/* store the (sep[n-1],field[n]) pair */
1653 		if (field_found) {
1654 			field_start = scan + RESTART(rp, scan);
1655 			if (sep_arr != NULL) { /* store the separator */
1656 				if (field_start == start) /* match at front */
1657 					set_element(nf, start, 0L, sep_arr);
1658 				else
1659 					set_element(nf,
1660 						start,
1661 						(long) (field_start - start),
1662 						sep_arr);
1663 			}
1664 			/* field is text that matched */
1665 			(*set)(++nf,
1666 				field_start,
1667 				(long)(REEND(rp, scan) - RESTART(rp, scan)),
1668 				n);
1669 			scan += REEND(rp, scan);
1670 
1671 		} else {
1672 			/*
1673 			 * No match, store the final extra separator after
1674 			 * the last field.
1675 			 */
1676 			if (sep_arr != NULL)
1677 				set_element(nf, start, (long) (end - start), sep_arr);
1678 			scan = end;
1679 		}
1680 	}
1681 
1682 	/*
1683 	 * If the last field extends up to the end of the record, generate
1684 	 * a null trailing separator
1685 	 */
1686 	if (sep_arr != NULL && scan == end && field_found)
1687 		set_element(nf, scan, 0L, sep_arr);
1688 
1689 	*buf = scan;
1690 	return nf;
1691 }
1692