1 /*
2  * re.c - compile regular expressions.
3  */
4 
5 /*
6  * Copyright (C) 1991-2019, 2021 the Free Software Foundation, Inc.
7  *
8  * This file is part of GAWK, the GNU implementation of the
9  * AWK Programming Language.
10  *
11  * GAWK is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 3 of the License, or
14  * (at your option) any later version.
15  *
16  * GAWK is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, write to the Free Software
23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
24  */
25 
26 #include "awk.h"
27 
28 #include "localeinfo.h"
29 
30 static reg_syntax_t syn;
31 static void check_bracket_exp(char *s, size_t len);
32 const char *regexflags2str(int flags);
33 
34 static struct localeinfo localeinfo;
35 
36 /* make_regexp --- generate compiled regular expressions */
37 
38 Regexp *
make_regexp(const char * s,size_t len,bool ignorecase,bool dfa,bool canfatal)39 make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
40 {
41 	static char metas[] = ".*+(){}[]|?^$\\";
42 	Regexp *rp;
43 	const char *rerr;
44 	const char *src = s;
45 	static char *buf = NULL;
46 	static size_t buflen;
47 	const char *end = s + len;
48 	char *dest;
49 	int c, c2;
50 	static bool first = true;
51 	static bool no_dfa = false;
52 	int i;
53 	static struct dfa* dfaregs[2] = { NULL, NULL };
54 	static bool nul_warned = false;
55 
56 	if (do_lint && ! nul_warned && memchr(s, '\0', len) != NULL) {
57 		nul_warned = true;
58 		lintwarn(_("behavior of matching a regexp containing NUL characters is not defined by POSIX"));
59 	}
60 
61 	/*
62 	 * The number of bytes in the current multibyte character.
63 	 * It is 0, when the current character is a singlebyte character.
64 	 */
65 	size_t is_multibyte = 0;
66 	mbstate_t mbs;
67 
68 	memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize.  */
69 
70 	if (first) {
71 		/* for debugging and testing */
72 		no_dfa = (getenv("GAWK_NO_DFA") != NULL);
73 		/* don't set first to false here, we do it below */
74 	}
75 
76 	/* always check */
77 	check_bracket_exp((char *) s, len);
78 
79 	/* Handle escaped characters first. */
80 
81 	/*
82 	 * Build a copy of the string (in buf) with the
83 	 * escaped characters translated, and generate the regex
84 	 * from that.
85 	 */
86 	if (buf == NULL) {
87 		emalloc(buf, char *, len + 1, "make_regexp");
88 		buflen = len;
89 	} else if (len > buflen) {
90 		erealloc(buf, char *, len + 1, "make_regexp");
91 		buflen = len;
92 	}
93 	dest = buf;
94 
95 	while (src < end) {
96 		if (gawk_mb_cur_max > 1 && ! is_multibyte) {
97 			/* The previous byte is a singlebyte character, or last byte
98 			   of a multibyte character.  We check the next character.  */
99 			is_multibyte = mbrlen(src, end - src, &mbs);
100 			if (   is_multibyte == 1
101 			    || is_multibyte == (size_t) -1
102 			    || is_multibyte == (size_t) -2
103 			    || is_multibyte == 0) {
104 				/* We treat it as a single-byte character.  */
105 				is_multibyte = 0;
106 			}
107 		}
108 
109 		const char *ok_to_escape;
110 		if (do_posix)
111 			ok_to_escape = "{}()|*+?.^$\\[]/-";
112 		else if (do_traditional)
113 			ok_to_escape = "()|*+?.^$\\[]/-";
114 		else
115 			ok_to_escape = "<>`'BywWsS{}()|*+?.^$\\[]/-";
116 
117 		/* We skip multibyte character, since it must not be a special
118 		   character.  */
119 		if ((gawk_mb_cur_max == 1 || ! is_multibyte) &&
120 		    (*src == '\\')) {
121 			c = *++src;
122 			switch (c) {
123 			case '\0':	/* \\ before \0, either dynamic data or real end of string */
124 				if (src >= s + len)
125 					*dest++ = '\\';	// at end of string, will fatal below
126 				else
127 					fatal(_("invalid NUL byte in dynamic regexp"));
128 				break;
129 			case 'a':
130 			case 'b':
131 			case 'f':
132 			case 'n':
133 			case 'r':
134 			case 't':
135 			case 'v':
136 			case 'x':
137 			case '0':
138 			case '1':
139 			case '2':
140 			case '3':
141 			case '4':
142 			case '5':
143 			case '6':
144 			case '7':
145 				c2 = parse_escape(&src);
146 				if (c2 < 0)
147 					cant_happen();
148 				/*
149 				 * Unix awk treats octal (and hex?) chars
150 				 * literally in re's, so escape regexp
151 				 * metacharacters.
152 				 */
153 				if (do_traditional
154 				    && ! do_posix
155 				    && (isdigit(c) || c == 'x')
156 				    && strchr("()|*+?.^$\\[]", c2) != NULL)
157 					*dest++ = '\\';
158 				*dest++ = (char) c2;
159 				if (do_lint
160 				    && ! nul_warned
161 				    && c2 == '\0') {
162 					nul_warned = true;
163 					lintwarn(_("behavior of matching a regexp containing NUL characters is not defined by POSIX"));
164 				}
165 				break;
166 			case '8':
167 			case '9':	/* a\9b not valid */
168 				*dest++ = c;
169 				src++;
170 			{
171 				static bool warned[2];
172 
173 				if (! warned[c - '8']) {
174 					warning(_("regexp escape sequence `\\%c' treated as plain `%c'"), c, c);
175 					warned[c - '8'] = true;
176 				}
177 			}
178 				break;
179 			case 'y':	/* normally \b */
180 				/* gnu regex op */
181 				if (! do_traditional) {
182 					*dest++ = '\\';
183 					*dest++ = 'b';
184 					src++;
185 					break;
186 				}
187 				/* else, fall through */
188 			default:
189 				if (strchr(ok_to_escape, c) == NULL) {
190 					static bool warned[256];
191 
192 					if (! warned[c & 0xFF]) {
193 						warning(_("regexp escape sequence `\\%c' is not a known regexp operator"), c);
194 						warned[c & 0xFF] = true;
195 					}
196 				}
197 				*dest++ = '\\';
198 				*dest++ = (char) c;
199 				src++;
200 				break;
201 			} /* switch */
202 		} else {
203 			c = *src;
204 			*dest++ = *src++;	/* not '\\' */
205 		}
206 		if (gawk_mb_cur_max > 1 && is_multibyte)
207 			is_multibyte--;
208 	} /* while */
209 
210 	*dest = '\0';
211 	len = dest - buf;
212 
213 	ezalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
214 	rp->pat.allocated = 0;	/* regex will allocate the buffer */
215 	emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
216 
217 	/*
218 	 * Lo these many years ago, had I known what a P.I.T.A. IGNORECASE
219 	 * was going to turn out to be, I wouldn't have bothered with it.
220 	 *
221 	 * In the case where we have a multibyte character set, we have no
222 	 * choice but to use RE_ICASE, since the casetable is for single-byte
223 	 * character sets only.
224 	 *
225 	 * On the other hand, if we do have a single-byte character set,
226 	 * using the casetable should give  a performance improvement, since
227 	 * it's computed only once, not each time a regex is compiled.  We
228 	 * also think it's probably better for portability.  See the
229 	 * discussion by the definition of casetable[] in eval.c.
230 	 */
231 
232 	ignorecase = !! ignorecase;	/* force to 1 or 0 */
233 	if (ignorecase) {
234 		if (gawk_mb_cur_max > 1) {
235 			syn |= RE_ICASE;
236 			rp->pat.translate = NULL;
237 		} else {
238 			syn &= ~RE_ICASE;
239 			rp->pat.translate = (RE_TRANSLATE_TYPE) casetable;
240 		}
241 	} else {
242 		rp->pat.translate = NULL;
243 		syn &= ~RE_ICASE;
244 	}
245 
246 	/* initialize dfas to hold syntax */
247 	if (first) {
248 		first = false;
249 		dfaregs[0] = dfaalloc();
250 		dfaregs[1] = dfaalloc();
251 		dfasyntax(dfaregs[0], & localeinfo, syn, DFA_ANCHOR);
252 		dfasyntax(dfaregs[1], & localeinfo, syn | RE_ICASE, DFA_ANCHOR);
253 	}
254 
255 	re_set_syntax(syn);
256 
257 	if ((rerr = re_compile_pattern(buf, len, &(rp->pat))) != NULL) {
258 		refree(rp);
259 		if (! canfatal) {
260 			/* rerr already gettextized inside regex routines */
261 			error("%s: /%.*s/", rerr, (int) len, s);
262  			return NULL;
263 		}
264 		fatal("invalid regexp: %s: /%.*s/", rerr, (int) len, s);
265 	}
266 
267 	/* gack. this must be done *after* re_compile_pattern */
268 	rp->pat.newline_anchor = false; /* don't get \n in middle of string */
269 	if (dfa && ! no_dfa) {
270 		rp->dfareg = dfaalloc();
271 		dfacopysyntax(rp->dfareg, dfaregs[ignorecase]);
272 		dfacomp(buf, len, rp->dfareg, true);
273 	} else
274 		rp->dfareg = NULL;
275 
276 	/* Additional flags that help with RS as regexp. */
277 	for (i = 0; i < len; i++) {
278 		if (strchr(metas, buf[i]) != NULL) {
279 			rp->has_meta = true;
280 			break;
281 		}
282 	}
283 
284 	for (i = len - 1; i >= 0; i--) {
285 		if (strchr("*+|?{}", buf[i]) != NULL) {
286 			rp->maybe_long = true;
287 			break;
288 		}
289 	}
290 
291 	return rp;
292 }
293 
294 /* research --- do a regexp search. use dfa if possible */
295 
296 int
research(Regexp * rp,char * str,int start,size_t len,int flags)297 research(Regexp *rp, char *str, int start,
298 	 size_t len, int flags)
299 {
300 	const char *ret = str;
301 	bool try_backref = false;
302 	int need_start;
303 	int no_bol;
304 	int res;
305 
306 	need_start = ((flags & RE_NEED_START) != 0);
307 	no_bol = ((flags & RE_NO_BOL) != 0);
308 
309 	if (no_bol)
310 		rp->pat.not_bol = 1;
311 
312 	/*
313 	 * Always do dfa search if can; if it fails, then even if
314 	 * need_start is true, we won't bother with the regex search.
315 	 *
316 	 * The dfa matcher doesn't have a no_bol flag, so don't bother
317 	 * trying it in that case.
318 	 *
319 	 * 7/2008: Skip the dfa matcher if need_start. The dfa matcher
320 	 * has bugs in certain multibyte cases and it's too difficult
321 	 * to try to special case things.
322 	 * 7/2017: Apparently there are some cases where DFA gets
323 	 * stuck, even in the C locale, so we use dfa only if not need_start.
324 	 *
325 	 * Should that issue ever get resolved, note this comment:
326 	 *
327 	 * 7/2016: The dfa matcher can't handle a case where searching
328 	 * starts in the middle of a string, so don't bother trying it
329 	 * in that case.
330 	 *	if (rp->dfa && ! no_bol && start == 0) ...
331 	 */
332 	if (rp->dfareg != NULL && ! no_bol && ! need_start) {
333 		struct dfa *superset = dfasuperset(rp->dfareg);
334 		if (superset)
335 			ret = dfaexec(superset, str+start, str+start+len,
336 							true, NULL, NULL);
337 
338 		if (ret && (! need_start
339 				|| (! superset && dfaisfast(rp->dfareg))))
340 			ret = dfaexec(rp->dfareg, str+start, str+start+len,
341 						true, NULL, &try_backref);
342 	}
343 
344 	if (ret) {
345 		if (   rp->dfareg == NULL
346 			|| start != 0
347 			|| no_bol
348 			|| need_start
349 			|| try_backref) {
350 			/*
351 			 * Passing NULL as last arg speeds up search for cases
352 			 * where we don't need the start/end info.
353 			 */
354 			res = re_search(&(rp->pat), str, start+len,
355 				start, len, need_start ? &(rp->regs) : NULL);
356 		} else
357 			res = 1;
358 	} else
359 		res = -1;
360 
361 	rp->pat.not_bol = 0;
362 	return res;
363 }
364 
365 /* refree --- free up the dynamic memory used by a compiled regexp */
366 
367 void
refree(Regexp * rp)368 refree(Regexp *rp)
369 {
370 	if (rp == NULL)
371 		return;
372 	rp->pat.translate = NULL;
373 	regfree(& rp->pat);
374 	if (rp->regs.start)
375 		free(rp->regs.start);
376 	if (rp->regs.end)
377 		free(rp->regs.end);
378 	if (rp->dfareg != NULL) {
379 		dfafree(rp->dfareg);
380 		free(rp->dfareg);
381 	}
382 	efree(rp);
383 }
384 
385 /* dfaerror --- print an error message for the dfa routines */
386 
387 void
dfaerror(const char * s)388 dfaerror(const char *s)
389 {
390 	fatal("%s", s);
391 	exit(EXIT_FATAL);	/* for DJGPP */
392 }
393 
394 /* re_cache_get --- populate regexp cache if empty */
395 
396 static inline Regexp *
re_cache_get(NODE * t)397 re_cache_get(NODE *t)
398 {
399 	if (t->re_reg[IGNORECASE] == NULL)
400 		t->re_reg[IGNORECASE] = make_regexp(t->re_exp->stptr, t->re_exp->stlen, IGNORECASE, t->re_cnt, true);
401 	return t->re_reg[IGNORECASE];
402 }
403 
404 /* re_update --- recompile a dynamic regexp */
405 
406 Regexp *
re_update(NODE * t)407 re_update(NODE *t)
408 {
409 	NODE *t1;
410 
411 	if (t->type == Node_val && (t->flags & REGEX) != 0)
412 		return re_cache_get(t->typed_re);
413 
414 	if ((t->re_flags & CONSTANT) != 0) {
415 		/* it's a constant, so just return it as is */
416 		assert(t->type == Node_regex);
417 		return re_cache_get(t);
418 	}
419 	t1 = t->re_exp;
420 	if (t->re_text != NULL) {
421 		/* if contents haven't changed, just return it */
422 		if (cmp_nodes(t->re_text, t1, true) == 0)
423 			return re_cache_get(t);
424 		/* things changed, fall through to recompile */
425 		unref(t->re_text);
426 	}
427 	/* get fresh copy of the text of the regexp */
428 	t->re_text = dupnode(t1);
429 
430 	/* text changed */
431 
432 	/* free old */
433 	if (t->re_reg[0] != NULL) {
434 		refree(t->re_reg[0]);
435 		t->re_reg[0] = NULL;
436 	}
437 	if (t->re_reg[1] != NULL) {
438 		refree(t->re_reg[1]);
439 		t->re_reg[1] = NULL;
440 	}
441 	if (t->re_cnt > 0 && ++t->re_cnt > 10)
442 		/*
443 		 * The regex appears to update frequently, so disable DFA
444 		 * matching (which trades off expensive upfront compilation
445 		 * overhead for faster subsequent matching).
446 		 */
447 		t->re_cnt = 0;
448 	if (t->re_text == NULL) {
449 		/* reset regexp text if needed */
450 		t1 = t->re_exp;
451 		unref(t->re_text);
452 		t->re_text = dupnode(t1);
453 	}
454 	return re_cache_get(t);
455 }
456 
457 /* resetup --- choose what kind of regexps we match */
458 
459 void
resetup()460 resetup()
461 {
462 	// init localeinfo for dfa
463 	init_localeinfo(& localeinfo);
464 
465 	/*
466 	 * Syntax bits: _that_ is yet another mind trip.  Recreational drugs
467 	 * are helpful for recovering from the experience.
468 	 *
469 	 *	Aharon Robbins <arnold@skeeve.com>
470 	 *	Sun, 21 Oct 2007 23:55:33 +0200
471 	 */
472 	if (do_posix)
473 		syn = RE_SYNTAX_POSIX_AWK;	/* strict POSIX re's */
474 	else if (do_traditional)
475 		syn = RE_SYNTAX_AWK;		/* traditional Unix awk re's */
476 	else
477 		syn = RE_SYNTAX_GNU_AWK;	/* POSIX re's + GNU ops */
478 
479 	/*
480 	 * Interval expressions are now on by default, as POSIX is
481 	 * wide-spread enough that people want it. The do_intervals
482 	 * variable remains for use with --traditional.
483 	 */
484 	if (do_intervals)
485 		syn |= RE_INTERVALS | RE_INVALID_INTERVAL_ORD | RE_NO_BK_BRACES;
486 
487 	(void) re_set_syntax(syn);
488 }
489 
490 /* using_utf8 --- are we using utf8 */
491 
492 bool
using_utf8(void)493 using_utf8(void)
494 {
495 	return localeinfo.using_utf8;
496 }
497 
498 /* reisstring --- return true if the RE match is a simple string match */
499 
500 int
reisstring(const char * text,size_t len,Regexp * re,const char * buf)501 reisstring(const char *text, size_t len, Regexp *re, const char *buf)
502 {
503 	int res;
504 	const char *matched;
505 
506 	/* simple checking for meta characters in re */
507 	if (re->has_meta)
508 		return false;	/* give up early, can't be string match */
509 
510 	/* make accessable to gdb */
511 	matched = &buf[RESTART(re, buf)];
512 
513 	res = (memcmp(text, matched, len) == 0);
514 
515 	return res;
516 }
517 
518 /* reflags2str --- make a regex flags value readable */
519 
520 const char *
reflags2str(int flagval)521 reflags2str(int flagval)
522 {
523 	static const struct flagtab values[] = {
524 		{ RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" },
525 		{ RE_BK_PLUS_QM, "RE_BK_PLUS_QM" },
526 		{ RE_CHAR_CLASSES, "RE_CHAR_CLASSES" },
527 		{ RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" },
528 		{ RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" },
529 		{ RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" },
530 		{ RE_DOT_NEWLINE, "RE_DOT_NEWLINE" },
531 		{ RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" },
532 		{ RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" },
533 		{ RE_INTERVALS, "RE_INTERVALS" },
534 		{ RE_LIMITED_OPS, "RE_LIMITED_OPS" },
535 		{ RE_NEWLINE_ALT, "RE_NEWLINE_ALT" },
536 		{ RE_NO_BK_BRACES, "RE_NO_BK_BRACES" },
537 		{ RE_NO_BK_PARENS, "RE_NO_BK_PARENS" },
538 		{ RE_NO_BK_REFS, "RE_NO_BK_REFS" },
539 		{ RE_NO_BK_VBAR, "RE_NO_BK_VBAR" },
540 		{ RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" },
541 		{ RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" },
542 		{ RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" },
543 		{ RE_NO_GNU_OPS, "RE_NO_GNU_OPS" },
544 		{ RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" },
545 		{ RE_ICASE, "RE_ICASE" },
546 		{ RE_CARET_ANCHORS_HERE, "RE_CARET_ANCHORS_HERE" },
547 		{ RE_CONTEXT_INVALID_DUP, "RE_CONTEXT_INVALID_DUP" },
548 		{ RE_NO_SUB, "RE_NO_SUB" },
549 		{ 0,	NULL },
550 	};
551 
552 	if (flagval == RE_SYNTAX_EMACS) /* == 0 */
553 		return "RE_SYNTAX_EMACS";
554 
555 	return genflags2str(flagval, values);
556 }
557 
558 /*
559  * dfawarn() is called by the dfa routines whenever a regex is compiled
560  * must supply a dfawarn.
561  */
562 
563 void
dfawarn(const char * dfa_warning)564 dfawarn(const char *dfa_warning)
565 {
566 	/*
567 	 * This routine does nothing, since gawk does its own
568 	 * (better) check for bad [[:foo:]] syntax.
569 	 */
570 }
571 
572 /* check_bracket_exp --- look for /[:space:]/ that should be /[[:space:]]/ */
573 
574 static void
check_bracket_exp(char * s,size_t length)575 check_bracket_exp(char *s, size_t length)
576 {
577 	static struct reclass {
578 		const char *name;
579 		size_t len;
580 		bool warned;
581 	} classes[] = {
582 		/*
583 		 * Ordered by what we hope is frequency,
584 		 * since it's linear searched.
585 		 */
586 		{ "[:alpha:]", 9, false },
587 		{ "[:digit:]", 9, false },
588 		{ "[:alnum:]", 9, false },
589 		{ "[:upper:]", 9, false },
590 		{ "[:lower:]", 9, false },
591 		{ "[:space:]", 9, false },
592 		{ "[:xdigit:]", 10, false },
593 		{ "[:punct:]", 9, false },
594 		{ "[:print:]", 9, false },
595 		{ "[:graph:]", 9, false },
596 		{ "[:cntrl:]", 9, false },
597 		{ "[:blank:]", 9, false },
598 		{ NULL, 0 }
599 	};
600 	int i;
601 	bool found = false;
602 	char save;
603 	char *sp, *sp2, *end;
604 	int len;
605 	int count = 0;
606 
607 	if (length == 0)
608 		return;
609 
610 	end = s + length;
611 	save = s[length];
612 	s[length] = '\0';
613 	sp = s;
614 
615 again:
616 	sp = sp2 = (char *) memchr(sp, '[', (end - sp));
617 	if (sp == NULL)
618 		goto done;
619 
620 	for (count++, sp++; *sp != '\0'; sp++) {
621 		if (*sp == '[')
622 			count++;
623 		/*
624 		 * ] as first char after open [ is skipped
625 		 * \] is skipped
626 		 * [^]] is skipped
627 		 */
628 		if (*sp == ']' && sp > sp2) {
629 			 if (sp[-1] != '['
630 			     && sp[-1] != '\\')
631 				 ;
632 			 else if ((sp - sp2) >= 2
633 				  && sp[-1] == '^' && sp[-2] == '[')
634 				 ;
635 			 else
636 				count--;
637 		}
638 
639 		if (count == 0) {
640 			sp++;	/* skip past ']' */
641 			break;
642 		}
643 	}
644 
645 	if (count > 0) {	/* bad regex, give up */
646 		goto done;
647 	}
648 
649 	/* sp2 has start */
650 
651 	for (i = 0; classes[i].name != NULL; i++) {
652 		if (classes[i].warned)
653 			continue;
654 		len = classes[i].len;
655 		if (   len == (sp - sp2)
656 		    && memcmp(sp2, classes[i].name, len) == 0) {
657 			found = true;
658 			break;
659 		}
660 	}
661 
662 	if (found && ! classes[i].warned) {
663 		warning(_("regexp component `%.*s' should probably be `[%.*s]'"),
664 				len, sp2, len, sp2);
665 		classes[i].warned = true;
666 	}
667 
668 	if (sp < end) {
669 		found = false;
670 		goto again;
671 	}
672 done:
673 	s[length] = save;
674 }
675