1 /*
2 * re.c - compile regular expressions.
3 */
4
5 /*
6 * Copyright (C) 1991-2019, 2021 the Free Software Foundation, Inc.
7 *
8 * This file is part of GAWK, the GNU implementation of the
9 * AWK Programming Language.
10 *
11 * GAWK is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 3 of the License, or
14 * (at your option) any later version.
15 *
16 * GAWK is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 */
25
26 #include "awk.h"
27
28 #include "localeinfo.h"
29
30 static reg_syntax_t syn;
31 static void check_bracket_exp(char *s, size_t len);
32 const char *regexflags2str(int flags);
33
34 static struct localeinfo localeinfo;
35
36 /* make_regexp --- generate compiled regular expressions */
37
38 Regexp *
make_regexp(const char * s,size_t len,bool ignorecase,bool dfa,bool canfatal)39 make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
40 {
41 static char metas[] = ".*+(){}[]|?^$\\";
42 Regexp *rp;
43 const char *rerr;
44 const char *src = s;
45 static char *buf = NULL;
46 static size_t buflen;
47 const char *end = s + len;
48 char *dest;
49 int c, c2;
50 static bool first = true;
51 static bool no_dfa = false;
52 int i;
53 static struct dfa* dfaregs[2] = { NULL, NULL };
54 static bool nul_warned = false;
55
56 if (do_lint && ! nul_warned && memchr(s, '\0', len) != NULL) {
57 nul_warned = true;
58 lintwarn(_("behavior of matching a regexp containing NUL characters is not defined by POSIX"));
59 }
60
61 /*
62 * The number of bytes in the current multibyte character.
63 * It is 0, when the current character is a singlebyte character.
64 */
65 size_t is_multibyte = 0;
66 mbstate_t mbs;
67
68 memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize. */
69
70 if (first) {
71 /* for debugging and testing */
72 no_dfa = (getenv("GAWK_NO_DFA") != NULL);
73 /* don't set first to false here, we do it below */
74 }
75
76 /* always check */
77 check_bracket_exp((char *) s, len);
78
79 /* Handle escaped characters first. */
80
81 /*
82 * Build a copy of the string (in buf) with the
83 * escaped characters translated, and generate the regex
84 * from that.
85 */
86 if (buf == NULL) {
87 emalloc(buf, char *, len + 1, "make_regexp");
88 buflen = len;
89 } else if (len > buflen) {
90 erealloc(buf, char *, len + 1, "make_regexp");
91 buflen = len;
92 }
93 dest = buf;
94
95 while (src < end) {
96 if (gawk_mb_cur_max > 1 && ! is_multibyte) {
97 /* The previous byte is a singlebyte character, or last byte
98 of a multibyte character. We check the next character. */
99 is_multibyte = mbrlen(src, end - src, &mbs);
100 if ( is_multibyte == 1
101 || is_multibyte == (size_t) -1
102 || is_multibyte == (size_t) -2
103 || is_multibyte == 0) {
104 /* We treat it as a single-byte character. */
105 is_multibyte = 0;
106 }
107 }
108
109 const char *ok_to_escape;
110 if (do_posix)
111 ok_to_escape = "{}()|*+?.^$\\[]/-";
112 else if (do_traditional)
113 ok_to_escape = "()|*+?.^$\\[]/-";
114 else
115 ok_to_escape = "<>`'BywWsS{}()|*+?.^$\\[]/-";
116
117 /* We skip multibyte character, since it must not be a special
118 character. */
119 if ((gawk_mb_cur_max == 1 || ! is_multibyte) &&
120 (*src == '\\')) {
121 c = *++src;
122 switch (c) {
123 case '\0': /* \\ before \0, either dynamic data or real end of string */
124 if (src >= s + len)
125 *dest++ = '\\'; // at end of string, will fatal below
126 else
127 fatal(_("invalid NUL byte in dynamic regexp"));
128 break;
129 case 'a':
130 case 'b':
131 case 'f':
132 case 'n':
133 case 'r':
134 case 't':
135 case 'v':
136 case 'x':
137 case '0':
138 case '1':
139 case '2':
140 case '3':
141 case '4':
142 case '5':
143 case '6':
144 case '7':
145 c2 = parse_escape(&src);
146 if (c2 < 0)
147 cant_happen();
148 /*
149 * Unix awk treats octal (and hex?) chars
150 * literally in re's, so escape regexp
151 * metacharacters.
152 */
153 if (do_traditional
154 && ! do_posix
155 && (isdigit(c) || c == 'x')
156 && strchr("()|*+?.^$\\[]", c2) != NULL)
157 *dest++ = '\\';
158 *dest++ = (char) c2;
159 if (do_lint
160 && ! nul_warned
161 && c2 == '\0') {
162 nul_warned = true;
163 lintwarn(_("behavior of matching a regexp containing NUL characters is not defined by POSIX"));
164 }
165 break;
166 case '8':
167 case '9': /* a\9b not valid */
168 *dest++ = c;
169 src++;
170 {
171 static bool warned[2];
172
173 if (! warned[c - '8']) {
174 warning(_("regexp escape sequence `\\%c' treated as plain `%c'"), c, c);
175 warned[c - '8'] = true;
176 }
177 }
178 break;
179 case 'y': /* normally \b */
180 /* gnu regex op */
181 if (! do_traditional) {
182 *dest++ = '\\';
183 *dest++ = 'b';
184 src++;
185 break;
186 }
187 /* else, fall through */
188 default:
189 if (strchr(ok_to_escape, c) == NULL) {
190 static bool warned[256];
191
192 if (! warned[c & 0xFF]) {
193 warning(_("regexp escape sequence `\\%c' is not a known regexp operator"), c);
194 warned[c & 0xFF] = true;
195 }
196 }
197 *dest++ = '\\';
198 *dest++ = (char) c;
199 src++;
200 break;
201 } /* switch */
202 } else {
203 c = *src;
204 *dest++ = *src++; /* not '\\' */
205 }
206 if (gawk_mb_cur_max > 1 && is_multibyte)
207 is_multibyte--;
208 } /* while */
209
210 *dest = '\0';
211 len = dest - buf;
212
213 ezalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
214 rp->pat.allocated = 0; /* regex will allocate the buffer */
215 emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
216
217 /*
218 * Lo these many years ago, had I known what a P.I.T.A. IGNORECASE
219 * was going to turn out to be, I wouldn't have bothered with it.
220 *
221 * In the case where we have a multibyte character set, we have no
222 * choice but to use RE_ICASE, since the casetable is for single-byte
223 * character sets only.
224 *
225 * On the other hand, if we do have a single-byte character set,
226 * using the casetable should give a performance improvement, since
227 * it's computed only once, not each time a regex is compiled. We
228 * also think it's probably better for portability. See the
229 * discussion by the definition of casetable[] in eval.c.
230 */
231
232 ignorecase = !! ignorecase; /* force to 1 or 0 */
233 if (ignorecase) {
234 if (gawk_mb_cur_max > 1) {
235 syn |= RE_ICASE;
236 rp->pat.translate = NULL;
237 } else {
238 syn &= ~RE_ICASE;
239 rp->pat.translate = (RE_TRANSLATE_TYPE) casetable;
240 }
241 } else {
242 rp->pat.translate = NULL;
243 syn &= ~RE_ICASE;
244 }
245
246 /* initialize dfas to hold syntax */
247 if (first) {
248 first = false;
249 dfaregs[0] = dfaalloc();
250 dfaregs[1] = dfaalloc();
251 dfasyntax(dfaregs[0], & localeinfo, syn, DFA_ANCHOR);
252 dfasyntax(dfaregs[1], & localeinfo, syn | RE_ICASE, DFA_ANCHOR);
253 }
254
255 re_set_syntax(syn);
256
257 if ((rerr = re_compile_pattern(buf, len, &(rp->pat))) != NULL) {
258 refree(rp);
259 if (! canfatal) {
260 /* rerr already gettextized inside regex routines */
261 error("%s: /%.*s/", rerr, (int) len, s);
262 return NULL;
263 }
264 fatal("invalid regexp: %s: /%.*s/", rerr, (int) len, s);
265 }
266
267 /* gack. this must be done *after* re_compile_pattern */
268 rp->pat.newline_anchor = false; /* don't get \n in middle of string */
269 if (dfa && ! no_dfa) {
270 rp->dfareg = dfaalloc();
271 dfacopysyntax(rp->dfareg, dfaregs[ignorecase]);
272 dfacomp(buf, len, rp->dfareg, true);
273 } else
274 rp->dfareg = NULL;
275
276 /* Additional flags that help with RS as regexp. */
277 for (i = 0; i < len; i++) {
278 if (strchr(metas, buf[i]) != NULL) {
279 rp->has_meta = true;
280 break;
281 }
282 }
283
284 for (i = len - 1; i >= 0; i--) {
285 if (strchr("*+|?{}", buf[i]) != NULL) {
286 rp->maybe_long = true;
287 break;
288 }
289 }
290
291 return rp;
292 }
293
294 /* research --- do a regexp search. use dfa if possible */
295
296 int
research(Regexp * rp,char * str,int start,size_t len,int flags)297 research(Regexp *rp, char *str, int start,
298 size_t len, int flags)
299 {
300 const char *ret = str;
301 bool try_backref = false;
302 int need_start;
303 int no_bol;
304 int res;
305
306 need_start = ((flags & RE_NEED_START) != 0);
307 no_bol = ((flags & RE_NO_BOL) != 0);
308
309 if (no_bol)
310 rp->pat.not_bol = 1;
311
312 /*
313 * Always do dfa search if can; if it fails, then even if
314 * need_start is true, we won't bother with the regex search.
315 *
316 * The dfa matcher doesn't have a no_bol flag, so don't bother
317 * trying it in that case.
318 *
319 * 7/2008: Skip the dfa matcher if need_start. The dfa matcher
320 * has bugs in certain multibyte cases and it's too difficult
321 * to try to special case things.
322 * 7/2017: Apparently there are some cases where DFA gets
323 * stuck, even in the C locale, so we use dfa only if not need_start.
324 *
325 * Should that issue ever get resolved, note this comment:
326 *
327 * 7/2016: The dfa matcher can't handle a case where searching
328 * starts in the middle of a string, so don't bother trying it
329 * in that case.
330 * if (rp->dfa && ! no_bol && start == 0) ...
331 */
332 if (rp->dfareg != NULL && ! no_bol && ! need_start) {
333 struct dfa *superset = dfasuperset(rp->dfareg);
334 if (superset)
335 ret = dfaexec(superset, str+start, str+start+len,
336 true, NULL, NULL);
337
338 if (ret && (! need_start
339 || (! superset && dfaisfast(rp->dfareg))))
340 ret = dfaexec(rp->dfareg, str+start, str+start+len,
341 true, NULL, &try_backref);
342 }
343
344 if (ret) {
345 if ( rp->dfareg == NULL
346 || start != 0
347 || no_bol
348 || need_start
349 || try_backref) {
350 /*
351 * Passing NULL as last arg speeds up search for cases
352 * where we don't need the start/end info.
353 */
354 res = re_search(&(rp->pat), str, start+len,
355 start, len, need_start ? &(rp->regs) : NULL);
356 } else
357 res = 1;
358 } else
359 res = -1;
360
361 rp->pat.not_bol = 0;
362 return res;
363 }
364
365 /* refree --- free up the dynamic memory used by a compiled regexp */
366
367 void
refree(Regexp * rp)368 refree(Regexp *rp)
369 {
370 if (rp == NULL)
371 return;
372 rp->pat.translate = NULL;
373 regfree(& rp->pat);
374 if (rp->regs.start)
375 free(rp->regs.start);
376 if (rp->regs.end)
377 free(rp->regs.end);
378 if (rp->dfareg != NULL) {
379 dfafree(rp->dfareg);
380 free(rp->dfareg);
381 }
382 efree(rp);
383 }
384
385 /* dfaerror --- print an error message for the dfa routines */
386
387 void
dfaerror(const char * s)388 dfaerror(const char *s)
389 {
390 fatal("%s", s);
391 exit(EXIT_FATAL); /* for DJGPP */
392 }
393
394 /* re_cache_get --- populate regexp cache if empty */
395
396 static inline Regexp *
re_cache_get(NODE * t)397 re_cache_get(NODE *t)
398 {
399 if (t->re_reg[IGNORECASE] == NULL)
400 t->re_reg[IGNORECASE] = make_regexp(t->re_exp->stptr, t->re_exp->stlen, IGNORECASE, t->re_cnt, true);
401 return t->re_reg[IGNORECASE];
402 }
403
404 /* re_update --- recompile a dynamic regexp */
405
406 Regexp *
re_update(NODE * t)407 re_update(NODE *t)
408 {
409 NODE *t1;
410
411 if (t->type == Node_val && (t->flags & REGEX) != 0)
412 return re_cache_get(t->typed_re);
413
414 if ((t->re_flags & CONSTANT) != 0) {
415 /* it's a constant, so just return it as is */
416 assert(t->type == Node_regex);
417 return re_cache_get(t);
418 }
419 t1 = t->re_exp;
420 if (t->re_text != NULL) {
421 /* if contents haven't changed, just return it */
422 if (cmp_nodes(t->re_text, t1, true) == 0)
423 return re_cache_get(t);
424 /* things changed, fall through to recompile */
425 unref(t->re_text);
426 }
427 /* get fresh copy of the text of the regexp */
428 t->re_text = dupnode(t1);
429
430 /* text changed */
431
432 /* free old */
433 if (t->re_reg[0] != NULL) {
434 refree(t->re_reg[0]);
435 t->re_reg[0] = NULL;
436 }
437 if (t->re_reg[1] != NULL) {
438 refree(t->re_reg[1]);
439 t->re_reg[1] = NULL;
440 }
441 if (t->re_cnt > 0 && ++t->re_cnt > 10)
442 /*
443 * The regex appears to update frequently, so disable DFA
444 * matching (which trades off expensive upfront compilation
445 * overhead for faster subsequent matching).
446 */
447 t->re_cnt = 0;
448 if (t->re_text == NULL) {
449 /* reset regexp text if needed */
450 t1 = t->re_exp;
451 unref(t->re_text);
452 t->re_text = dupnode(t1);
453 }
454 return re_cache_get(t);
455 }
456
457 /* resetup --- choose what kind of regexps we match */
458
459 void
resetup()460 resetup()
461 {
462 // init localeinfo for dfa
463 init_localeinfo(& localeinfo);
464
465 /*
466 * Syntax bits: _that_ is yet another mind trip. Recreational drugs
467 * are helpful for recovering from the experience.
468 *
469 * Aharon Robbins <arnold@skeeve.com>
470 * Sun, 21 Oct 2007 23:55:33 +0200
471 */
472 if (do_posix)
473 syn = RE_SYNTAX_POSIX_AWK; /* strict POSIX re's */
474 else if (do_traditional)
475 syn = RE_SYNTAX_AWK; /* traditional Unix awk re's */
476 else
477 syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */
478
479 /*
480 * Interval expressions are now on by default, as POSIX is
481 * wide-spread enough that people want it. The do_intervals
482 * variable remains for use with --traditional.
483 */
484 if (do_intervals)
485 syn |= RE_INTERVALS | RE_INVALID_INTERVAL_ORD | RE_NO_BK_BRACES;
486
487 (void) re_set_syntax(syn);
488 }
489
490 /* using_utf8 --- are we using utf8 */
491
492 bool
using_utf8(void)493 using_utf8(void)
494 {
495 return localeinfo.using_utf8;
496 }
497
498 /* reisstring --- return true if the RE match is a simple string match */
499
500 int
reisstring(const char * text,size_t len,Regexp * re,const char * buf)501 reisstring(const char *text, size_t len, Regexp *re, const char *buf)
502 {
503 int res;
504 const char *matched;
505
506 /* simple checking for meta characters in re */
507 if (re->has_meta)
508 return false; /* give up early, can't be string match */
509
510 /* make accessable to gdb */
511 matched = &buf[RESTART(re, buf)];
512
513 res = (memcmp(text, matched, len) == 0);
514
515 return res;
516 }
517
518 /* reflags2str --- make a regex flags value readable */
519
520 const char *
reflags2str(int flagval)521 reflags2str(int flagval)
522 {
523 static const struct flagtab values[] = {
524 { RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" },
525 { RE_BK_PLUS_QM, "RE_BK_PLUS_QM" },
526 { RE_CHAR_CLASSES, "RE_CHAR_CLASSES" },
527 { RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" },
528 { RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" },
529 { RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" },
530 { RE_DOT_NEWLINE, "RE_DOT_NEWLINE" },
531 { RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" },
532 { RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" },
533 { RE_INTERVALS, "RE_INTERVALS" },
534 { RE_LIMITED_OPS, "RE_LIMITED_OPS" },
535 { RE_NEWLINE_ALT, "RE_NEWLINE_ALT" },
536 { RE_NO_BK_BRACES, "RE_NO_BK_BRACES" },
537 { RE_NO_BK_PARENS, "RE_NO_BK_PARENS" },
538 { RE_NO_BK_REFS, "RE_NO_BK_REFS" },
539 { RE_NO_BK_VBAR, "RE_NO_BK_VBAR" },
540 { RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" },
541 { RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" },
542 { RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" },
543 { RE_NO_GNU_OPS, "RE_NO_GNU_OPS" },
544 { RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" },
545 { RE_ICASE, "RE_ICASE" },
546 { RE_CARET_ANCHORS_HERE, "RE_CARET_ANCHORS_HERE" },
547 { RE_CONTEXT_INVALID_DUP, "RE_CONTEXT_INVALID_DUP" },
548 { RE_NO_SUB, "RE_NO_SUB" },
549 { 0, NULL },
550 };
551
552 if (flagval == RE_SYNTAX_EMACS) /* == 0 */
553 return "RE_SYNTAX_EMACS";
554
555 return genflags2str(flagval, values);
556 }
557
558 /*
559 * dfawarn() is called by the dfa routines whenever a regex is compiled
560 * must supply a dfawarn.
561 */
562
563 void
dfawarn(const char * dfa_warning)564 dfawarn(const char *dfa_warning)
565 {
566 /*
567 * This routine does nothing, since gawk does its own
568 * (better) check for bad [[:foo:]] syntax.
569 */
570 }
571
572 /* check_bracket_exp --- look for /[:space:]/ that should be /[[:space:]]/ */
573
574 static void
check_bracket_exp(char * s,size_t length)575 check_bracket_exp(char *s, size_t length)
576 {
577 static struct reclass {
578 const char *name;
579 size_t len;
580 bool warned;
581 } classes[] = {
582 /*
583 * Ordered by what we hope is frequency,
584 * since it's linear searched.
585 */
586 { "[:alpha:]", 9, false },
587 { "[:digit:]", 9, false },
588 { "[:alnum:]", 9, false },
589 { "[:upper:]", 9, false },
590 { "[:lower:]", 9, false },
591 { "[:space:]", 9, false },
592 { "[:xdigit:]", 10, false },
593 { "[:punct:]", 9, false },
594 { "[:print:]", 9, false },
595 { "[:graph:]", 9, false },
596 { "[:cntrl:]", 9, false },
597 { "[:blank:]", 9, false },
598 { NULL, 0 }
599 };
600 int i;
601 bool found = false;
602 char save;
603 char *sp, *sp2, *end;
604 int len;
605 int count = 0;
606
607 if (length == 0)
608 return;
609
610 end = s + length;
611 save = s[length];
612 s[length] = '\0';
613 sp = s;
614
615 again:
616 sp = sp2 = (char *) memchr(sp, '[', (end - sp));
617 if (sp == NULL)
618 goto done;
619
620 for (count++, sp++; *sp != '\0'; sp++) {
621 if (*sp == '[')
622 count++;
623 /*
624 * ] as first char after open [ is skipped
625 * \] is skipped
626 * [^]] is skipped
627 */
628 if (*sp == ']' && sp > sp2) {
629 if (sp[-1] != '['
630 && sp[-1] != '\\')
631 ;
632 else if ((sp - sp2) >= 2
633 && sp[-1] == '^' && sp[-2] == '[')
634 ;
635 else
636 count--;
637 }
638
639 if (count == 0) {
640 sp++; /* skip past ']' */
641 break;
642 }
643 }
644
645 if (count > 0) { /* bad regex, give up */
646 goto done;
647 }
648
649 /* sp2 has start */
650
651 for (i = 0; classes[i].name != NULL; i++) {
652 if (classes[i].warned)
653 continue;
654 len = classes[i].len;
655 if ( len == (sp - sp2)
656 && memcmp(sp2, classes[i].name, len) == 0) {
657 found = true;
658 break;
659 }
660 }
661
662 if (found && ! classes[i].warned) {
663 warning(_("regexp component `%.*s' should probably be `[%.*s]'"),
664 len, sp2, len, sp2);
665 classes[i].warned = true;
666 }
667
668 if (sp < end) {
669 found = false;
670 goto again;
671 }
672 done:
673 s[length] = save;
674 }
675