1 /* dfasearch.c - searching subroutines using dfa and regex for grep.
2 Copyright 1992, 1998, 2000, 2007, 2009-2020 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17 02110-1301, USA. */
18
19 /* Written August 1992 by Mike Haertel. */
20
21 #include <config.h>
22 #include "intprops.h"
23 #include "search.h"
24 #include "die.h"
25 #include <error.h>
26
27 struct dfa_comp
28 {
29 /* KWset compiled pattern. For Ecompile and Gcompile, we compile
30 a list of strings, at least one of which is known to occur in
31 any string matching the regexp. */
32 kwset_t kwset;
33
34 /* DFA compiled regexp. */
35 struct dfa *dfa;
36
37 /* Regex compiled regexps. */
38 struct re_pattern_buffer *patterns;
39 size_t pcount;
40 struct re_registers regs;
41
42 /* Number of compiled fixed strings known to exactly match the regexp.
43 If kwsexec returns < kwset_exact_matches, then we don't need to
44 call the regexp matcher at all. */
45 ptrdiff_t kwset_exact_matches;
46
47 bool begline;
48 };
49
50 void
dfaerror(char const * mesg)51 dfaerror (char const *mesg)
52 {
53 die (EXIT_TROUBLE, 0, "%s", mesg);
54 }
55
56 /* For now, the sole dfawarn-eliciting condition (use of a regexp
57 like '[:lower:]') is unequivocally an error, so treat it as such,
58 when possible. */
59 void
dfawarn(char const * mesg)60 dfawarn (char const *mesg)
61 {
62 if (!getenv ("POSIXLY_CORRECT"))
63 dfaerror (mesg);
64 }
65
66 /* If the DFA turns out to have some set of fixed strings one of
67 which must occur in the match, then we build a kwset matcher
68 to find those strings, and thus quickly filter out impossible
69 matches. */
70 static void
kwsmusts(struct dfa_comp * dc)71 kwsmusts (struct dfa_comp *dc)
72 {
73 struct dfamust *dm = dfamust (dc->dfa);
74 if (!dm)
75 return;
76 dc->kwset = kwsinit (false);
77 if (dm->exact)
78 {
79 /* Prepare a substring whose presence implies a match.
80 The kwset matcher will return the index of the matching
81 string that it chooses. */
82 ++dc->kwset_exact_matches;
83 ptrdiff_t old_len = strlen (dm->must);
84 ptrdiff_t new_len = old_len + dm->begline + dm->endline;
85 char *must = xmalloc (new_len);
86 char *mp = must;
87 *mp = eolbyte;
88 mp += dm->begline;
89 dc->begline |= dm->begline;
90 memcpy (mp, dm->must, old_len);
91 if (dm->endline)
92 mp[old_len] = eolbyte;
93 kwsincr (dc->kwset, must, new_len);
94 free (must);
95 }
96 else
97 {
98 /* Otherwise, filtering with this substring should help reduce the
99 search space, but we'll still have to use the regexp matcher. */
100 kwsincr (dc->kwset, dm->must, strlen (dm->must));
101 }
102 kwsprep (dc->kwset);
103 dfamustfree (dm);
104 }
105
106 /* Return true if KEYS, of length LEN, might contain a back-reference.
107 Return false if KEYS cannot contain a back-reference.
108 BS_SAFE is true of encodings where a backslash cannot appear as the
109 last byte of a multibyte character. */
110 static bool _GL_ATTRIBUTE_PURE
possible_backrefs_in_pattern(char const * keys,ptrdiff_t len,bool bs_safe)111 possible_backrefs_in_pattern (char const *keys, ptrdiff_t len, bool bs_safe)
112 {
113 /* Normally a backslash, but in an unsafe encoding this is a non-char
114 value so that the comparison below always fails, because if there
115 are two adjacent '\' bytes, the first might be the last byte of a
116 multibyte character. */
117 int second_backslash = bs_safe ? '\\' : CHAR_MAX + 1;
118
119 /* This code can return true even if KEYS lacks a back-reference, for
120 patterns like [\2], or for encodings where '\' appears as the last
121 byte of a multibyte character. However, false alarms should be
122 rare and do not affect correctness. */
123
124 /* Do not look for a backslash in the pattern's last byte, since it
125 can't be part of a back-reference and this streamlines the code. */
126 len--;
127
128 if (0 <= len)
129 {
130 char const *lim = keys + len;
131 for (char const *p = keys; (p = memchr (p, '\\', lim - p)); p++)
132 {
133 if ('1' <= p[1] && p[1] <= '9')
134 return true;
135 if (p[1] == second_backslash)
136 {
137 p++;
138 if (p == lim)
139 break;
140 }
141 }
142 }
143 return false;
144 }
145
146 static bool
regex_compile(struct dfa_comp * dc,char const * p,ptrdiff_t len,ptrdiff_t pcount,ptrdiff_t lineno,bool syntax_only)147 regex_compile (struct dfa_comp *dc, char const *p, ptrdiff_t len,
148 ptrdiff_t pcount, ptrdiff_t lineno, bool syntax_only)
149 {
150 struct re_pattern_buffer pat0;
151 struct re_pattern_buffer *pat = syntax_only ? &pat0 : &dc->patterns[pcount];
152 pat->buffer = NULL;
153 pat->allocated = 0;
154
155 /* Do not use a fastmap with -i, to work around glibc Bug#20381. */
156 pat->fastmap = syntax_only | match_icase ? NULL : xmalloc (UCHAR_MAX + 1);
157
158 pat->translate = NULL;
159
160 char const *err = re_compile_pattern (p, len, pat);
161 if (!err)
162 return true;
163
164 /* Emit a filename:lineno: prefix for patterns taken from files. */
165 size_t pat_lineno = lineno;
166 char const *pat_filename
167 = lineno < 0 ? "\0" : pattern_file_name (lineno + 1, &pat_lineno);
168
169 if (*pat_filename == '\0')
170 error (0, 0, "%s", err);
171 else
172 error (0, 0, "%s:%zu: %s", pat_filename, pat_lineno, err);
173
174 return false;
175 }
176
177 void *
GEAcompile(char * pattern,size_t size,reg_syntax_t syntax_bits)178 GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits)
179 {
180 char *motif;
181 struct dfa_comp *dc = xcalloc (1, sizeof (*dc));
182
183 dc->dfa = dfaalloc ();
184
185 if (match_icase)
186 syntax_bits |= RE_ICASE;
187 re_set_syntax (syntax_bits);
188 int dfaopts = eolbyte ? 0 : DFA_EOL_NUL;
189 dfasyntax (dc->dfa, &localeinfo, syntax_bits, dfaopts);
190 bool bs_safe = !localeinfo.multibyte | localeinfo.using_utf8;
191
192 /* For GNU regex, pass the patterns separately to detect errors like
193 "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and
194 this should be a syntax error. The same for backref, where the
195 backref should be local to each pattern. */
196 char const *p = pattern;
197 char const *patlim = pattern + size;
198 bool compilation_failed = false;
199
200 dc->patterns = xmalloc (sizeof *dc->patterns);
201 dc->patterns++;
202 dc->pcount = 0;
203 size_t palloc = 1;
204
205 char const *prev = pattern;
206
207 /* Buffer containing back-reference-free patterns. */
208 char *buf = NULL;
209 ptrdiff_t buflen = 0;
210 size_t bufalloc = 0;
211
212 ptrdiff_t lineno = 0;
213
214 do
215 {
216 size_t len;
217 char const *sep = memchr (p, '\n', patlim - p);
218 if (sep)
219 {
220 len = sep - p;
221 sep++;
222 }
223 else
224 len = patlim - p;
225
226 bool backref = possible_backrefs_in_pattern (p, len, bs_safe);
227
228 if (backref && prev < p)
229 {
230 ptrdiff_t prevlen = p - prev;
231 while (bufalloc < buflen + prevlen)
232 buf = x2realloc (buf, &bufalloc);
233 memcpy (buf + buflen, prev, prevlen);
234 buflen += prevlen;
235 }
236
237 /* Ensure room for at least two more patterns. The extra one is
238 for the regex_compile that may be executed after this loop
239 exits, and its (unused) slot is patterns[-1] until then. */
240 while (palloc <= dc->pcount + 1)
241 {
242 dc->patterns = x2nrealloc (dc->patterns - 1, &palloc,
243 sizeof *dc->patterns);
244 dc->patterns++;
245 }
246
247 if (!regex_compile (dc, p, len, dc->pcount, lineno, !backref))
248 compilation_failed = true;
249
250 p = sep;
251 lineno++;
252
253 if (backref)
254 {
255 dc->pcount++;
256 prev = p;
257 }
258 }
259 while (p);
260
261 if (compilation_failed)
262 exit (EXIT_TROUBLE);
263
264 if (prev != NULL)
265 {
266 if (pattern < prev)
267 {
268 ptrdiff_t prevlen = patlim - prev;
269 buf = xrealloc (buf, buflen + prevlen);
270 memcpy (buf + buflen, prev, prevlen);
271 buflen += prevlen;
272 }
273 else
274 {
275 buf = pattern;
276 buflen = size;
277 }
278 }
279
280 if (buf != NULL)
281 {
282 dc->patterns--;
283 dc->pcount++;
284
285 if (!regex_compile (dc, buf, buflen, 0, -1, false))
286 abort ();
287
288 if (buf != pattern)
289 free (buf);
290 }
291
292 /* In the match_words and match_lines cases, we use a different pattern
293 for the DFA matcher that will quickly throw out cases that won't work.
294 Then if DFA succeeds we do some hairy stuff using the regex matcher
295 to decide whether the match should really count. */
296 if (match_words || match_lines)
297 {
298 static char const line_beg_no_bk[] = "^(";
299 static char const line_end_no_bk[] = ")$";
300 static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])(";
301 static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)";
302 static char const line_beg_bk[] = "^\\(";
303 static char const line_end_bk[] = "\\)$";
304 static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
305 static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)";
306 int bk = !(syntax_bits & RE_NO_BK_PARENS);
307 char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk);
308
309 strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk)
310 : (bk ? word_beg_bk : word_beg_no_bk));
311 size_t total = strlen (n);
312 memcpy (n + total, pattern, size);
313 total += size;
314 strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk)
315 : (bk ? word_end_bk : word_end_no_bk));
316 total += strlen (n + total);
317 pattern = motif = n;
318 size = total;
319 }
320 else
321 motif = NULL;
322
323 dfaparse (pattern, size, dc->dfa);
324 kwsmusts (dc);
325 dfacomp (NULL, 0, dc->dfa, 1);
326
327 free (motif);
328
329 return dc;
330 }
331
332 size_t
EGexecute(void * vdc,char const * buf,size_t size,size_t * match_size,char const * start_ptr)333 EGexecute (void *vdc, char const *buf, size_t size, size_t *match_size,
334 char const *start_ptr)
335 {
336 char const *buflim, *beg, *end, *ptr, *match, *best_match, *mb_start;
337 char eol = eolbyte;
338 regoff_t start;
339 size_t len, best_len;
340 struct kwsmatch kwsm;
341 size_t i;
342 struct dfa_comp *dc = vdc;
343 struct dfa *superset = dfasuperset (dc->dfa);
344 bool dfafast = dfaisfast (dc->dfa);
345
346 mb_start = buf;
347 buflim = buf + size;
348
349 for (beg = end = buf; end < buflim; beg = end)
350 {
351 end = buflim;
352
353 if (!start_ptr)
354 {
355 char const *next_beg, *dfa_beg = beg;
356 ptrdiff_t count = 0;
357 bool exact_kwset_match = false;
358 bool backref = false;
359
360 /* Try matching with KWset, if it's defined. */
361 if (dc->kwset)
362 {
363 char const *prev_beg;
364
365 /* Find a possible match using the KWset matcher. */
366 ptrdiff_t offset = kwsexec (dc->kwset, beg - dc->begline,
367 buflim - beg + dc->begline,
368 &kwsm, true);
369 if (offset < 0)
370 goto failure;
371 match = beg + offset;
372 prev_beg = beg;
373
374 /* Narrow down to the line containing the possible match. */
375 beg = memrchr (buf, eol, match - buf);
376 beg = beg ? beg + 1 : buf;
377 dfa_beg = beg;
378
379 /* Determine the end pointer to give the DFA next. Typically
380 this is after the first newline after MATCH; but if the KWset
381 match is not exact, the DFA is fast, and the offset from
382 PREV_BEG is less than 64 or (MATCH - PREV_BEG), this is the
383 greater of the latter two values; this temporarily prefers
384 the DFA to KWset. */
385 exact_kwset_match = kwsm.index < dc->kwset_exact_matches;
386 end = ((exact_kwset_match || !dfafast
387 || MAX (16, match - beg) < (match - prev_beg) >> 2)
388 ? match
389 : MAX (16, match - beg) < (buflim - prev_beg) >> 2
390 ? prev_beg + 4 * MAX (16, match - beg)
391 : buflim);
392 end = memchr (end, eol, buflim - end);
393 end = end ? end + 1 : buflim;
394
395 if (exact_kwset_match)
396 {
397 if (!localeinfo.multibyte | localeinfo.using_utf8)
398 goto success;
399 if (mb_start < beg)
400 mb_start = beg;
401 if (mb_goback (&mb_start, NULL, match, buflim) == 0)
402 goto success;
403 /* The matched line starts in the middle of a multibyte
404 character. Perform the DFA search starting from the
405 beginning of the next character. */
406 dfa_beg = mb_start;
407 }
408 }
409
410 /* Try matching with the superset of DFA, if it's defined. */
411 if (superset && !exact_kwset_match)
412 {
413 /* Keep using the superset while it reports multiline
414 potential matches; this is more likely to be fast
415 than falling back to KWset would be. */
416 next_beg = dfaexec (superset, dfa_beg, (char *) end, 0,
417 &count, NULL);
418 if (next_beg == NULL || next_beg == end)
419 continue;
420
421 /* Narrow down to the line we've found. */
422 if (count != 0)
423 {
424 beg = memrchr (buf, eol, next_beg - buf);
425 beg++;
426 dfa_beg = beg;
427 }
428 end = memchr (next_beg, eol, buflim - next_beg);
429 end = end ? end + 1 : buflim;
430
431 count = 0;
432 }
433
434 /* Try matching with DFA. */
435 next_beg = dfaexec (dc->dfa, dfa_beg, (char *) end, 0, &count,
436 &backref);
437
438 /* If there's no match, or if we've matched the sentinel,
439 we're done. */
440 if (next_beg == NULL || next_beg == end)
441 continue;
442
443 /* Narrow down to the line we've found. */
444 if (count != 0)
445 {
446 beg = memrchr (buf, eol, next_beg - buf);
447 beg++;
448 }
449 end = memchr (next_beg, eol, buflim - next_beg);
450 end = end ? end + 1 : buflim;
451
452 /* Successful, no back-references encountered! */
453 if (!backref)
454 goto success;
455 ptr = beg;
456 }
457 else
458 {
459 /* We are looking for the leftmost (then longest) exact match.
460 We will go through the outer loop only once. */
461 ptr = start_ptr;
462 }
463
464 /* If the "line" is longer than the maximum regexp offset,
465 die as if we've run out of memory. */
466 if (TYPE_MAXIMUM (regoff_t) < end - beg - 1)
467 xalloc_die ();
468
469 /* Run the possible match through Regex. */
470 best_match = end;
471 best_len = 0;
472 for (i = 0; i < dc->pcount; i++)
473 {
474 dc->patterns[i].not_eol = 0;
475 dc->patterns[i].newline_anchor = eolbyte == '\n';
476 start = re_search (&dc->patterns[i], beg, end - beg - 1,
477 ptr - beg, end - ptr - 1, &dc->regs);
478 if (start < -1)
479 xalloc_die ();
480 else if (0 <= start)
481 {
482 len = dc->regs.end[0] - start;
483 match = beg + start;
484 if (match > best_match)
485 continue;
486 if (start_ptr && !match_words)
487 goto assess_pattern_match;
488 if ((!match_lines && !match_words)
489 || (match_lines && len == end - ptr - 1))
490 {
491 match = ptr;
492 len = end - ptr;
493 goto assess_pattern_match;
494 }
495 /* If -w and not -x, check whether the match aligns with
496 word boundaries. Do this iteratively because:
497 (a) the line may contain more than one occurrence of the
498 pattern, and
499 (b) Several alternatives in the pattern might be valid at a
500 given point, and we may need to consider a shorter one to
501 find a word boundary. */
502 if (!match_lines && match_words)
503 while (match <= best_match)
504 {
505 regoff_t shorter_len = 0;
506 if (! wordchar_next (match + len, end - 1)
507 && ! wordchar_prev (beg, match, end - 1))
508 goto assess_pattern_match;
509 if (len > 0)
510 {
511 /* Try a shorter length anchored at the same place. */
512 --len;
513 dc->patterns[i].not_eol = 1;
514 shorter_len = re_match (&dc->patterns[i], beg,
515 match + len - ptr, match - beg,
516 &dc->regs);
517 if (shorter_len < -1)
518 xalloc_die ();
519 }
520 if (0 < shorter_len)
521 len = shorter_len;
522 else
523 {
524 /* Try looking further on. */
525 if (match == end - 1)
526 break;
527 match++;
528 dc->patterns[i].not_eol = 0;
529 start = re_search (&dc->patterns[i], beg, end - beg - 1,
530 match - beg, end - match - 1,
531 &dc->regs);
532 if (start < 0)
533 {
534 if (start < -1)
535 xalloc_die ();
536 break;
537 }
538 len = dc->regs.end[0] - start;
539 match = beg + start;
540 }
541 } /* while (match <= best_match) */
542 continue;
543 assess_pattern_match:
544 if (!start_ptr)
545 {
546 /* Good enough for a non-exact match.
547 No need to look at further patterns, if any. */
548 goto success;
549 }
550 if (match < best_match || (match == best_match && len > best_len))
551 {
552 /* Best exact match: leftmost, then longest. */
553 best_match = match;
554 best_len = len;
555 }
556 } /* if re_search >= 0 */
557 } /* for Regex patterns. */
558 if (best_match < end)
559 {
560 /* We have found an exact match. We were just
561 waiting for the best one (leftmost then longest). */
562 beg = best_match;
563 len = best_len;
564 goto success_in_len;
565 }
566 } /* for (beg = end ..) */
567
568 failure:
569 return -1;
570
571 success:
572 len = end - beg;
573 success_in_len:;
574 size_t off = beg - buf;
575 *match_size = len;
576 return off;
577 }
578