xref: /dragonfly/contrib/grep/src/dfasearch.c (revision 0720b42f)
1 /* dfasearch.c - searching subroutines using dfa and regex for grep.
2    Copyright 1992, 1998, 2000, 2007, 2009-2015 Free Software Foundation, Inc.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; either version 3, or (at your option)
7    any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program; if not, write to the Free Software
16    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17    02110-1301, USA.  */
18 
19 /* Written August 1992 by Mike Haertel. */
20 
21 #include <config.h>
22 #include "intprops.h"
23 #include "search.h"
24 
25 /* Whether -w considers WC to be a word constituent.  */
26 static bool
27 wordchar (wint_t wc)
28 {
29   return wc == L'_' || iswalnum (wc);
30 }
31 
32 /* KWset compiled pattern.  For Ecompile and Gcompile, we compile
33    a list of strings, at least one of which is known to occur in
34    any string matching the regexp. */
35 static kwset_t kwset;
36 
37 /* DFA compiled regexp. */
38 static struct dfa *dfa;
39 
40 /* The Regex compiled patterns.  */
41 static struct patterns
42 {
43   /* Regex compiled regexp. */
44   struct re_pattern_buffer regexbuf;
45   struct re_registers regs; /* This is here on account of a BRAIN-DEAD
46                                Q@#%!# library interface in regex.c.  */
47 } patterns0;
48 
49 static struct patterns *patterns;
50 static size_t pcount;
51 
52 /* Number of compiled fixed strings known to exactly match the regexp.
53    If kwsexec returns < kwset_exact_matches, then we don't need to
54    call the regexp matcher at all. */
55 static size_t kwset_exact_matches;
56 
57 static bool begline;
58 
59 void
60 dfaerror (char const *mesg)
61 {
62   error (EXIT_TROUBLE, 0, "%s", mesg);
63 
64   /* notreached */
65   /* Tell static analyzers that this function does not return.  */
66   abort ();
67 }
68 
69 /* For now, the sole dfawarn-eliciting condition (use of a regexp
70    like '[:lower:]') is unequivocally an error, so treat it as such,
71    when possible.  */
72 void
73 dfawarn (char const *mesg)
74 {
75   static enum { DW_NONE = 0, DW_POSIX, DW_GNU } mode;
76   if (mode == DW_NONE)
77     mode = (getenv ("POSIXLY_CORRECT") ? DW_POSIX : DW_GNU);
78   if (mode == DW_GNU)
79     dfaerror (mesg);
80 }
81 
82 /* If the DFA turns out to have some set of fixed strings one of
83    which must occur in the match, then we build a kwset matcher
84    to find those strings, and thus quickly filter out impossible
85    matches. */
86 static void
87 kwsmusts (void)
88 {
89   struct dfamust *dm = dfamust (dfa);
90   if (!dm)
91     return;
92   kwsinit (&kwset);
93   if (dm->exact)
94     {
95       /* Prepare a substring whose presence implies a match.
96          The kwset matcher will return the index of the matching
97          string that it chooses. */
98       ++kwset_exact_matches;
99       size_t old_len = strlen (dm->must);
100       size_t new_len = old_len + dm->begline + dm->endline;
101       char *must = xmalloc (new_len);
102       char *mp = must;
103       *mp = eolbyte;
104       mp += dm->begline;
105       begline |= dm->begline;
106       memcpy (mp, dm->must, old_len);
107       if (dm->endline)
108         mp[old_len] = eolbyte;
109       kwsincr (kwset, must, new_len);
110       free (must);
111     }
112   else
113     {
114       /* Otherwise, filtering with this substring should help reduce the
115          search space, but we'll still have to use the regexp matcher.  */
116       kwsincr (kwset, dm->must, strlen (dm->must));
117     }
118   kwsprep (kwset);
119   dfamustfree (dm);
120 }
121 
122 void
123 GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
124 {
125   size_t total = size;
126   char *motif;
127 
128   if (match_icase)
129     syntax_bits |= RE_ICASE;
130   re_set_syntax (syntax_bits);
131   dfasyntax (syntax_bits, match_icase, eolbyte);
132 
133   /* For GNU regex, pass the patterns separately to detect errors like
134      "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and
135      this should be a syntax error.  The same for backref, where the
136      backref should be local to each pattern.  */
137   char const *p = pattern;
138   do
139     {
140       size_t len;
141       char const *sep = memchr (p, '\n', total);
142       if (sep)
143         {
144           len = sep - p;
145           sep++;
146           total -= (len + 1);
147         }
148       else
149         {
150           len = total;
151           total = 0;
152         }
153 
154       patterns = xnrealloc (patterns, pcount + 1, sizeof *patterns);
155       patterns[pcount] = patterns0;
156 
157       char const *err = re_compile_pattern (p, len,
158                                             &(patterns[pcount].regexbuf));
159       if (err)
160         error (EXIT_TROUBLE, 0, "%s", err);
161       pcount++;
162       p = sep;
163     }
164   while (p);
165 
166   /* In the match_words and match_lines cases, we use a different pattern
167      for the DFA matcher that will quickly throw out cases that won't work.
168      Then if DFA succeeds we do some hairy stuff using the regex matcher
169      to decide whether the match should really count. */
170   if (match_words || match_lines)
171     {
172       static char const line_beg_no_bk[] = "^(";
173       static char const line_end_no_bk[] = ")$";
174       static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])(";
175       static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)";
176       static char const line_beg_bk[] = "^\\(";
177       static char const line_end_bk[] = "\\)$";
178       static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
179       static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)";
180       int bk = !(syntax_bits & RE_NO_BK_PARENS);
181       char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk);
182 
183       strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk)
184                              : (bk ? word_beg_bk : word_beg_no_bk));
185       total = strlen(n);
186       memcpy (n + total, pattern, size);
187       total += size;
188       strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk)
189                                      : (bk ? word_end_bk : word_end_no_bk));
190       total += strlen (n + total);
191       pattern = motif = n;
192       size = total;
193     }
194   else
195     motif = NULL;
196 
197   dfa = dfaalloc ();
198   dfacomp (pattern, size, dfa, 1);
199   kwsmusts ();
200 
201   free(motif);
202 }
203 
204 size_t
205 EGexecute (char const *buf, size_t size, size_t *match_size,
206            char const *start_ptr)
207 {
208   char const *buflim, *beg, *end, *ptr, *match, *best_match, *mb_start;
209   char eol = eolbyte;
210   regoff_t start;
211   size_t len, best_len;
212   struct kwsmatch kwsm;
213   size_t i;
214   struct dfa *superset = dfasuperset (dfa);
215   bool dfafast = dfaisfast (dfa);
216 
217   mb_start = buf;
218   buflim = buf + size;
219 
220   for (beg = end = buf; end < buflim; beg = end)
221     {
222       end = buflim;
223 
224       if (!start_ptr)
225         {
226           char const *next_beg, *dfa_beg = beg;
227           size_t count = 0;
228           bool exact_kwset_match = false;
229           int backref = 0;
230 
231           /* Try matching with KWset, if it's defined.  */
232           if (kwset)
233             {
234               char const *prev_beg;
235 
236               /* Find a possible match using the KWset matcher.  */
237               size_t offset = kwsexec (kwset, beg - begline,
238                                        buflim - beg + begline, &kwsm);
239               if (offset == (size_t) -1)
240                 goto failure;
241               match = beg + offset;
242               prev_beg = beg;
243 
244               /* Narrow down to the line containing the possible match.  */
245               beg = memrchr (buf, eol, match - buf);
246               beg = beg ? beg + 1 : buf;
247               dfa_beg = beg;
248 
249               /* Determine the end pointer to give the DFA next.  Typically
250                  this is after the first newline after MATCH; but if the KWset
251                  match is not exact, the DFA is fast, and the offset from
252                  PREV_BEG is less than 64 or (MATCH - PREV_BEG), this is the
253                  greater of the latter two values; this temporarily prefers
254                  the DFA to KWset.  */
255               exact_kwset_match = kwsm.index < kwset_exact_matches;
256               end = ((exact_kwset_match || !dfafast
257                       || MAX (16, match - beg) < (match - prev_beg) >> 2)
258                      ? match
259                      : MAX (16, match - beg) < (buflim - prev_beg) >> 2
260                      ? prev_beg + 4 * MAX (16, match - beg)
261                      : buflim);
262               end = memchr (end, eol, buflim - end);
263               end = end ? end + 1 : buflim;
264 
265               if (exact_kwset_match)
266                 {
267                   if (MB_CUR_MAX == 1 || using_utf8 ())
268                     goto success;
269                   if (mb_start < beg)
270                     mb_start = beg;
271                   if (mb_goback (&mb_start, match, buflim) == 0)
272                     goto success;
273                   /* The matched line starts in the middle of a multibyte
274                      character.  Perform the DFA search starting from the
275                      beginning of the next character.  */
276                   dfa_beg = mb_start;
277                 }
278             }
279 
280           /* Try matching with the superset of DFA, if it's defined.  */
281           if (superset && !exact_kwset_match)
282             {
283               /* Keep using the superset while it reports multiline
284                  potential matches; this is more likely to be fast
285                  than falling back to KWset would be.  */
286               while ((next_beg = dfaexec (superset, dfa_beg, (char *) end, 1,
287                                           &count, NULL))
288                      && next_beg != end
289                      && count != 0)
290                 {
291                   /* Try to match in just one line.  */
292                   count = 0;
293                   beg = memrchr (buf, eol, next_beg - buf);
294                   beg++;
295                   dfa_beg = beg;
296                 }
297               if (next_beg == NULL || next_beg == end)
298                 continue;
299 
300               /* Narrow down to the line we've found.  */
301               end = memchr (next_beg, eol, buflim - next_beg);
302               end = end ? end + 1 : buflim;
303             }
304 
305           /* Try matching with DFA.  */
306           next_beg = dfaexec (dfa, dfa_beg, (char *) end, 0, &count, &backref);
307 
308           /* If there's no match, or if we've matched the sentinel,
309              we're done.  */
310           if (next_beg == NULL || next_beg == end)
311             continue;
312 
313           /* Narrow down to the line we've found.  */
314           if (count != 0)
315             {
316               beg = memrchr (buf, eol, next_beg - buf);
317               beg++;
318             }
319           end = memchr (next_beg, eol, buflim - next_beg);
320           end = end ? end + 1 : buflim;
321 
322           /* Successful, no backreferences encountered! */
323           if (!backref)
324             goto success;
325           ptr = beg;
326         }
327       else
328         {
329           /* We are looking for the leftmost (then longest) exact match.
330              We will go through the outer loop only once.  */
331           ptr = start_ptr;
332         }
333 
334       /* If the "line" is longer than the maximum regexp offset,
335          die as if we've run out of memory.  */
336       if (TYPE_MAXIMUM (regoff_t) < end - beg - 1)
337         xalloc_die ();
338 
339       /* Run the possible match through Regex.  */
340       best_match = end;
341       best_len = 0;
342       for (i = 0; i < pcount; i++)
343         {
344           patterns[i].regexbuf.not_eol = 0;
345           start = re_search (&(patterns[i].regexbuf),
346                              beg, end - beg - 1,
347                              ptr - beg, end - ptr - 1,
348                              &(patterns[i].regs));
349           if (start < -1)
350             xalloc_die ();
351           else if (0 <= start)
352             {
353               len = patterns[i].regs.end[0] - start;
354               match = beg + start;
355               if (match > best_match)
356                 continue;
357               if (start_ptr && !match_words)
358                 goto assess_pattern_match;
359               if ((!match_lines && !match_words)
360                   || (match_lines && len == end - ptr - 1))
361                 {
362                   match = ptr;
363                   len = end - ptr;
364                   goto assess_pattern_match;
365                 }
366               /* If -w, check if the match aligns with word boundaries.
367                  We do this iteratively because:
368                  (a) the line may contain more than one occurrence of the
369                  pattern, and
370                  (b) Several alternatives in the pattern might be valid at a
371                  given point, and we may need to consider a shorter one to
372                  find a word boundary.  */
373               if (match_words)
374                 while (match <= best_match)
375                   {
376                     regoff_t shorter_len = 0;
377                     if (!wordchar (mb_prev_wc (beg, match, end - 1))
378                         && !wordchar (mb_next_wc (match + len, end - 1)))
379                       goto assess_pattern_match;
380                     if (len > 0)
381                       {
382                         /* Try a shorter length anchored at the same place. */
383                         --len;
384                         patterns[i].regexbuf.not_eol = 1;
385                         shorter_len = re_match (&(patterns[i].regexbuf),
386                                                 beg, match + len - ptr,
387                                                 match - beg,
388                                                 &(patterns[i].regs));
389                         if (shorter_len < -1)
390                           xalloc_die ();
391                       }
392                     if (0 < shorter_len)
393                       len = shorter_len;
394                     else
395                       {
396                         /* Try looking further on. */
397                         if (match == end - 1)
398                           break;
399                         match++;
400                         patterns[i].regexbuf.not_eol = 0;
401                         start = re_search (&(patterns[i].regexbuf),
402                                            beg, end - beg - 1,
403                                            match - beg, end - match - 1,
404                                            &(patterns[i].regs));
405                         if (start < 0)
406                           {
407                             if (start < -1)
408                               xalloc_die ();
409                             break;
410                           }
411                         len = patterns[i].regs.end[0] - start;
412                         match = beg + start;
413                       }
414                   } /* while (match <= best_match) */
415               continue;
416             assess_pattern_match:
417               if (!start_ptr)
418                 {
419                   /* Good enough for a non-exact match.
420                      No need to look at further patterns, if any.  */
421                   goto success;
422                 }
423               if (match < best_match || (match == best_match && len > best_len))
424                 {
425                   /* Best exact match:  leftmost, then longest.  */
426                   best_match = match;
427                   best_len = len;
428                 }
429             } /* if re_search >= 0 */
430         } /* for Regex patterns.  */
431         if (best_match < end)
432           {
433             /* We have found an exact match.  We were just
434                waiting for the best one (leftmost then longest).  */
435             beg = best_match;
436             len = best_len;
437             goto success_in_len;
438           }
439     } /* for (beg = end ..) */
440 
441  failure:
442   return -1;
443 
444  success:
445   len = end - beg;
446  success_in_len:;
447   size_t off = beg - buf;
448   *match_size = len;
449   return off;
450 }
451