xref: /dragonfly/contrib/grep/src/dfasearch.c (revision f7df6c8e)
1 /* dfasearch.c - searching subroutines using dfa and regex for grep.
2    Copyright 1992, 1998, 2000, 2007, 2009-2014 Free Software Foundation, Inc.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; either version 3, or (at your option)
7    any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program; if not, write to the Free Software
16    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17    02110-1301, USA.  */
18 
19 /* Written August 1992 by Mike Haertel. */
20 
21 #include <config.h>
22 #include "intprops.h"
23 #include "search.h"
24 
25 /* Whether -w considers WC to be a word constituent.  */
26 static bool
27 wordchar (wint_t wc)
28 {
29   return wc == L'_' || iswalnum (wc);
30 }
31 
32 /* KWset compiled pattern.  For Ecompile and Gcompile, we compile
33    a list of strings, at least one of which is known to occur in
34    any string matching the regexp. */
35 static kwset_t kwset;
36 
37 /* DFA compiled regexp. */
38 static struct dfa *dfa;
39 
40 /* The Regex compiled patterns.  */
41 static struct patterns
42 {
43   /* Regex compiled regexp. */
44   struct re_pattern_buffer regexbuf;
45   struct re_registers regs; /* This is here on account of a BRAIN-DEAD
46                                Q@#%!# library interface in regex.c.  */
47 } patterns0;
48 
49 static struct patterns *patterns;
50 static size_t pcount;
51 
52 /* Number of compiled fixed strings known to exactly match the regexp.
53    If kwsexec returns < kwset_exact_matches, then we don't need to
54    call the regexp matcher at all. */
55 static size_t kwset_exact_matches;
56 
57 static bool begline;
58 
59 void
60 dfaerror (char const *mesg)
61 {
62   error (EXIT_TROUBLE, 0, "%s", mesg);
63 
64   /* notreached */
65   /* Tell static analyzers that this function does not return.  */
66   abort ();
67 }
68 
69 /* For now, the sole dfawarn-eliciting condition (use of a regexp
70    like '[:lower:]') is unequivocally an error, so treat it as such,
71    when possible.  */
72 void
73 dfawarn (char const *mesg)
74 {
75   static enum { DW_NONE = 0, DW_POSIX, DW_GNU } mode;
76   if (mode == DW_NONE)
77     mode = (getenv ("POSIXLY_CORRECT") ? DW_POSIX : DW_GNU);
78   if (mode == DW_GNU)
79     dfaerror (mesg);
80 }
81 
82 /* If the DFA turns out to have some set of fixed strings one of
83    which must occur in the match, then we build a kwset matcher
84    to find those strings, and thus quickly filter out impossible
85    matches. */
86 static void
87 kwsmusts (void)
88 {
89   struct dfamust const *dm = dfamusts (dfa);
90   if (dm)
91     {
92       kwsinit (&kwset);
93       /* First, we compile in the substrings known to be exact
94          matches.  The kwset matcher will return the index
95          of the matching string that it chooses. */
96       for (; dm; dm = dm->next)
97         {
98           if (!dm->exact)
99             continue;
100           ++kwset_exact_matches;
101           size_t old_len = strlen (dm->must);
102           size_t new_len = old_len + dm->begline + dm->endline;
103           char *must = xmalloc (new_len);
104           char *mp = must;
105           *mp = eolbyte;
106           mp += dm->begline;
107           begline |= dm->begline;
108           memcpy (mp, dm->must, old_len);
109           if (dm->endline)
110             mp[old_len] = eolbyte;
111           kwsincr (kwset, must, new_len);
112           free (must);
113         }
114       /* Now, we compile the substrings that will require
115          the use of the regexp matcher.  */
116       for (dm = dfamusts (dfa); dm; dm = dm->next)
117         {
118           if (dm->exact)
119             continue;
120           kwsincr (kwset, dm->must, strlen (dm->must));
121         }
122       kwsprep (kwset);
123     }
124 }
125 
126 void
127 GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
128 {
129   size_t total = size;
130   char *motif;
131 
132   if (match_icase)
133     syntax_bits |= RE_ICASE;
134   re_set_syntax (syntax_bits);
135   dfasyntax (syntax_bits, match_icase, eolbyte);
136 
137   /* For GNU regex, pass the patterns separately to detect errors like
138      "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and
139      this should be a syntax error.  The same for backref, where the
140      backref should be local to each pattern.  */
141   char const *p = pattern;
142   do
143     {
144       size_t len;
145       char const *sep = memchr (p, '\n', total);
146       if (sep)
147         {
148           len = sep - p;
149           sep++;
150           total -= (len + 1);
151         }
152       else
153         {
154           len = total;
155           total = 0;
156         }
157 
158       patterns = xnrealloc (patterns, pcount + 1, sizeof *patterns);
159       patterns[pcount] = patterns0;
160 
161       char const *err = re_compile_pattern (p, len,
162                                             &(patterns[pcount].regexbuf));
163       if (err)
164         error (EXIT_TROUBLE, 0, "%s", err);
165       pcount++;
166       p = sep;
167     }
168   while (p);
169 
170   /* In the match_words and match_lines cases, we use a different pattern
171      for the DFA matcher that will quickly throw out cases that won't work.
172      Then if DFA succeeds we do some hairy stuff using the regex matcher
173      to decide whether the match should really count. */
174   if (match_words || match_lines)
175     {
176       static char const line_beg_no_bk[] = "^(";
177       static char const line_end_no_bk[] = ")$";
178       static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])(";
179       static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)";
180       static char const line_beg_bk[] = "^\\(";
181       static char const line_end_bk[] = "\\)$";
182       static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
183       static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)";
184       int bk = !(syntax_bits & RE_NO_BK_PARENS);
185       char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk);
186 
187       strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk)
188                              : (bk ? word_beg_bk : word_beg_no_bk));
189       total = strlen(n);
190       memcpy (n + total, pattern, size);
191       total += size;
192       strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk)
193                                      : (bk ? word_end_bk : word_end_no_bk));
194       total += strlen (n + total);
195       pattern = motif = n;
196       size = total;
197     }
198   else
199     motif = NULL;
200 
201   dfa = dfaalloc ();
202   dfacomp (pattern, size, dfa, 1);
203   kwsmusts ();
204 
205   free(motif);
206 }
207 
208 size_t
209 EGexecute (char const *buf, size_t size, size_t *match_size,
210            char const *start_ptr)
211 {
212   char const *buflim, *beg, *end, *ptr, *match, *best_match, *mb_start;
213   char eol = eolbyte;
214   int backref;
215   regoff_t start;
216   size_t len, best_len;
217   struct kwsmatch kwsm;
218   size_t i;
219   struct dfa *superset = dfasuperset (dfa);
220   bool dfafast = dfaisfast (dfa);
221 
222   mb_start = buf;
223   buflim = buf + size;
224 
225   for (beg = end = buf; end < buflim; beg = end)
226     {
227       end = buflim;
228 
229       if (!start_ptr)
230         {
231           char const *next_beg, *dfa_beg = beg;
232           size_t count = 0;
233           bool exact_kwset_match = false;
234 
235           /* Try matching with KWset, if it's defined.  */
236           if (kwset)
237             {
238               char const *prev_beg;
239 
240               /* Find a possible match using the KWset matcher.  */
241               size_t offset = kwsexec (kwset, beg - begline,
242                                        buflim - beg + begline, &kwsm);
243               if (offset == (size_t) -1)
244                 goto failure;
245               match = beg + offset;
246               prev_beg = beg;
247 
248               /* Narrow down to the line containing the possible match.  */
249               beg = memrchr (buf, eol, match - buf);
250               beg = beg ? beg + 1 : buf;
251               dfa_beg = beg;
252 
253               /* Determine the end pointer to give the DFA next.  Typically
254                  this is after the first newline after MATCH; but if the KWset
255                  match is not exact, the DFA is fast, and the offset from
256                  PREV_BEG is less than 64 or (MATCH - PREV_BEG), this is the
257                  greater of the latter two values; this temporarily prefers
258                  the DFA to KWset.  */
259               exact_kwset_match = kwsm.index < kwset_exact_matches;
260               end = ((exact_kwset_match || !dfafast
261                       || MAX (16, match - beg) < (match - prev_beg) >> 2)
262                      ? match
263                      : MAX (16, match - beg) < (buflim - prev_beg) >> 2
264                      ? prev_beg + 4 * MAX (16, match - beg)
265                      : buflim);
266               end = memchr (end, eol, buflim - end);
267               end = end ? end + 1 : buflim;
268 
269               if (exact_kwset_match)
270                 {
271                   if (MB_CUR_MAX == 1 || using_utf8 ())
272                     goto success;
273                   if (mb_start < beg)
274                     mb_start = beg;
275                   if (mb_goback (&mb_start, match, buflim) == 0)
276                     goto success;
277                   /* The matched line starts in the middle of a multibyte
278                      character.  Perform the DFA search starting from the
279                      beginning of the next character.  */
280                   dfa_beg = mb_start;
281                 }
282             }
283 
284           /* Try matching with the superset of DFA, if it's defined.  */
285           if (superset && !exact_kwset_match)
286             {
287               /* Keep using the superset while it reports multiline
288                  potential matches; this is more likely to be fast
289                  than falling back to KWset would be.  */
290               while ((next_beg = dfaexec (superset, dfa_beg, (char *) end, 1,
291                                           &count, NULL))
292                      && next_beg != end
293                      && count != 0)
294                 {
295                   /* Try to match in just one line.  */
296                   count = 0;
297                   beg = memrchr (buf, eol, next_beg - buf);
298                   beg++;
299                   dfa_beg = beg;
300                 }
301               if (next_beg == NULL || next_beg == end)
302                 continue;
303 
304               /* Narrow down to the line we've found.  */
305               end = memchr (next_beg, eol, buflim - next_beg);
306               end = end ? end + 1 : buflim;
307             }
308 
309           /* Try matching with DFA.  */
310           next_beg = dfaexec (dfa, dfa_beg, (char *) end, 0, &count, &backref);
311 
312           /* If there's no match, or if we've matched the sentinel,
313              we're done.  */
314           if (next_beg == NULL || next_beg == end)
315             continue;
316 
317           /* Narrow down to the line we've found.  */
318           if (count != 0)
319             {
320               beg = memrchr (buf, eol, next_beg - buf);
321               beg++;
322             }
323           end = memchr (next_beg, eol, buflim - next_beg);
324           end = end ? end + 1 : buflim;
325 
326           /* Successful, no backreferences encountered! */
327           if (!backref)
328             goto success;
329           ptr = beg;
330         }
331       else
332         {
333           /* We are looking for the leftmost (then longest) exact match.
334              We will go through the outer loop only once.  */
335           ptr = start_ptr;
336         }
337 
338       /* If the "line" is longer than the maximum regexp offset,
339          die as if we've run out of memory.  */
340       if (TYPE_MAXIMUM (regoff_t) < end - beg - 1)
341         xalloc_die ();
342 
343       /* Run the possible match through Regex.  */
344       best_match = end;
345       best_len = 0;
346       for (i = 0; i < pcount; i++)
347         {
348           patterns[i].regexbuf.not_eol = 0;
349           start = re_search (&(patterns[i].regexbuf),
350                              beg, end - beg - 1,
351                              ptr - beg, end - ptr - 1,
352                              &(patterns[i].regs));
353           if (start < -1)
354             xalloc_die ();
355           else if (0 <= start)
356             {
357               len = patterns[i].regs.end[0] - start;
358               match = beg + start;
359               if (match > best_match)
360                 continue;
361               if (start_ptr && !match_words)
362                 goto assess_pattern_match;
363               if ((!match_lines && !match_words)
364                   || (match_lines && len == end - ptr - 1))
365                 {
366                   match = ptr;
367                   len = end - ptr;
368                   goto assess_pattern_match;
369                 }
370               /* If -w, check if the match aligns with word boundaries.
371                  We do this iteratively because:
372                  (a) the line may contain more than one occurrence of the
373                  pattern, and
374                  (b) Several alternatives in the pattern might be valid at a
375                  given point, and we may need to consider a shorter one to
376                  find a word boundary.  */
377               if (match_words)
378                 while (match <= best_match)
379                   {
380                     regoff_t shorter_len = 0;
381                     if (!wordchar (mb_prev_wc (beg, match, end - 1))
382                         && !wordchar (mb_next_wc (match + len, end - 1)))
383                       goto assess_pattern_match;
384                     if (len > 0)
385                       {
386                         /* Try a shorter length anchored at the same place. */
387                         --len;
388                         patterns[i].regexbuf.not_eol = 1;
389                         shorter_len = re_match (&(patterns[i].regexbuf),
390                                                 beg, match + len - ptr,
391                                                 match - beg,
392                                                 &(patterns[i].regs));
393                         if (shorter_len < -1)
394                           xalloc_die ();
395                       }
396                     if (0 < shorter_len)
397                       len = shorter_len;
398                     else
399                       {
400                         /* Try looking further on. */
401                         if (match == end - 1)
402                           break;
403                         match++;
404                         patterns[i].regexbuf.not_eol = 0;
405                         start = re_search (&(patterns[i].regexbuf),
406                                            beg, end - beg - 1,
407                                            match - beg, end - match - 1,
408                                            &(patterns[i].regs));
409                         if (start < 0)
410                           {
411                             if (start < -1)
412                               xalloc_die ();
413                             break;
414                           }
415                         len = patterns[i].regs.end[0] - start;
416                         match = beg + start;
417                       }
418                   } /* while (match <= best_match) */
419               continue;
420             assess_pattern_match:
421               if (!start_ptr)
422                 {
423                   /* Good enough for a non-exact match.
424                      No need to look at further patterns, if any.  */
425                   goto success;
426                 }
427               if (match < best_match || (match == best_match && len > best_len))
428                 {
429                   /* Best exact match:  leftmost, then longest.  */
430                   best_match = match;
431                   best_len = len;
432                 }
433             } /* if re_search >= 0 */
434         } /* for Regex patterns.  */
435         if (best_match < end)
436           {
437             /* We have found an exact match.  We were just
438                waiting for the best one (leftmost then longest).  */
439             beg = best_match;
440             len = best_len;
441             goto success_in_len;
442           }
443     } /* for (beg = end ..) */
444 
445  failure:
446   return -1;
447 
448  success:
449   len = end - beg;
450  success_in_len:;
451   size_t off = beg - buf;
452   *match_size = len;
453   return off;
454 }
455