xref: /dragonfly/contrib/grep/src/dfasearch.c (revision 82730a9c)
1 /* dfasearch.c - searching subroutines using dfa and regex for grep.
2    Copyright 1992, 1998, 2000, 2007, 2009-2012 Free Software Foundation, Inc.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; either version 3, or (at your option)
7    any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program; if not, write to the Free Software
16    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17    02110-1301, USA.  */
18 
19 /* Written August 1992 by Mike Haertel. */
20 
21 #include <config.h>
22 #include "intprops.h"
23 #include "search.h"
24 #include "dfa.h"
25 
26 /* For -w, we also consider _ to be word constituent.  */
27 #define WCHAR(C) (isalnum (C) || (C) == '_')
28 
29 /* KWset compiled pattern.  For Ecompile and Gcompile, we compile
30    a list of strings, at least one of which is known to occur in
31    any string matching the regexp. */
32 static kwset_t kwset;
33 
34 /* DFA compiled regexp. */
35 static struct dfa *dfa;
36 
37 /* The Regex compiled patterns.  */
38 static struct patterns
39 {
40   /* Regex compiled regexp. */
41   struct re_pattern_buffer regexbuf;
42   struct re_registers regs; /* This is here on account of a BRAIN-DEAD
43                                Q@#%!# library interface in regex.c.  */
44 } patterns0;
45 
46 static struct patterns *patterns;
47 static size_t pcount;
48 
49 void
50 dfaerror (char const *mesg)
51 {
52   error (EXIT_TROUBLE, 0, "%s", mesg);
53 
54   /* notreached */
55   /* Tell static analyzers that this function does not return.  */
56   abort ();
57 }
58 
59 /* For now, the sole dfawarn-eliciting condition (use of a regexp
60    like '[:lower:]') is unequivocally an error, so treat it as such,
61    when possible.  */
62 void
63 dfawarn (char const *mesg)
64 {
65   static enum { DW_NONE = 0, DW_POSIX, DW_GNU } mode;
66   if (mode == DW_NONE)
67     mode = (getenv ("POSIXLY_CORRECT") ? DW_POSIX : DW_GNU);
68   if (mode == DW_GNU)
69     dfaerror (mesg);
70 }
71 
72 /* Number of compiled fixed strings known to exactly match the regexp.
73    If kwsexec returns < kwset_exact_matches, then we don't need to
74    call the regexp matcher at all. */
75 static size_t kwset_exact_matches;
76 
77 static char const *
78 kwsincr_case (const char *must)
79 {
80   size_t n = strlen (must);
81   mb_len_map_t *map = NULL;
82   const char *buf = (match_icase && MB_CUR_MAX > 1
83                      ? mbtolower (must, &n, &map)
84                      : must);
85   return kwsincr (kwset, buf, n);
86 }
87 
88 /* If the DFA turns out to have some set of fixed strings one of
89    which must occur in the match, then we build a kwset matcher
90    to find those strings, and thus quickly filter out impossible
91    matches. */
92 static void
93 kwsmusts (void)
94 {
95   struct dfamust const *dm;
96   char const *err;
97 
98   dm = dfamusts (dfa);
99   if (dm)
100     {
101       kwsinit (&kwset);
102       /* First, we compile in the substrings known to be exact
103          matches.  The kwset matcher will return the index
104          of the matching string that it chooses. */
105       for (; dm; dm = dm->next)
106         {
107           if (!dm->exact)
108             continue;
109           ++kwset_exact_matches;
110           if ((err = kwsincr_case (dm->must)) != NULL)
111             error (EXIT_TROUBLE, 0, "%s", err);
112         }
113       /* Now, we compile the substrings that will require
114          the use of the regexp matcher.  */
115       for (dm = dfamusts (dfa); dm; dm = dm->next)
116         {
117           if (dm->exact)
118             continue;
119           if ((err = kwsincr_case (dm->must)) != NULL)
120             error (EXIT_TROUBLE, 0, "%s", err);
121         }
122       if ((err = kwsprep (kwset)) != NULL)
123         error (EXIT_TROUBLE, 0, "%s", err);
124     }
125 }
126 
127 void
128 GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
129 {
130   const char *err;
131   const char *p, *sep;
132   size_t total = size;
133   char *motif;
134 
135   if (match_icase)
136     syntax_bits |= RE_ICASE;
137   re_set_syntax (syntax_bits);
138   dfasyntax (syntax_bits, match_icase, eolbyte);
139 
140   /* For GNU regex compiler we have to pass the patterns separately to detect
141      errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
142      GNU regex should have raise a syntax error.  The same for backref, where
143      the backref should have been local to each pattern.  */
144   p = pattern;
145   do
146     {
147       size_t len;
148       sep = memchr (p, '\n', total);
149       if (sep)
150         {
151           len = sep - p;
152           sep++;
153           total -= (len + 1);
154         }
155       else
156         {
157           len = total;
158           total = 0;
159         }
160 
161       patterns = xnrealloc (patterns, pcount + 1, sizeof *patterns);
162       patterns[pcount] = patterns0;
163 
164       if ((err = re_compile_pattern (p, len,
165                                     &(patterns[pcount].regexbuf))) != NULL)
166         error (EXIT_TROUBLE, 0, "%s", err);
167       pcount++;
168 
169       p = sep;
170     } while (sep && total != 0);
171 
172   /* In the match_words and match_lines cases, we use a different pattern
173      for the DFA matcher that will quickly throw out cases that won't work.
174      Then if DFA succeeds we do some hairy stuff using the regex matcher
175      to decide whether the match should really count. */
176   if (match_words || match_lines)
177     {
178       static char const line_beg_no_bk[] = "^(";
179       static char const line_end_no_bk[] = ")$";
180       static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])(";
181       static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)";
182       static char const line_beg_bk[] = "^\\(";
183       static char const line_end_bk[] = "\\)$";
184       static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
185       static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)";
186       int bk = !(syntax_bits & RE_NO_BK_PARENS);
187       char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk);
188 
189       strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk)
190                              : (bk ? word_beg_bk : word_beg_no_bk));
191       total = strlen(n);
192       memcpy (n + total, pattern, size);
193       total += size;
194       strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk)
195                                      : (bk ? word_end_bk : word_end_no_bk));
196       total += strlen (n + total);
197       pattern = motif = n;
198       size = total;
199     }
200   else
201     motif = NULL;
202 
203   dfa = dfaalloc ();
204   dfacomp (pattern, size, dfa, 1);
205   kwsmusts ();
206 
207   free(motif);
208 }
209 
210 size_t
211 EGexecute (char const *buf, size_t size, size_t *match_size,
212            char const *start_ptr)
213 {
214   char const *buflim, *beg, *end, *match, *best_match, *mb_start;
215   char eol = eolbyte;
216   int backref;
217   regoff_t start;
218   size_t len, best_len;
219   struct kwsmatch kwsm;
220   size_t i, ret_val;
221   mb_len_map_t *map = NULL;
222 
223   if (MB_CUR_MAX > 1)
224     {
225       if (match_icase)
226         {
227           /* mbtolower adds a NUL byte at the end.  That will provide
228              space for the sentinel byte dfaexec may add.  */
229           char *case_buf = mbtolower (buf, &size, &map);
230           if (start_ptr)
231             start_ptr = case_buf + (start_ptr - buf);
232           buf = case_buf;
233         }
234     }
235 
236   mb_start = buf;
237   buflim = buf + size;
238 
239   for (beg = end = buf; end < buflim; beg = end)
240     {
241       if (!start_ptr)
242         {
243           /* We don't care about an exact match.  */
244           if (kwset)
245             {
246               /* Find a possible match using the KWset matcher. */
247               size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
248               if (offset == (size_t) -1)
249                 goto failure;
250               beg += offset;
251               /* Narrow down to the line containing the candidate, and
252                  run it through DFA. */
253               if ((end = memchr(beg, eol, buflim - beg)) != NULL)
254                 end++;
255               else
256                 end = buflim;
257               match = beg;
258               while (beg > buf && beg[-1] != eol)
259                 --beg;
260               if (kwsm.index < kwset_exact_matches)
261                 {
262                   if (!MBS_SUPPORT)
263                     goto success;
264 
265                   if (mb_start < beg)
266                     mb_start = beg;
267                   if (MB_CUR_MAX == 1
268                       || !is_mb_middle (&mb_start, match, buflim,
269                                         kwsm.size[0]))
270                     goto success;
271                 }
272               if (dfaexec (dfa, beg, (char *) end, 0, NULL, &backref) == NULL)
273                 continue;
274             }
275           else
276             {
277               /* No good fixed strings; start with DFA. */
278               char const *next_beg = dfaexec (dfa, beg, (char *) buflim,
279                                               0, NULL, &backref);
280               /* If there's no match, or if we've matched the sentinel,
281                  we're done.  */
282               if (next_beg == NULL || next_beg == buflim)
283                 break;
284               /* Narrow down to the line we've found. */
285               beg = next_beg;
286               if ((end = memchr(beg, eol, buflim - beg)) != NULL)
287                 end++;
288               else
289                 end = buflim;
290               while (beg > buf && beg[-1] != eol)
291                 --beg;
292             }
293           /* Successful, no backreferences encountered! */
294           if (!backref)
295             goto success;
296         }
297       else
298         {
299           /* We are looking for the leftmost (then longest) exact match.
300              We will go through the outer loop only once.  */
301           beg = start_ptr;
302           end = buflim;
303         }
304 
305       /* If the "line" is longer than the maximum regexp offset,
306          die as if we've run out of memory.  */
307       if (TYPE_MAXIMUM (regoff_t) < end - buf - 1)
308         xalloc_die ();
309 
310       /* If we've made it to this point, this means DFA has seen
311          a probable match, and we need to run it through Regex. */
312       best_match = end;
313       best_len = 0;
314       for (i = 0; i < pcount; i++)
315         {
316           patterns[i].regexbuf.not_eol = 0;
317           start = re_search (&(patterns[i].regexbuf),
318                              buf, end - buf - 1,
319                              beg - buf, end - beg - 1,
320                              &(patterns[i].regs));
321           if (start < -1)
322             xalloc_die ();
323           else if (0 <= start)
324             {
325               len = patterns[i].regs.end[0] - start;
326               match = buf + start;
327               if (match > best_match)
328                 continue;
329               if (start_ptr && !match_words)
330                 goto assess_pattern_match;
331               if ((!match_lines && !match_words)
332                   || (match_lines && len == end - beg - 1))
333                 {
334                   match = beg;
335                   len = end - beg;
336                   goto assess_pattern_match;
337                 }
338               /* If -w, check if the match aligns with word boundaries.
339                  We do this iteratively because:
340                  (a) the line may contain more than one occurrence of the
341                  pattern, and
342                  (b) Several alternatives in the pattern might be valid at a
343                  given point, and we may need to consider a shorter one to
344                  find a word boundary.  */
345               if (match_words)
346                 while (match <= best_match)
347                   {
348                     regoff_t shorter_len = 0;
349                     if ((match == buf || !WCHAR ((unsigned char) match[-1]))
350                         && (start + len == end - buf - 1
351                             || !WCHAR ((unsigned char) match[len])))
352                       goto assess_pattern_match;
353                     if (len > 0)
354                       {
355                         /* Try a shorter length anchored at the same place. */
356                         --len;
357                         patterns[i].regexbuf.not_eol = 1;
358                         shorter_len = re_match (&(patterns[i].regexbuf),
359                                                 buf, match + len - beg,
360                                                 match - buf,
361                                                 &(patterns[i].regs));
362                         if (shorter_len < -1)
363                           xalloc_die ();
364                       }
365                     if (0 < shorter_len)
366                       len = shorter_len;
367                     else
368                       {
369                         /* Try looking further on. */
370                         if (match == end - 1)
371                           break;
372                         match++;
373                         patterns[i].regexbuf.not_eol = 0;
374                         start = re_search (&(patterns[i].regexbuf),
375                                            buf, end - buf - 1,
376                                            match - buf, end - match - 1,
377                                            &(patterns[i].regs));
378                         if (start < 0)
379                           {
380                             if (start < -1)
381                               xalloc_die ();
382                             break;
383                           }
384                         len = patterns[i].regs.end[0] - start;
385                         match = buf + start;
386                       }
387                   } /* while (match <= best_match) */
388               continue;
389             assess_pattern_match:
390               if (!start_ptr)
391                 {
392                   /* Good enough for a non-exact match.
393                      No need to look at further patterns, if any.  */
394                   goto success;
395                 }
396               if (match < best_match || (match == best_match && len > best_len))
397                 {
398                   /* Best exact match:  leftmost, then longest.  */
399                   best_match = match;
400                   best_len = len;
401                 }
402             } /* if re_search >= 0 */
403         } /* for Regex patterns.  */
404         if (best_match < end)
405           {
406             /* We have found an exact match.  We were just
407                waiting for the best one (leftmost then longest).  */
408             beg = best_match;
409             len = best_len;
410             goto success_in_len;
411           }
412     } /* for (beg = end ..) */
413 
414  failure:
415   ret_val = -1;
416   goto out;
417 
418  success:
419   len = end - beg;
420  success_in_len:;
421   size_t off = beg - buf;
422   mb_case_map_apply (map, &off, &len);
423   *match_size = len;
424   ret_val = off;
425  out:
426   return ret_val;
427 }
428