1 /* dfasearch.c - searching subroutines using dfa and regex for grep. 2 Copyright 1992, 1998, 2000, 2007, 2009-2015 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 3, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 17 02110-1301, USA. */ 18 19 /* Written August 1992 by Mike Haertel. */ 20 21 #include <config.h> 22 #include "intprops.h" 23 #include "search.h" 24 25 /* Whether -w considers WC to be a word constituent. */ 26 static bool 27 wordchar (wint_t wc) 28 { 29 return wc == L'_' || iswalnum (wc); 30 } 31 32 /* KWset compiled pattern. For Ecompile and Gcompile, we compile 33 a list of strings, at least one of which is known to occur in 34 any string matching the regexp. */ 35 static kwset_t kwset; 36 37 /* DFA compiled regexp. */ 38 static struct dfa *dfa; 39 40 /* The Regex compiled patterns. */ 41 static struct patterns 42 { 43 /* Regex compiled regexp. */ 44 struct re_pattern_buffer regexbuf; 45 struct re_registers regs; /* This is here on account of a BRAIN-DEAD 46 Q@#%!# library interface in regex.c. */ 47 } patterns0; 48 49 static struct patterns *patterns; 50 static size_t pcount; 51 52 /* Number of compiled fixed strings known to exactly match the regexp. 53 If kwsexec returns < kwset_exact_matches, then we don't need to 54 call the regexp matcher at all. */ 55 static size_t kwset_exact_matches; 56 57 static bool begline; 58 59 void 60 dfaerror (char const *mesg) 61 { 62 error (EXIT_TROUBLE, 0, "%s", mesg); 63 64 /* notreached */ 65 /* Tell static analyzers that this function does not return. */ 66 abort (); 67 } 68 69 /* For now, the sole dfawarn-eliciting condition (use of a regexp 70 like '[:lower:]') is unequivocally an error, so treat it as such, 71 when possible. */ 72 void 73 dfawarn (char const *mesg) 74 { 75 static enum { DW_NONE = 0, DW_POSIX, DW_GNU } mode; 76 if (mode == DW_NONE) 77 mode = (getenv ("POSIXLY_CORRECT") ? DW_POSIX : DW_GNU); 78 if (mode == DW_GNU) 79 dfaerror (mesg); 80 } 81 82 /* If the DFA turns out to have some set of fixed strings one of 83 which must occur in the match, then we build a kwset matcher 84 to find those strings, and thus quickly filter out impossible 85 matches. */ 86 static void 87 kwsmusts (void) 88 { 89 struct dfamust *dm = dfamust (dfa); 90 if (!dm) 91 return; 92 kwsinit (&kwset); 93 if (dm->exact) 94 { 95 /* Prepare a substring whose presence implies a match. 96 The kwset matcher will return the index of the matching 97 string that it chooses. */ 98 ++kwset_exact_matches; 99 size_t old_len = strlen (dm->must); 100 size_t new_len = old_len + dm->begline + dm->endline; 101 char *must = xmalloc (new_len); 102 char *mp = must; 103 *mp = eolbyte; 104 mp += dm->begline; 105 begline |= dm->begline; 106 memcpy (mp, dm->must, old_len); 107 if (dm->endline) 108 mp[old_len] = eolbyte; 109 kwsincr (kwset, must, new_len); 110 free (must); 111 } 112 else 113 { 114 /* Otherwise, filtering with this substring should help reduce the 115 search space, but we'll still have to use the regexp matcher. */ 116 kwsincr (kwset, dm->must, strlen (dm->must)); 117 } 118 kwsprep (kwset); 119 dfamustfree (dm); 120 } 121 122 void 123 GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits) 124 { 125 size_t total = size; 126 char *motif; 127 128 if (match_icase) 129 syntax_bits |= RE_ICASE; 130 re_set_syntax (syntax_bits); 131 dfasyntax (syntax_bits, match_icase, eolbyte); 132 133 /* For GNU regex, pass the patterns separately to detect errors like 134 "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and 135 this should be a syntax error. The same for backref, where the 136 backref should be local to each pattern. */ 137 char const *p = pattern; 138 do 139 { 140 size_t len; 141 char const *sep = memchr (p, '\n', total); 142 if (sep) 143 { 144 len = sep - p; 145 sep++; 146 total -= (len + 1); 147 } 148 else 149 { 150 len = total; 151 total = 0; 152 } 153 154 patterns = xnrealloc (patterns, pcount + 1, sizeof *patterns); 155 patterns[pcount] = patterns0; 156 157 char const *err = re_compile_pattern (p, len, 158 &(patterns[pcount].regexbuf)); 159 if (err) 160 error (EXIT_TROUBLE, 0, "%s", err); 161 pcount++; 162 p = sep; 163 } 164 while (p); 165 166 /* In the match_words and match_lines cases, we use a different pattern 167 for the DFA matcher that will quickly throw out cases that won't work. 168 Then if DFA succeeds we do some hairy stuff using the regex matcher 169 to decide whether the match should really count. */ 170 if (match_words || match_lines) 171 { 172 static char const line_beg_no_bk[] = "^("; 173 static char const line_end_no_bk[] = ")$"; 174 static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])("; 175 static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)"; 176 static char const line_beg_bk[] = "^\\("; 177 static char const line_end_bk[] = "\\)$"; 178 static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\("; 179 static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)"; 180 int bk = !(syntax_bits & RE_NO_BK_PARENS); 181 char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk); 182 183 strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk) 184 : (bk ? word_beg_bk : word_beg_no_bk)); 185 total = strlen(n); 186 memcpy (n + total, pattern, size); 187 total += size; 188 strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk) 189 : (bk ? word_end_bk : word_end_no_bk)); 190 total += strlen (n + total); 191 pattern = motif = n; 192 size = total; 193 } 194 else 195 motif = NULL; 196 197 dfa = dfaalloc (); 198 dfacomp (pattern, size, dfa, 1); 199 kwsmusts (); 200 201 free(motif); 202 } 203 204 size_t 205 EGexecute (char const *buf, size_t size, size_t *match_size, 206 char const *start_ptr) 207 { 208 char const *buflim, *beg, *end, *ptr, *match, *best_match, *mb_start; 209 char eol = eolbyte; 210 regoff_t start; 211 size_t len, best_len; 212 struct kwsmatch kwsm; 213 size_t i; 214 struct dfa *superset = dfasuperset (dfa); 215 bool dfafast = dfaisfast (dfa); 216 217 mb_start = buf; 218 buflim = buf + size; 219 220 for (beg = end = buf; end < buflim; beg = end) 221 { 222 end = buflim; 223 224 if (!start_ptr) 225 { 226 char const *next_beg, *dfa_beg = beg; 227 size_t count = 0; 228 bool exact_kwset_match = false; 229 int backref = 0; 230 231 /* Try matching with KWset, if it's defined. */ 232 if (kwset) 233 { 234 char const *prev_beg; 235 236 /* Find a possible match using the KWset matcher. */ 237 size_t offset = kwsexec (kwset, beg - begline, 238 buflim - beg + begline, &kwsm); 239 if (offset == (size_t) -1) 240 goto failure; 241 match = beg + offset; 242 prev_beg = beg; 243 244 /* Narrow down to the line containing the possible match. */ 245 beg = memrchr (buf, eol, match - buf); 246 beg = beg ? beg + 1 : buf; 247 dfa_beg = beg; 248 249 /* Determine the end pointer to give the DFA next. Typically 250 this is after the first newline after MATCH; but if the KWset 251 match is not exact, the DFA is fast, and the offset from 252 PREV_BEG is less than 64 or (MATCH - PREV_BEG), this is the 253 greater of the latter two values; this temporarily prefers 254 the DFA to KWset. */ 255 exact_kwset_match = kwsm.index < kwset_exact_matches; 256 end = ((exact_kwset_match || !dfafast 257 || MAX (16, match - beg) < (match - prev_beg) >> 2) 258 ? match 259 : MAX (16, match - beg) < (buflim - prev_beg) >> 2 260 ? prev_beg + 4 * MAX (16, match - beg) 261 : buflim); 262 end = memchr (end, eol, buflim - end); 263 end = end ? end + 1 : buflim; 264 265 if (exact_kwset_match) 266 { 267 if (MB_CUR_MAX == 1 || using_utf8 ()) 268 goto success; 269 if (mb_start < beg) 270 mb_start = beg; 271 if (mb_goback (&mb_start, match, buflim) == 0) 272 goto success; 273 /* The matched line starts in the middle of a multibyte 274 character. Perform the DFA search starting from the 275 beginning of the next character. */ 276 dfa_beg = mb_start; 277 } 278 } 279 280 /* Try matching with the superset of DFA, if it's defined. */ 281 if (superset && !exact_kwset_match) 282 { 283 /* Keep using the superset while it reports multiline 284 potential matches; this is more likely to be fast 285 than falling back to KWset would be. */ 286 while ((next_beg = dfaexec (superset, dfa_beg, (char *) end, 1, 287 &count, NULL)) 288 && next_beg != end 289 && count != 0) 290 { 291 /* Try to match in just one line. */ 292 count = 0; 293 beg = memrchr (buf, eol, next_beg - buf); 294 beg++; 295 dfa_beg = beg; 296 } 297 if (next_beg == NULL || next_beg == end) 298 continue; 299 300 /* Narrow down to the line we've found. */ 301 end = memchr (next_beg, eol, buflim - next_beg); 302 end = end ? end + 1 : buflim; 303 } 304 305 /* Try matching with DFA. */ 306 next_beg = dfaexec (dfa, dfa_beg, (char *) end, 0, &count, &backref); 307 308 /* If there's no match, or if we've matched the sentinel, 309 we're done. */ 310 if (next_beg == NULL || next_beg == end) 311 continue; 312 313 /* Narrow down to the line we've found. */ 314 if (count != 0) 315 { 316 beg = memrchr (buf, eol, next_beg - buf); 317 beg++; 318 } 319 end = memchr (next_beg, eol, buflim - next_beg); 320 end = end ? end + 1 : buflim; 321 322 /* Successful, no backreferences encountered! */ 323 if (!backref) 324 goto success; 325 ptr = beg; 326 } 327 else 328 { 329 /* We are looking for the leftmost (then longest) exact match. 330 We will go through the outer loop only once. */ 331 ptr = start_ptr; 332 } 333 334 /* If the "line" is longer than the maximum regexp offset, 335 die as if we've run out of memory. */ 336 if (TYPE_MAXIMUM (regoff_t) < end - beg - 1) 337 xalloc_die (); 338 339 /* Run the possible match through Regex. */ 340 best_match = end; 341 best_len = 0; 342 for (i = 0; i < pcount; i++) 343 { 344 patterns[i].regexbuf.not_eol = 0; 345 start = re_search (&(patterns[i].regexbuf), 346 beg, end - beg - 1, 347 ptr - beg, end - ptr - 1, 348 &(patterns[i].regs)); 349 if (start < -1) 350 xalloc_die (); 351 else if (0 <= start) 352 { 353 len = patterns[i].regs.end[0] - start; 354 match = beg + start; 355 if (match > best_match) 356 continue; 357 if (start_ptr && !match_words) 358 goto assess_pattern_match; 359 if ((!match_lines && !match_words) 360 || (match_lines && len == end - ptr - 1)) 361 { 362 match = ptr; 363 len = end - ptr; 364 goto assess_pattern_match; 365 } 366 /* If -w, check if the match aligns with word boundaries. 367 We do this iteratively because: 368 (a) the line may contain more than one occurrence of the 369 pattern, and 370 (b) Several alternatives in the pattern might be valid at a 371 given point, and we may need to consider a shorter one to 372 find a word boundary. */ 373 if (match_words) 374 while (match <= best_match) 375 { 376 regoff_t shorter_len = 0; 377 if (!wordchar (mb_prev_wc (beg, match, end - 1)) 378 && !wordchar (mb_next_wc (match + len, end - 1))) 379 goto assess_pattern_match; 380 if (len > 0) 381 { 382 /* Try a shorter length anchored at the same place. */ 383 --len; 384 patterns[i].regexbuf.not_eol = 1; 385 shorter_len = re_match (&(patterns[i].regexbuf), 386 beg, match + len - ptr, 387 match - beg, 388 &(patterns[i].regs)); 389 if (shorter_len < -1) 390 xalloc_die (); 391 } 392 if (0 < shorter_len) 393 len = shorter_len; 394 else 395 { 396 /* Try looking further on. */ 397 if (match == end - 1) 398 break; 399 match++; 400 patterns[i].regexbuf.not_eol = 0; 401 start = re_search (&(patterns[i].regexbuf), 402 beg, end - beg - 1, 403 match - beg, end - match - 1, 404 &(patterns[i].regs)); 405 if (start < 0) 406 { 407 if (start < -1) 408 xalloc_die (); 409 break; 410 } 411 len = patterns[i].regs.end[0] - start; 412 match = beg + start; 413 } 414 } /* while (match <= best_match) */ 415 continue; 416 assess_pattern_match: 417 if (!start_ptr) 418 { 419 /* Good enough for a non-exact match. 420 No need to look at further patterns, if any. */ 421 goto success; 422 } 423 if (match < best_match || (match == best_match && len > best_len)) 424 { 425 /* Best exact match: leftmost, then longest. */ 426 best_match = match; 427 best_len = len; 428 } 429 } /* if re_search >= 0 */ 430 } /* for Regex patterns. */ 431 if (best_match < end) 432 { 433 /* We have found an exact match. We were just 434 waiting for the best one (leftmost then longest). */ 435 beg = best_match; 436 len = best_len; 437 goto success_in_len; 438 } 439 } /* for (beg = end ..) */ 440 441 failure: 442 return -1; 443 444 success: 445 len = end - beg; 446 success_in_len:; 447 size_t off = beg - buf; 448 *match_size = len; 449 return off; 450 } 451