1 /* dfasearch.c - searching subroutines using dfa and regex for grep. 2 Copyright 1992, 1998, 2000, 2007, 2009-2014 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 3, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 17 02110-1301, USA. */ 18 19 /* Written August 1992 by Mike Haertel. */ 20 21 #include <config.h> 22 #include "intprops.h" 23 #include "search.h" 24 25 /* Whether -w considers WC to be a word constituent. */ 26 static bool 27 wordchar (wint_t wc) 28 { 29 return wc == L'_' || iswalnum (wc); 30 } 31 32 /* KWset compiled pattern. For Ecompile and Gcompile, we compile 33 a list of strings, at least one of which is known to occur in 34 any string matching the regexp. */ 35 static kwset_t kwset; 36 37 /* DFA compiled regexp. */ 38 static struct dfa *dfa; 39 40 /* The Regex compiled patterns. */ 41 static struct patterns 42 { 43 /* Regex compiled regexp. */ 44 struct re_pattern_buffer regexbuf; 45 struct re_registers regs; /* This is here on account of a BRAIN-DEAD 46 Q@#%!# library interface in regex.c. */ 47 } patterns0; 48 49 static struct patterns *patterns; 50 static size_t pcount; 51 52 /* Number of compiled fixed strings known to exactly match the regexp. 53 If kwsexec returns < kwset_exact_matches, then we don't need to 54 call the regexp matcher at all. */ 55 static size_t kwset_exact_matches; 56 57 static bool begline; 58 59 void 60 dfaerror (char const *mesg) 61 { 62 error (EXIT_TROUBLE, 0, "%s", mesg); 63 64 /* notreached */ 65 /* Tell static analyzers that this function does not return. */ 66 abort (); 67 } 68 69 /* For now, the sole dfawarn-eliciting condition (use of a regexp 70 like '[:lower:]') is unequivocally an error, so treat it as such, 71 when possible. */ 72 void 73 dfawarn (char const *mesg) 74 { 75 static enum { DW_NONE = 0, DW_POSIX, DW_GNU } mode; 76 if (mode == DW_NONE) 77 mode = (getenv ("POSIXLY_CORRECT") ? DW_POSIX : DW_GNU); 78 if (mode == DW_GNU) 79 dfaerror (mesg); 80 } 81 82 /* If the DFA turns out to have some set of fixed strings one of 83 which must occur in the match, then we build a kwset matcher 84 to find those strings, and thus quickly filter out impossible 85 matches. */ 86 static void 87 kwsmusts (void) 88 { 89 struct dfamust const *dm = dfamusts (dfa); 90 if (dm) 91 { 92 kwsinit (&kwset); 93 /* First, we compile in the substrings known to be exact 94 matches. The kwset matcher will return the index 95 of the matching string that it chooses. */ 96 for (; dm; dm = dm->next) 97 { 98 if (!dm->exact) 99 continue; 100 ++kwset_exact_matches; 101 size_t old_len = strlen (dm->must); 102 size_t new_len = old_len + dm->begline + dm->endline; 103 char *must = xmalloc (new_len); 104 char *mp = must; 105 *mp = eolbyte; 106 mp += dm->begline; 107 begline |= dm->begline; 108 memcpy (mp, dm->must, old_len); 109 if (dm->endline) 110 mp[old_len] = eolbyte; 111 kwsincr (kwset, must, new_len); 112 free (must); 113 } 114 /* Now, we compile the substrings that will require 115 the use of the regexp matcher. */ 116 for (dm = dfamusts (dfa); dm; dm = dm->next) 117 { 118 if (dm->exact) 119 continue; 120 kwsincr (kwset, dm->must, strlen (dm->must)); 121 } 122 kwsprep (kwset); 123 } 124 } 125 126 void 127 GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits) 128 { 129 size_t total = size; 130 char *motif; 131 132 if (match_icase) 133 syntax_bits |= RE_ICASE; 134 re_set_syntax (syntax_bits); 135 dfasyntax (syntax_bits, match_icase, eolbyte); 136 137 /* For GNU regex, pass the patterns separately to detect errors like 138 "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and 139 this should be a syntax error. The same for backref, where the 140 backref should be local to each pattern. */ 141 char const *p = pattern; 142 do 143 { 144 size_t len; 145 char const *sep = memchr (p, '\n', total); 146 if (sep) 147 { 148 len = sep - p; 149 sep++; 150 total -= (len + 1); 151 } 152 else 153 { 154 len = total; 155 total = 0; 156 } 157 158 patterns = xnrealloc (patterns, pcount + 1, sizeof *patterns); 159 patterns[pcount] = patterns0; 160 161 char const *err = re_compile_pattern (p, len, 162 &(patterns[pcount].regexbuf)); 163 if (err) 164 error (EXIT_TROUBLE, 0, "%s", err); 165 pcount++; 166 p = sep; 167 } 168 while (p); 169 170 /* In the match_words and match_lines cases, we use a different pattern 171 for the DFA matcher that will quickly throw out cases that won't work. 172 Then if DFA succeeds we do some hairy stuff using the regex matcher 173 to decide whether the match should really count. */ 174 if (match_words || match_lines) 175 { 176 static char const line_beg_no_bk[] = "^("; 177 static char const line_end_no_bk[] = ")$"; 178 static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])("; 179 static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)"; 180 static char const line_beg_bk[] = "^\\("; 181 static char const line_end_bk[] = "\\)$"; 182 static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\("; 183 static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)"; 184 int bk = !(syntax_bits & RE_NO_BK_PARENS); 185 char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk); 186 187 strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk) 188 : (bk ? word_beg_bk : word_beg_no_bk)); 189 total = strlen(n); 190 memcpy (n + total, pattern, size); 191 total += size; 192 strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk) 193 : (bk ? word_end_bk : word_end_no_bk)); 194 total += strlen (n + total); 195 pattern = motif = n; 196 size = total; 197 } 198 else 199 motif = NULL; 200 201 dfa = dfaalloc (); 202 dfacomp (pattern, size, dfa, 1); 203 kwsmusts (); 204 205 free(motif); 206 } 207 208 size_t 209 EGexecute (char const *buf, size_t size, size_t *match_size, 210 char const *start_ptr) 211 { 212 char const *buflim, *beg, *end, *ptr, *match, *best_match, *mb_start; 213 char eol = eolbyte; 214 int backref; 215 regoff_t start; 216 size_t len, best_len; 217 struct kwsmatch kwsm; 218 size_t i; 219 struct dfa *superset = dfasuperset (dfa); 220 bool dfafast = dfaisfast (dfa); 221 222 mb_start = buf; 223 buflim = buf + size; 224 225 for (beg = end = buf; end < buflim; beg = end) 226 { 227 end = buflim; 228 229 if (!start_ptr) 230 { 231 char const *next_beg, *dfa_beg = beg; 232 size_t count = 0; 233 bool exact_kwset_match = false; 234 235 /* Try matching with KWset, if it's defined. */ 236 if (kwset) 237 { 238 char const *prev_beg; 239 240 /* Find a possible match using the KWset matcher. */ 241 size_t offset = kwsexec (kwset, beg - begline, 242 buflim - beg + begline, &kwsm); 243 if (offset == (size_t) -1) 244 goto failure; 245 match = beg + offset; 246 prev_beg = beg; 247 248 /* Narrow down to the line containing the possible match. */ 249 beg = memrchr (buf, eol, match - buf); 250 beg = beg ? beg + 1 : buf; 251 dfa_beg = beg; 252 253 /* Determine the end pointer to give the DFA next. Typically 254 this is after the first newline after MATCH; but if the KWset 255 match is not exact, the DFA is fast, and the offset from 256 PREV_BEG is less than 64 or (MATCH - PREV_BEG), this is the 257 greater of the latter two values; this temporarily prefers 258 the DFA to KWset. */ 259 exact_kwset_match = kwsm.index < kwset_exact_matches; 260 end = ((exact_kwset_match || !dfafast 261 || MAX (16, match - beg) < (match - prev_beg) >> 2) 262 ? match 263 : MAX (16, match - beg) < (buflim - prev_beg) >> 2 264 ? prev_beg + 4 * MAX (16, match - beg) 265 : buflim); 266 end = memchr (end, eol, buflim - end); 267 end = end ? end + 1 : buflim; 268 269 if (exact_kwset_match) 270 { 271 if (MB_CUR_MAX == 1 || using_utf8 ()) 272 goto success; 273 if (mb_start < beg) 274 mb_start = beg; 275 if (mb_goback (&mb_start, match, buflim) == 0) 276 goto success; 277 /* The matched line starts in the middle of a multibyte 278 character. Perform the DFA search starting from the 279 beginning of the next character. */ 280 dfa_beg = mb_start; 281 } 282 } 283 284 /* Try matching with the superset of DFA, if it's defined. */ 285 if (superset && !exact_kwset_match) 286 { 287 /* Keep using the superset while it reports multiline 288 potential matches; this is more likely to be fast 289 than falling back to KWset would be. */ 290 while ((next_beg = dfaexec (superset, dfa_beg, (char *) end, 1, 291 &count, NULL)) 292 && next_beg != end 293 && count != 0) 294 { 295 /* Try to match in just one line. */ 296 count = 0; 297 beg = memrchr (buf, eol, next_beg - buf); 298 beg++; 299 dfa_beg = beg; 300 } 301 if (next_beg == NULL || next_beg == end) 302 continue; 303 304 /* Narrow down to the line we've found. */ 305 end = memchr (next_beg, eol, buflim - next_beg); 306 end = end ? end + 1 : buflim; 307 } 308 309 /* Try matching with DFA. */ 310 next_beg = dfaexec (dfa, dfa_beg, (char *) end, 0, &count, &backref); 311 312 /* If there's no match, or if we've matched the sentinel, 313 we're done. */ 314 if (next_beg == NULL || next_beg == end) 315 continue; 316 317 /* Narrow down to the line we've found. */ 318 if (count != 0) 319 { 320 beg = memrchr (buf, eol, next_beg - buf); 321 beg++; 322 } 323 end = memchr (next_beg, eol, buflim - next_beg); 324 end = end ? end + 1 : buflim; 325 326 /* Successful, no backreferences encountered! */ 327 if (!backref) 328 goto success; 329 ptr = beg; 330 } 331 else 332 { 333 /* We are looking for the leftmost (then longest) exact match. 334 We will go through the outer loop only once. */ 335 ptr = start_ptr; 336 } 337 338 /* If the "line" is longer than the maximum regexp offset, 339 die as if we've run out of memory. */ 340 if (TYPE_MAXIMUM (regoff_t) < end - beg - 1) 341 xalloc_die (); 342 343 /* Run the possible match through Regex. */ 344 best_match = end; 345 best_len = 0; 346 for (i = 0; i < pcount; i++) 347 { 348 patterns[i].regexbuf.not_eol = 0; 349 start = re_search (&(patterns[i].regexbuf), 350 beg, end - beg - 1, 351 ptr - beg, end - ptr - 1, 352 &(patterns[i].regs)); 353 if (start < -1) 354 xalloc_die (); 355 else if (0 <= start) 356 { 357 len = patterns[i].regs.end[0] - start; 358 match = beg + start; 359 if (match > best_match) 360 continue; 361 if (start_ptr && !match_words) 362 goto assess_pattern_match; 363 if ((!match_lines && !match_words) 364 || (match_lines && len == end - ptr - 1)) 365 { 366 match = ptr; 367 len = end - ptr; 368 goto assess_pattern_match; 369 } 370 /* If -w, check if the match aligns with word boundaries. 371 We do this iteratively because: 372 (a) the line may contain more than one occurrence of the 373 pattern, and 374 (b) Several alternatives in the pattern might be valid at a 375 given point, and we may need to consider a shorter one to 376 find a word boundary. */ 377 if (match_words) 378 while (match <= best_match) 379 { 380 regoff_t shorter_len = 0; 381 if (!wordchar (mb_prev_wc (beg, match, end - 1)) 382 && !wordchar (mb_next_wc (match + len, end - 1))) 383 goto assess_pattern_match; 384 if (len > 0) 385 { 386 /* Try a shorter length anchored at the same place. */ 387 --len; 388 patterns[i].regexbuf.not_eol = 1; 389 shorter_len = re_match (&(patterns[i].regexbuf), 390 beg, match + len - ptr, 391 match - beg, 392 &(patterns[i].regs)); 393 if (shorter_len < -1) 394 xalloc_die (); 395 } 396 if (0 < shorter_len) 397 len = shorter_len; 398 else 399 { 400 /* Try looking further on. */ 401 if (match == end - 1) 402 break; 403 match++; 404 patterns[i].regexbuf.not_eol = 0; 405 start = re_search (&(patterns[i].regexbuf), 406 beg, end - beg - 1, 407 match - beg, end - match - 1, 408 &(patterns[i].regs)); 409 if (start < 0) 410 { 411 if (start < -1) 412 xalloc_die (); 413 break; 414 } 415 len = patterns[i].regs.end[0] - start; 416 match = beg + start; 417 } 418 } /* while (match <= best_match) */ 419 continue; 420 assess_pattern_match: 421 if (!start_ptr) 422 { 423 /* Good enough for a non-exact match. 424 No need to look at further patterns, if any. */ 425 goto success; 426 } 427 if (match < best_match || (match == best_match && len > best_len)) 428 { 429 /* Best exact match: leftmost, then longest. */ 430 best_match = match; 431 best_len = len; 432 } 433 } /* if re_search >= 0 */ 434 } /* for Regex patterns. */ 435 if (best_match < end) 436 { 437 /* We have found an exact match. We were just 438 waiting for the best one (leftmost then longest). */ 439 beg = best_match; 440 len = best_len; 441 goto success_in_len; 442 } 443 } /* for (beg = end ..) */ 444 445 failure: 446 return -1; 447 448 success: 449 len = end - beg; 450 success_in_len:; 451 size_t off = beg - buf; 452 *match_size = len; 453 return off; 454 } 455