1 /* dfasearch.c - searching subroutines using dfa and regex for grep. 2 Copyright 1992, 1998, 2000, 2007, 2009-2012 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 3, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 17 02110-1301, USA. */ 18 19 /* Written August 1992 by Mike Haertel. */ 20 21 #include <config.h> 22 #include "intprops.h" 23 #include "search.h" 24 #include "dfa.h" 25 26 /* For -w, we also consider _ to be word constituent. */ 27 #define WCHAR(C) (isalnum (C) || (C) == '_') 28 29 /* KWset compiled pattern. For Ecompile and Gcompile, we compile 30 a list of strings, at least one of which is known to occur in 31 any string matching the regexp. */ 32 static kwset_t kwset; 33 34 /* DFA compiled regexp. */ 35 static struct dfa *dfa; 36 37 /* The Regex compiled patterns. */ 38 static struct patterns 39 { 40 /* Regex compiled regexp. */ 41 struct re_pattern_buffer regexbuf; 42 struct re_registers regs; /* This is here on account of a BRAIN-DEAD 43 Q@#%!# library interface in regex.c. */ 44 } patterns0; 45 46 static struct patterns *patterns; 47 static size_t pcount; 48 49 void 50 dfaerror (char const *mesg) 51 { 52 error (EXIT_TROUBLE, 0, "%s", mesg); 53 54 /* notreached */ 55 /* Tell static analyzers that this function does not return. */ 56 abort (); 57 } 58 59 /* For now, the sole dfawarn-eliciting condition (use of a regexp 60 like '[:lower:]') is unequivocally an error, so treat it as such, 61 when possible. */ 62 void 63 dfawarn (char const *mesg) 64 { 65 static enum { DW_NONE = 0, DW_POSIX, DW_GNU } mode; 66 if (mode == DW_NONE) 67 mode = (getenv ("POSIXLY_CORRECT") ? DW_POSIX : DW_GNU); 68 if (mode == DW_GNU) 69 dfaerror (mesg); 70 } 71 72 /* Number of compiled fixed strings known to exactly match the regexp. 73 If kwsexec returns < kwset_exact_matches, then we don't need to 74 call the regexp matcher at all. */ 75 static size_t kwset_exact_matches; 76 77 static char const * 78 kwsincr_case (const char *must) 79 { 80 size_t n = strlen (must); 81 mb_len_map_t *map = NULL; 82 const char *buf = (match_icase && MB_CUR_MAX > 1 83 ? mbtolower (must, &n, &map) 84 : must); 85 return kwsincr (kwset, buf, n); 86 } 87 88 /* If the DFA turns out to have some set of fixed strings one of 89 which must occur in the match, then we build a kwset matcher 90 to find those strings, and thus quickly filter out impossible 91 matches. */ 92 static void 93 kwsmusts (void) 94 { 95 struct dfamust const *dm; 96 char const *err; 97 98 dm = dfamusts (dfa); 99 if (dm) 100 { 101 kwsinit (&kwset); 102 /* First, we compile in the substrings known to be exact 103 matches. The kwset matcher will return the index 104 of the matching string that it chooses. */ 105 for (; dm; dm = dm->next) 106 { 107 if (!dm->exact) 108 continue; 109 ++kwset_exact_matches; 110 if ((err = kwsincr_case (dm->must)) != NULL) 111 error (EXIT_TROUBLE, 0, "%s", err); 112 } 113 /* Now, we compile the substrings that will require 114 the use of the regexp matcher. */ 115 for (dm = dfamusts (dfa); dm; dm = dm->next) 116 { 117 if (dm->exact) 118 continue; 119 if ((err = kwsincr_case (dm->must)) != NULL) 120 error (EXIT_TROUBLE, 0, "%s", err); 121 } 122 if ((err = kwsprep (kwset)) != NULL) 123 error (EXIT_TROUBLE, 0, "%s", err); 124 } 125 } 126 127 void 128 GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits) 129 { 130 const char *err; 131 const char *p, *sep; 132 size_t total = size; 133 char *motif; 134 135 if (match_icase) 136 syntax_bits |= RE_ICASE; 137 re_set_syntax (syntax_bits); 138 dfasyntax (syntax_bits, match_icase, eolbyte); 139 140 /* For GNU regex compiler we have to pass the patterns separately to detect 141 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" 142 GNU regex should have raise a syntax error. The same for backref, where 143 the backref should have been local to each pattern. */ 144 p = pattern; 145 do 146 { 147 size_t len; 148 sep = memchr (p, '\n', total); 149 if (sep) 150 { 151 len = sep - p; 152 sep++; 153 total -= (len + 1); 154 } 155 else 156 { 157 len = total; 158 total = 0; 159 } 160 161 patterns = xnrealloc (patterns, pcount + 1, sizeof *patterns); 162 patterns[pcount] = patterns0; 163 164 if ((err = re_compile_pattern (p, len, 165 &(patterns[pcount].regexbuf))) != NULL) 166 error (EXIT_TROUBLE, 0, "%s", err); 167 pcount++; 168 169 p = sep; 170 } while (sep && total != 0); 171 172 /* In the match_words and match_lines cases, we use a different pattern 173 for the DFA matcher that will quickly throw out cases that won't work. 174 Then if DFA succeeds we do some hairy stuff using the regex matcher 175 to decide whether the match should really count. */ 176 if (match_words || match_lines) 177 { 178 static char const line_beg_no_bk[] = "^("; 179 static char const line_end_no_bk[] = ")$"; 180 static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])("; 181 static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)"; 182 static char const line_beg_bk[] = "^\\("; 183 static char const line_end_bk[] = "\\)$"; 184 static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\("; 185 static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)"; 186 int bk = !(syntax_bits & RE_NO_BK_PARENS); 187 char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk); 188 189 strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk) 190 : (bk ? word_beg_bk : word_beg_no_bk)); 191 total = strlen(n); 192 memcpy (n + total, pattern, size); 193 total += size; 194 strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk) 195 : (bk ? word_end_bk : word_end_no_bk)); 196 total += strlen (n + total); 197 pattern = motif = n; 198 size = total; 199 } 200 else 201 motif = NULL; 202 203 dfa = dfaalloc (); 204 dfacomp (pattern, size, dfa, 1); 205 kwsmusts (); 206 207 free(motif); 208 } 209 210 size_t 211 EGexecute (char const *buf, size_t size, size_t *match_size, 212 char const *start_ptr) 213 { 214 char const *buflim, *beg, *end, *match, *best_match, *mb_start; 215 char eol = eolbyte; 216 int backref; 217 regoff_t start; 218 size_t len, best_len; 219 struct kwsmatch kwsm; 220 size_t i, ret_val; 221 mb_len_map_t *map = NULL; 222 223 if (MB_CUR_MAX > 1) 224 { 225 if (match_icase) 226 { 227 /* mbtolower adds a NUL byte at the end. That will provide 228 space for the sentinel byte dfaexec may add. */ 229 char *case_buf = mbtolower (buf, &size, &map); 230 if (start_ptr) 231 start_ptr = case_buf + (start_ptr - buf); 232 buf = case_buf; 233 } 234 } 235 236 mb_start = buf; 237 buflim = buf + size; 238 239 for (beg = end = buf; end < buflim; beg = end) 240 { 241 if (!start_ptr) 242 { 243 /* We don't care about an exact match. */ 244 if (kwset) 245 { 246 /* Find a possible match using the KWset matcher. */ 247 size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); 248 if (offset == (size_t) -1) 249 goto failure; 250 beg += offset; 251 /* Narrow down to the line containing the candidate, and 252 run it through DFA. */ 253 if ((end = memchr(beg, eol, buflim - beg)) != NULL) 254 end++; 255 else 256 end = buflim; 257 match = beg; 258 while (beg > buf && beg[-1] != eol) 259 --beg; 260 if (kwsm.index < kwset_exact_matches) 261 { 262 if (!MBS_SUPPORT) 263 goto success; 264 265 if (mb_start < beg) 266 mb_start = beg; 267 if (MB_CUR_MAX == 1 268 || !is_mb_middle (&mb_start, match, buflim, 269 kwsm.size[0])) 270 goto success; 271 } 272 if (dfaexec (dfa, beg, (char *) end, 0, NULL, &backref) == NULL) 273 continue; 274 } 275 else 276 { 277 /* No good fixed strings; start with DFA. */ 278 char const *next_beg = dfaexec (dfa, beg, (char *) buflim, 279 0, NULL, &backref); 280 /* If there's no match, or if we've matched the sentinel, 281 we're done. */ 282 if (next_beg == NULL || next_beg == buflim) 283 break; 284 /* Narrow down to the line we've found. */ 285 beg = next_beg; 286 if ((end = memchr(beg, eol, buflim - beg)) != NULL) 287 end++; 288 else 289 end = buflim; 290 while (beg > buf && beg[-1] != eol) 291 --beg; 292 } 293 /* Successful, no backreferences encountered! */ 294 if (!backref) 295 goto success; 296 } 297 else 298 { 299 /* We are looking for the leftmost (then longest) exact match. 300 We will go through the outer loop only once. */ 301 beg = start_ptr; 302 end = buflim; 303 } 304 305 /* If the "line" is longer than the maximum regexp offset, 306 die as if we've run out of memory. */ 307 if (TYPE_MAXIMUM (regoff_t) < end - buf - 1) 308 xalloc_die (); 309 310 /* If we've made it to this point, this means DFA has seen 311 a probable match, and we need to run it through Regex. */ 312 best_match = end; 313 best_len = 0; 314 for (i = 0; i < pcount; i++) 315 { 316 patterns[i].regexbuf.not_eol = 0; 317 start = re_search (&(patterns[i].regexbuf), 318 buf, end - buf - 1, 319 beg - buf, end - beg - 1, 320 &(patterns[i].regs)); 321 if (start < -1) 322 xalloc_die (); 323 else if (0 <= start) 324 { 325 len = patterns[i].regs.end[0] - start; 326 match = buf + start; 327 if (match > best_match) 328 continue; 329 if (start_ptr && !match_words) 330 goto assess_pattern_match; 331 if ((!match_lines && !match_words) 332 || (match_lines && len == end - beg - 1)) 333 { 334 match = beg; 335 len = end - beg; 336 goto assess_pattern_match; 337 } 338 /* If -w, check if the match aligns with word boundaries. 339 We do this iteratively because: 340 (a) the line may contain more than one occurrence of the 341 pattern, and 342 (b) Several alternatives in the pattern might be valid at a 343 given point, and we may need to consider a shorter one to 344 find a word boundary. */ 345 if (match_words) 346 while (match <= best_match) 347 { 348 regoff_t shorter_len = 0; 349 if ((match == buf || !WCHAR ((unsigned char) match[-1])) 350 && (start + len == end - buf - 1 351 || !WCHAR ((unsigned char) match[len]))) 352 goto assess_pattern_match; 353 if (len > 0) 354 { 355 /* Try a shorter length anchored at the same place. */ 356 --len; 357 patterns[i].regexbuf.not_eol = 1; 358 shorter_len = re_match (&(patterns[i].regexbuf), 359 buf, match + len - beg, 360 match - buf, 361 &(patterns[i].regs)); 362 if (shorter_len < -1) 363 xalloc_die (); 364 } 365 if (0 < shorter_len) 366 len = shorter_len; 367 else 368 { 369 /* Try looking further on. */ 370 if (match == end - 1) 371 break; 372 match++; 373 patterns[i].regexbuf.not_eol = 0; 374 start = re_search (&(patterns[i].regexbuf), 375 buf, end - buf - 1, 376 match - buf, end - match - 1, 377 &(patterns[i].regs)); 378 if (start < 0) 379 { 380 if (start < -1) 381 xalloc_die (); 382 break; 383 } 384 len = patterns[i].regs.end[0] - start; 385 match = buf + start; 386 } 387 } /* while (match <= best_match) */ 388 continue; 389 assess_pattern_match: 390 if (!start_ptr) 391 { 392 /* Good enough for a non-exact match. 393 No need to look at further patterns, if any. */ 394 goto success; 395 } 396 if (match < best_match || (match == best_match && len > best_len)) 397 { 398 /* Best exact match: leftmost, then longest. */ 399 best_match = match; 400 best_len = len; 401 } 402 } /* if re_search >= 0 */ 403 } /* for Regex patterns. */ 404 if (best_match < end) 405 { 406 /* We have found an exact match. We were just 407 waiting for the best one (leftmost then longest). */ 408 beg = best_match; 409 len = best_len; 410 goto success_in_len; 411 } 412 } /* for (beg = end ..) */ 413 414 failure: 415 ret_val = -1; 416 goto out; 417 418 success: 419 len = end - beg; 420 success_in_len:; 421 size_t off = beg - buf; 422 mb_case_map_apply (map, &off, &len); 423 *match_size = len; 424 ret_val = off; 425 out: 426 return ret_val; 427 } 428