1 /* dfasearch.c - searching subroutines using dfa and regex for grep. 2 Copyright 1992, 1998, 2000, 2007, 2009-2011 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 3, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 17 02110-1301, USA. */ 18 19 /* Written August 1992 by Mike Haertel. */ 20 21 #include <config.h> 22 #include "search.h" 23 #include "dfa.h" 24 25 /* For -w, we also consider _ to be word constituent. */ 26 #define WCHAR(C) (isalnum (C) || (C) == '_') 27 28 /* KWset compiled pattern. For Ecompile and Gcompile, we compile 29 a list of strings, at least one of which is known to occur in 30 any string matching the regexp. */ 31 static kwset_t kwset; 32 33 /* DFA compiled regexp. */ 34 static struct dfa *dfa; 35 36 /* The Regex compiled patterns. */ 37 static struct patterns 38 { 39 /* Regex compiled regexp. */ 40 struct re_pattern_buffer regexbuf; 41 struct re_registers regs; /* This is here on account of a BRAIN-DEAD 42 Q@#%!# library interface in regex.c. */ 43 } patterns0; 44 45 static struct patterns *patterns; 46 static size_t pcount; 47 48 void 49 dfaerror (char const *mesg) 50 { 51 error (EXIT_TROUBLE, 0, "%s", mesg); 52 53 /* notreached */ 54 /* Tell static analyzers that this function does not return. */ 55 abort (); 56 } 57 58 /* For now, the sole dfawarn-eliciting condition (use of a regexp 59 like '[:lower:]') is unequivocally an error, so treat it as such, 60 when possible. */ 61 void 62 dfawarn (char const *mesg) 63 { 64 static enum { DW_NONE = 0, DW_POSIX, DW_GNU } mode; 65 if (mode == DW_NONE) 66 mode = (getenv ("POSIXLY_CORRECT") ? DW_POSIX : DW_GNU); 67 if (mode == DW_GNU) 68 dfaerror (mesg); 69 } 70 71 /* Number of compiled fixed strings known to exactly match the regexp. 72 If kwsexec returns < kwset_exact_matches, then we don't need to 73 call the regexp matcher at all. */ 74 static int kwset_exact_matches; 75 76 static char const * 77 kwsincr_case (const char *must) 78 { 79 const char *buf; 80 size_t n; 81 82 n = strlen (must); 83 #if MBS_SUPPORT 84 if (match_icase && MB_CUR_MAX > 1) 85 buf = mbtolower (must, &n); 86 else 87 #endif 88 buf = must; 89 return kwsincr (kwset, buf, n); 90 } 91 92 /* If the DFA turns out to have some set of fixed strings one of 93 which must occur in the match, then we build a kwset matcher 94 to find those strings, and thus quickly filter out impossible 95 matches. */ 96 static void 97 kwsmusts (void) 98 { 99 struct dfamust const *dm; 100 char const *err; 101 102 dm = dfamusts (dfa); 103 if (dm) 104 { 105 kwsinit (&kwset); 106 /* First, we compile in the substrings known to be exact 107 matches. The kwset matcher will return the index 108 of the matching string that it chooses. */ 109 for (; dm; dm = dm->next) 110 { 111 if (!dm->exact) 112 continue; 113 ++kwset_exact_matches; 114 if ((err = kwsincr_case (dm->must)) != NULL) 115 error (EXIT_TROUBLE, 0, "%s", err); 116 } 117 /* Now, we compile the substrings that will require 118 the use of the regexp matcher. */ 119 for (dm = dfamusts (dfa); dm; dm = dm->next) 120 { 121 if (dm->exact) 122 continue; 123 if ((err = kwsincr_case (dm->must)) != NULL) 124 error (EXIT_TROUBLE, 0, "%s", err); 125 } 126 if ((err = kwsprep (kwset)) != NULL) 127 error (EXIT_TROUBLE, 0, "%s", err); 128 } 129 } 130 131 void 132 GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits) 133 { 134 const char *err; 135 const char *p, *sep; 136 size_t total = size; 137 char *motif; 138 139 if (match_icase) 140 syntax_bits |= RE_ICASE; 141 re_set_syntax (syntax_bits); 142 dfasyntax (syntax_bits, match_icase, eolbyte); 143 144 /* For GNU regex compiler we have to pass the patterns separately to detect 145 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" 146 GNU regex should have raise a syntax error. The same for backref, where 147 the backref should have been local to each pattern. */ 148 p = pattern; 149 do 150 { 151 size_t len; 152 sep = memchr (p, '\n', total); 153 if (sep) 154 { 155 len = sep - p; 156 sep++; 157 total -= (len + 1); 158 } 159 else 160 { 161 len = total; 162 total = 0; 163 } 164 165 patterns = xnrealloc (patterns, pcount + 1, sizeof *patterns); 166 patterns[pcount] = patterns0; 167 168 if ((err = re_compile_pattern (p, len, 169 &(patterns[pcount].regexbuf))) != NULL) 170 error (EXIT_TROUBLE, 0, "%s", err); 171 pcount++; 172 173 p = sep; 174 } while (sep && total != 0); 175 176 /* In the match_words and match_lines cases, we use a different pattern 177 for the DFA matcher that will quickly throw out cases that won't work. 178 Then if DFA succeeds we do some hairy stuff using the regex matcher 179 to decide whether the match should really count. */ 180 if (match_words || match_lines) 181 { 182 static char const line_beg_no_bk[] = "^("; 183 static char const line_end_no_bk[] = ")$"; 184 static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])("; 185 static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)"; 186 static char const line_beg_bk[] = "^\\("; 187 static char const line_end_bk[] = "\\)$"; 188 static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\("; 189 static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)"; 190 int bk = !(syntax_bits & RE_NO_BK_PARENS); 191 char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk); 192 193 strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk) 194 : (bk ? word_beg_bk : word_beg_no_bk)); 195 total = strlen(n); 196 memcpy (n + total, pattern, size); 197 total += size; 198 strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk) 199 : (bk ? word_end_bk : word_end_no_bk)); 200 total += strlen (n + total); 201 pattern = motif = n; 202 size = total; 203 } 204 else 205 motif = NULL; 206 207 dfa = dfaalloc (); 208 dfacomp (pattern, size, dfa, 1); 209 kwsmusts (); 210 211 free(motif); 212 } 213 214 size_t 215 EGexecute (char const *buf, size_t size, size_t *match_size, 216 char const *start_ptr) 217 { 218 char const *buflim, *beg, *end, *match, *best_match, *mb_start; 219 char eol = eolbyte; 220 int backref, start, len, best_len; 221 struct kwsmatch kwsm; 222 size_t i, ret_val; 223 #if MBS_SUPPORT 224 if (MB_CUR_MAX > 1) 225 { 226 if (match_icase) 227 { 228 /* mbtolower adds a NUL byte at the end. That will provide 229 space for the sentinel byte dfaexec may add. */ 230 char *case_buf = mbtolower (buf, &size); 231 if (start_ptr) 232 start_ptr = case_buf + (start_ptr - buf); 233 buf = case_buf; 234 } 235 } 236 #endif /* MBS_SUPPORT */ 237 238 mb_start = buf; 239 buflim = buf + size; 240 241 for (beg = end = buf; end < buflim; beg = end) 242 { 243 if (!start_ptr) 244 { 245 /* We don't care about an exact match. */ 246 if (kwset) 247 { 248 /* Find a possible match using the KWset matcher. */ 249 size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); 250 if (offset == (size_t) -1) 251 goto failure; 252 beg += offset; 253 /* Narrow down to the line containing the candidate, and 254 run it through DFA. */ 255 if ((end = memchr(beg, eol, buflim - beg)) != NULL) 256 end++; 257 else 258 end = buflim; 259 match = beg; 260 while (beg > buf && beg[-1] != eol) 261 --beg; 262 if (kwsm.index < kwset_exact_matches) 263 { 264 #if MBS_SUPPORT 265 if (mb_start < beg) 266 mb_start = beg; 267 if (MB_CUR_MAX == 1 268 || !is_mb_middle (&mb_start, match, buflim, 269 kwsm.size[0])) 270 #endif 271 goto success; 272 } 273 if (dfaexec (dfa, beg, (char *) end, 0, NULL, &backref) == NULL) 274 continue; 275 } 276 else 277 { 278 /* No good fixed strings; start with DFA. */ 279 char const *next_beg = dfaexec (dfa, beg, (char *) buflim, 280 0, NULL, &backref); 281 if (next_beg == NULL) 282 break; 283 /* Narrow down to the line we've found. */ 284 beg = next_beg; 285 if ((end = memchr(beg, eol, buflim - beg)) != NULL) 286 end++; 287 else 288 end = buflim; 289 while (beg > buf && beg[-1] != eol) 290 --beg; 291 } 292 /* Successful, no backreferences encountered! */ 293 if (!backref) 294 goto success; 295 } 296 else 297 { 298 /* We are looking for the leftmost (then longest) exact match. 299 We will go through the outer loop only once. */ 300 beg = start_ptr; 301 end = buflim; 302 } 303 304 /* If we've made it to this point, this means DFA has seen 305 a probable match, and we need to run it through Regex. */ 306 best_match = end; 307 best_len = 0; 308 for (i = 0; i < pcount; i++) 309 { 310 patterns[i].regexbuf.not_eol = 0; 311 if (0 <= (start = re_search (&(patterns[i].regexbuf), 312 buf, end - buf - 1, 313 beg - buf, end - beg - 1, 314 &(patterns[i].regs)))) 315 { 316 len = patterns[i].regs.end[0] - start; 317 match = buf + start; 318 if (match > best_match) 319 continue; 320 if (start_ptr && !match_words) 321 goto assess_pattern_match; 322 if ((!match_lines && !match_words) 323 || (match_lines && len == end - beg - 1)) 324 { 325 match = beg; 326 len = end - beg; 327 goto assess_pattern_match; 328 } 329 /* If -w, check if the match aligns with word boundaries. 330 We do this iteratively because: 331 (a) the line may contain more than one occurence of the 332 pattern, and 333 (b) Several alternatives in the pattern might be valid at a 334 given point, and we may need to consider a shorter one to 335 find a word boundary. */ 336 if (match_words) 337 while (match <= best_match) 338 { 339 if ((match == buf || !WCHAR ((unsigned char) match[-1])) 340 && (start + len == end - buf - 1 341 || !WCHAR ((unsigned char) match[len]))) 342 goto assess_pattern_match; 343 if (len > 0) 344 { 345 /* Try a shorter length anchored at the same place. */ 346 --len; 347 patterns[i].regexbuf.not_eol = 1; 348 len = re_match (&(patterns[i].regexbuf), 349 buf, match + len - beg, match - buf, 350 &(patterns[i].regs)); 351 } 352 if (len <= 0) 353 { 354 /* Try looking further on. */ 355 if (match == end - 1) 356 break; 357 match++; 358 patterns[i].regexbuf.not_eol = 0; 359 start = re_search (&(patterns[i].regexbuf), 360 buf, end - buf - 1, 361 match - buf, end - match - 1, 362 &(patterns[i].regs)); 363 if (start < 0) 364 break; 365 len = patterns[i].regs.end[0] - start; 366 match = buf + start; 367 } 368 } /* while (match <= best_match) */ 369 continue; 370 assess_pattern_match: 371 if (!start_ptr) 372 { 373 /* Good enough for a non-exact match. 374 No need to look at further patterns, if any. */ 375 goto success; 376 } 377 if (match < best_match || (match == best_match && len > best_len)) 378 { 379 /* Best exact match: leftmost, then longest. */ 380 best_match = match; 381 best_len = len; 382 } 383 } /* if re_search >= 0 */ 384 } /* for Regex patterns. */ 385 if (best_match < end) 386 { 387 /* We have found an exact match. We were just 388 waiting for the best one (leftmost then longest). */ 389 beg = best_match; 390 len = best_len; 391 goto success_in_len; 392 } 393 } /* for (beg = end ..) */ 394 395 failure: 396 ret_val = -1; 397 goto out; 398 399 success: 400 len = end - beg; 401 success_in_len: 402 *match_size = len; 403 ret_val = beg - buf; 404 out: 405 return ret_val; 406 } 407