1 /* $OpenBSD: util.c,v 1.48 2014/05/20 01:25:23 guenther Exp $ */ 2 3 /*- 4 * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/types.h> 30 #include <sys/stat.h> 31 32 #include <ctype.h> 33 #include <err.h> 34 #include <errno.h> 35 #include <fts.h> 36 #include <regex.h> 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include <string.h> 40 #include <unistd.h> 41 #include <zlib.h> 42 43 #include "grep.h" 44 45 /* 46 * Process a file line by line... 47 */ 48 49 static int linesqueued; 50 static int procline(str_t *l, int); 51 static int grep_search(fastgrep_t *, unsigned char *, size_t, 52 regmatch_t *pmatch); 53 #ifndef SMALL 54 static int grep_cmp(const unsigned char *, const unsigned char *, size_t); 55 static void grep_revstr(unsigned char *, int); 56 #endif 57 58 int 59 grep_tree(char **argv) 60 { 61 FTS *fts; 62 FTSENT *p; 63 int c, fts_flags; 64 65 c = 0; 66 67 fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR; 68 69 if (!(fts = fts_open(argv, fts_flags, NULL))) 70 err(2, NULL); 71 while ((p = fts_read(fts)) != NULL) { 72 switch (p->fts_info) { 73 case FTS_DNR: 74 break; 75 case FTS_ERR: 76 file_err = 1; 77 if(!sflag) { 78 errno = p->fts_errno; 79 warn("%s", p->fts_path); 80 } 81 break; 82 case FTS_DP: 83 break; 84 default: 85 c += procfile(p->fts_path); 86 break; 87 } 88 } 89 if (errno) 90 err(2, "fts_read"); 91 92 return c; 93 } 94 95 int 96 procfile(const char *fn) 97 { 98 str_t ln; 99 file_t *f; 100 int c, t, z, nottext; 101 102 if (fn == NULL) { 103 fn = "(standard input)"; 104 f = grep_fdopen(STDIN_FILENO, "r"); 105 } else { 106 f = grep_open(fn, "r"); 107 } 108 if (f == NULL) { 109 file_err = 1; 110 if (!sflag) 111 warn("%s", fn); 112 return 0; 113 } 114 115 nottext = grep_bin_file(f); 116 if (nottext && binbehave == BIN_FILE_SKIP) { 117 grep_close(f); 118 return 0; 119 } 120 121 ln.file = fn; 122 ln.line_no = 0; 123 ln.len = 0; 124 linesqueued = 0; 125 tail = 0; 126 ln.off = -1; 127 128 if (Bflag > 0) 129 initqueue(); 130 for (c = 0; c == 0 || !(lflag || qflag); ) { 131 ln.off += ln.len + 1; 132 if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL) 133 break; 134 if (ln.len > 0 && ln.dat[ln.len - 1] == '\n') 135 --ln.len; 136 ln.line_no++; 137 138 z = tail; 139 140 if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) { 141 enqueue(&ln); 142 linesqueued++; 143 } 144 c += t; 145 } 146 if (Bflag > 0) 147 clearqueue(); 148 grep_close(f); 149 150 if (cflag) { 151 if (!hflag) 152 printf("%s:", ln.file); 153 printf("%u\n", c); 154 } 155 if (lflag && c != 0) 156 printf("%s\n", fn); 157 if (Lflag && c == 0) 158 printf("%s\n", fn); 159 if (c && !cflag && !lflag && !Lflag && 160 binbehave == BIN_FILE_BIN && nottext && !qflag) 161 printf("Binary file %s matches\n", fn); 162 163 return c; 164 } 165 166 167 /* 168 * Process an individual line in a file. Return non-zero if it matches. 169 */ 170 171 #define isword(x) (isalnum((unsigned char)x) || (x) == '_') 172 173 static int 174 procline(str_t *l, int nottext) 175 { 176 regmatch_t pmatch; 177 int c, i, r; 178 regoff_t offset; 179 180 /* size_t will be converted to regoff_t. ssize_t is guaranteed to fit 181 * into regoff_t */ 182 if (l->len > SSIZE_MAX) { 183 errx(2, "Line is too big to process"); 184 } 185 186 c = 0; 187 i = 0; 188 if (matchall) { 189 c = 1; 190 goto print; 191 } 192 193 for (i = 0; i < patterns; i++) { 194 offset = 0; 195 redo: 196 if (fg_pattern[i].pattern) { 197 r = grep_search(&fg_pattern[i], 198 (unsigned char *)l->dat + offset, l->len - offset, 199 &pmatch); 200 pmatch.rm_so += offset; 201 pmatch.rm_eo += offset; 202 } else { 203 pmatch.rm_so = offset; 204 pmatch.rm_eo = l->len; 205 r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags); 206 } 207 if (r == 0 && xflag) { 208 if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len) 209 r = REG_NOMATCH; 210 } 211 if (r == 0) { 212 c = 1; 213 if (oflag && pmatch.rm_so != pmatch.rm_eo) 214 goto print; 215 break; 216 } 217 } 218 if (oflag) 219 return c; 220 print: 221 if (vflag) 222 c = !c; 223 224 if (c && binbehave == BIN_FILE_BIN && nottext) 225 return c; /* Binary file */ 226 227 if ((tail > 0 || c) && !cflag && !qflag) { 228 if (c) { 229 if (first > 0 && tail == 0 && (Bflag < linesqueued) && 230 (Aflag || Bflag)) 231 printf("--\n"); 232 first = 1; 233 tail = Aflag; 234 if (Bflag > 0) 235 printqueue(); 236 linesqueued = 0; 237 printline(l, ':', oflag ? &pmatch : NULL); 238 } else { 239 printline(l, '-', oflag ? &pmatch : NULL); 240 tail--; 241 } 242 } 243 if (oflag && !matchall) { 244 offset = pmatch.rm_eo; 245 goto redo; 246 } 247 return c; 248 } 249 250 #ifndef SMALL 251 void 252 fgrepcomp(fastgrep_t *fg, const unsigned char *pat) 253 { 254 int i; 255 256 /* Initialize. */ 257 fg->patternLen = strlen((const char *)pat); 258 fg->bol = 0; 259 fg->eol = 0; 260 fg->wmatch = wflag; 261 fg->reversedSearch = 0; 262 263 /* 264 * Make a copy and upper case it for later if in -i mode, 265 * else just copy the pointer. 266 */ 267 if (iflag) { 268 fg->pattern = grep_malloc(fg->patternLen + 1); 269 for (i = 0; i < fg->patternLen; i++) 270 fg->pattern[i] = toupper(pat[i]); 271 fg->pattern[fg->patternLen] = '\0'; 272 } else 273 fg->pattern = __UNCONST(pat); /* really const */ 274 275 /* Preprocess pattern. */ 276 for (i = 0; i <= UCHAR_MAX; i++) 277 fg->qsBc[i] = fg->patternLen; 278 for (i = 1; i < fg->patternLen; i++) { 279 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 280 /* 281 * If case is ignored, make the jump apply to both upper and 282 * lower cased characters. As the pattern is stored in upper 283 * case, apply the same to the lower case equivalents. 284 */ 285 if (iflag) 286 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 287 } 288 } 289 #endif 290 291 /* 292 * Returns: -1 on failure, 0 on success 293 */ 294 int 295 fastcomp(fastgrep_t *fg, const char *pat) 296 { 297 #ifdef SMALL 298 return -1; 299 #else 300 int i; 301 int bol = 0; 302 int eol = 0; 303 int shiftPatternLen; 304 int hasDot = 0; 305 int firstHalfDot = -1; 306 int firstLastHalfDot = -1; 307 int lastHalfDot = 0; 308 309 /* Initialize. */ 310 fg->patternLen = strlen(pat); 311 fg->bol = 0; 312 fg->eol = 0; 313 fg->wmatch = 0; 314 fg->reversedSearch = 0; 315 316 /* Remove end-of-line character ('$'). */ 317 if (fg->patternLen > 0 && pat[fg->patternLen - 1] == '$') { 318 eol++; 319 fg->eol = 1; 320 fg->patternLen--; 321 } 322 323 /* Remove beginning-of-line character ('^'). */ 324 if (pat[0] == '^') { 325 bol++; 326 fg->bol = 1; 327 fg->patternLen--; 328 } 329 330 /* Remove enclosing [[:<:]] and [[:>:]] (word match). */ 331 if (wflag) { 332 /* basic re's use \( \), extended re's ( ) */ 333 int extra = Eflag ? 1 : 2; 334 fg->patternLen -= 14 + 2 * extra; 335 fg->wmatch = 7 + extra; 336 } else if (fg->patternLen >= 14 && 337 strncmp(pat + fg->bol, "[[:<:]]", 7) == 0 && 338 strncmp(pat + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) { 339 fg->patternLen -= 14; 340 fg->wmatch = 7; 341 } 342 343 /* 344 * Copy pattern minus '^' and '$' characters as well as word 345 * match character classes at the beginning and ending of the 346 * string respectively. 347 */ 348 fg->pattern = grep_malloc(fg->patternLen + 1); 349 memcpy(fg->pattern, pat + bol + fg->wmatch, fg->patternLen); 350 fg->pattern[fg->patternLen] = '\0'; 351 352 /* Look for ways to cheat...er...avoid the full regex engine. */ 353 for (i = 0; i < fg->patternLen; i++) 354 { 355 switch (fg->pattern[i]) { 356 case '.': 357 hasDot = i; 358 if (i < fg->patternLen / 2) { 359 if (firstHalfDot < 0) 360 /* Closest dot to the beginning */ 361 firstHalfDot = i; 362 } else { 363 /* Closest dot to the end of the pattern. */ 364 lastHalfDot = i; 365 if (firstLastHalfDot < 0) 366 firstLastHalfDot = i; 367 } 368 break; 369 case '(': case ')': 370 case '{': case '}': 371 /* Special in BRE if preceded by '\\' */ 372 case '?': 373 case '+': 374 case '|': 375 /* Not special in BRE. */ 376 if (!Eflag) 377 goto nonspecial; 378 case '\\': 379 case '*': 380 case '[': case ']': 381 /* Free memory and let others know this is empty. */ 382 free(fg->pattern); 383 fg->pattern = NULL; 384 return (-1); 385 default: 386 nonspecial: 387 if (iflag) 388 fg->pattern[i] = toupper(fg->pattern[i]); 389 break; 390 } 391 } 392 393 /* 394 * Determine if a reverse search would be faster based on the placement 395 * of the dots. 396 */ 397 if ((!(lflag || cflag)) && ((!(bol || eol)) && 398 ((lastHalfDot) && ((firstHalfDot < 0) || 399 ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) { 400 fg->reversedSearch = 1; 401 hasDot = fg->patternLen - (firstHalfDot < 0 ? 402 firstLastHalfDot : firstHalfDot) - 1; 403 grep_revstr(fg->pattern, fg->patternLen); 404 } 405 406 /* 407 * Normal Quick Search would require a shift based on the position the 408 * next character after the comparison is within the pattern. With 409 * wildcards, the position of the last dot effects the maximum shift 410 * distance. 411 * The closer to the end the wild card is the slower the search. A 412 * reverse version of this algorithm would be useful for wildcards near 413 * the end of the string. 414 * 415 * Examples: 416 * Pattern Max shift 417 * ------- --------- 418 * this 5 419 * .his 4 420 * t.is 3 421 * th.s 2 422 * thi. 1 423 */ 424 425 /* Adjust the shift based on location of the last dot ('.'). */ 426 shiftPatternLen = fg->patternLen - hasDot; 427 428 /* Preprocess pattern. */ 429 for (i = 0; i <= UCHAR_MAX; i++) 430 fg->qsBc[i] = shiftPatternLen; 431 for (i = hasDot + 1; i < fg->patternLen; i++) { 432 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 433 /* 434 * If case is ignored, make the jump apply to both upper and 435 * lower cased characters. As the pattern is stored in upper 436 * case, apply the same to the lower case equivalents. 437 */ 438 if (iflag) 439 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 440 } 441 442 /* 443 * Put pattern back to normal after pre-processing to allow for easy 444 * comparisons later. 445 */ 446 if (fg->reversedSearch) 447 grep_revstr(fg->pattern, fg->patternLen); 448 449 return (0); 450 #endif 451 } 452 453 /* 454 * Word boundaries using regular expressions are defined as the point 455 * of transition from a non-word char to a word char, or vice versa. 456 * This means that grep -w +a and grep -w a+ never match anything, 457 * because they lack a starting or ending transition, but grep -w a+b 458 * does match a line containing a+b. 459 */ 460 #define wmatch(d, l, s, e) \ 461 ((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \ 462 e > s && isword(d[s]) && isword(d[e-1])) 463 464 static int 465 grep_search(fastgrep_t *fg, unsigned char *data, size_t dataLen, 466 regmatch_t *pmatch) 467 { 468 #ifdef SMALL 469 return 0; 470 #else 471 regoff_t j; 472 int rtrnVal = REG_NOMATCH; 473 474 pmatch->rm_so = -1; 475 pmatch->rm_eo = -1; 476 477 /* No point in going farther if we do not have enough data. */ 478 if (dataLen < (size_t)fg->patternLen) 479 return (rtrnVal); 480 481 /* Only try once at the beginning or ending of the line. */ 482 if (fg->bol || fg->eol) { 483 /* Simple text comparison. */ 484 /* Verify data is >= pattern length before searching on it. */ 485 if (dataLen >= (size_t)fg->patternLen) { 486 /* Determine where in data to start search at. */ 487 if (fg->eol) 488 j = dataLen - fg->patternLen; 489 else 490 j = 0; 491 if (!((fg->bol && fg->eol) && 492 (dataLen != (size_t)fg->patternLen))) 493 if (grep_cmp(fg->pattern, data + j, 494 fg->patternLen) == -1) { 495 pmatch->rm_so = j; 496 pmatch->rm_eo = j + fg->patternLen; 497 if (!fg->wmatch || wmatch(data, dataLen, 498 pmatch->rm_so, pmatch->rm_eo)) 499 rtrnVal = 0; 500 } 501 } 502 } else if (fg->reversedSearch) { 503 /* Quick Search algorithm. */ 504 j = dataLen; 505 do { 506 if (grep_cmp(fg->pattern, data + j - fg->patternLen, 507 fg->patternLen) == -1) { 508 pmatch->rm_so = j - fg->patternLen; 509 pmatch->rm_eo = j; 510 if (!fg->wmatch || wmatch(data, dataLen, 511 pmatch->rm_so, pmatch->rm_eo)) { 512 rtrnVal = 0; 513 break; 514 } 515 } 516 /* Shift if within bounds, otherwise, we are done. */ 517 if (j == fg->patternLen) 518 break; 519 j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]]; 520 } while (j >= fg->patternLen); 521 } else { 522 /* Quick Search algorithm. */ 523 j = 0; 524 do { 525 if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) { 526 pmatch->rm_so = j; 527 pmatch->rm_eo = j + fg->patternLen; 528 if (fg->patternLen == 0 || !fg->wmatch || 529 wmatch(data, dataLen, pmatch->rm_so, 530 pmatch->rm_eo)) { 531 rtrnVal = 0; 532 break; 533 } 534 } 535 536 /* Shift if within bounds, otherwise, we are done. */ 537 if (j + fg->patternLen == dataLen) 538 break; 539 else 540 j += fg->qsBc[(unsigned char)data[j + fg->patternLen]]; 541 } while (j <= (dataLen - fg->patternLen)); 542 } 543 544 return (rtrnVal); 545 #endif 546 } 547 548 549 void * 550 grep_malloc(size_t size) 551 { 552 void *ptr; 553 554 if ((ptr = malloc(size)) == NULL) 555 err(2, "malloc"); 556 return ptr; 557 } 558 559 void * 560 grep_calloc(size_t nmemb, size_t size) 561 { 562 void *ptr; 563 564 if ((ptr = calloc(nmemb, size)) == NULL) 565 err(2, "calloc"); 566 return ptr; 567 } 568 569 void * 570 grep_realloc(void *ptr, size_t size) 571 { 572 if ((ptr = realloc(ptr, size)) == NULL) 573 err(2, "realloc"); 574 return ptr; 575 } 576 577 #ifndef SMALL 578 /* 579 * Returns: i >= 0 on failure (position that it failed) 580 * -1 on success 581 */ 582 static int 583 grep_cmp(const unsigned char *pat, const unsigned char *data, size_t len) 584 { 585 size_t i; 586 587 for (i = 0; i < len; i++) { 588 if (((pat[i] == data[i]) || (!Fflag && pat[i] == '.')) 589 || (iflag && pat[i] == toupper(data[i]))) 590 continue; 591 return (i); 592 } 593 594 return (-1); 595 } 596 597 static void 598 grep_revstr(unsigned char *str, int len) 599 { 600 int i; 601 char c; 602 603 for (i = 0; i < len / 2; i++) { 604 c = str[i]; 605 str[i] = str[len - i - 1]; 606 str[len - i - 1] = c; 607 } 608 } 609 #endif 610 611 void 612 printline(str_t *line, int sep, regmatch_t *pmatch) 613 { 614 int n; 615 616 n = 0; 617 if (!hflag) { 618 fputs(line->file, stdout); 619 ++n; 620 } 621 if (nflag) { 622 if (n) 623 putchar(sep); 624 printf("%d", line->line_no); 625 ++n; 626 } 627 if (bflag) { 628 if (n) 629 putchar(sep); 630 printf("%lld", (long long)line->off); 631 ++n; 632 } 633 if (n) 634 putchar(sep); 635 if (pmatch) 636 fwrite(line->dat + pmatch->rm_so, 637 pmatch->rm_eo - pmatch->rm_so, 1, stdout); 638 else 639 fwrite(line->dat, line->len, 1, stdout); 640 putchar('\n'); 641 } 642