1 /* $OpenBSD: util.c,v 1.68 2023/11/15 00:50:43 millert Exp $ */ 2 3 /*- 4 * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/types.h> 30 #include <sys/stat.h> 31 32 #include <ctype.h> 33 #include <err.h> 34 #include <errno.h> 35 #include <fts.h> 36 #include <regex.h> 37 #include <stdbool.h> 38 #include <stdio.h> 39 #include <stdlib.h> 40 #include <string.h> 41 #include <unistd.h> 42 #include <zlib.h> 43 44 #include "grep.h" 45 46 /* 47 * Process a file line by line... 48 */ 49 50 static int linesqueued; 51 static int procline(str_t *l, int); 52 static int grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch, int); 53 #ifndef SMALL 54 static bool grep_cmp(const char *, const char *, size_t); 55 static void grep_revstr(unsigned char *, int); 56 #endif 57 58 int 59 grep_tree(char **argv) 60 { 61 FTS *fts; 62 FTSENT *p; 63 int c, fts_flags; 64 char *dot_argv[] = { ".", NULL }; 65 char *path; 66 67 /* default to . if no path given */ 68 if (argv[0] == NULL) 69 argv = dot_argv; 70 71 c = 0; 72 73 fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR; 74 75 if (!(fts = fts_open(argv, fts_flags, NULL))) 76 err(2, NULL); 77 while ((p = fts_read(fts)) != NULL) { 78 switch (p->fts_info) { 79 case FTS_DNR: 80 break; 81 case FTS_ERR: 82 file_err = 1; 83 if(!sflag) 84 warnc(p->fts_errno, "%s", p->fts_path); 85 break; 86 case FTS_D: 87 case FTS_DP: 88 break; 89 default: 90 path = p->fts_path; 91 /* skip "./" if implied */ 92 if (argv == dot_argv && p->fts_pathlen >= 2) 93 path += 2; 94 c |= procfile(path); 95 break; 96 } 97 } 98 if (errno) 99 err(2, "fts_read"); 100 fts_close(fts); 101 return c; 102 } 103 104 int 105 procfile(char *fn) 106 { 107 str_t ln; 108 file_t *f; 109 int t, z, nottext, overflow = 0; 110 unsigned long long c; 111 112 mcount = mlimit; 113 114 if (fn == NULL) { 115 fn = "(standard input)"; 116 f = grep_fdopen(STDIN_FILENO); 117 } else { 118 f = grep_open(fn); 119 } 120 if (f == NULL) { 121 if (errno == EISDIR) 122 return 0; 123 file_err = 1; 124 if (!sflag) 125 warn("%s", fn); 126 return 0; 127 } 128 129 nottext = grep_bin_file(f); 130 if (nottext && binbehave == BIN_FILE_SKIP) { 131 grep_close(f); 132 return 0; 133 } 134 135 ln.file = fn; 136 if (labelname) 137 ln.file = (char *)labelname; 138 ln.line_no = 0; 139 ln.len = 0; 140 linesqueued = 0; 141 tail = 0; 142 ln.off = -1; 143 144 if (Bflag > 0) 145 initqueue(); 146 for (c = 0; c == 0 || !(lflag || qflag); ) { 147 if (mflag && mlimit == 0) 148 break; 149 ln.off += ln.len + 1; 150 if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL) 151 break; 152 if (ln.len > 0 && ln.dat[ln.len - 1] == '\n') 153 --ln.len; 154 ln.line_no++; 155 156 z = tail; 157 158 if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) { 159 enqueue(&ln); 160 linesqueued++; 161 } 162 if (ULLONG_MAX - c < (unsigned long long)t) 163 overflow = 1; 164 else 165 c += t; 166 if (mflag && mcount <= 0 && tail <= 0) 167 break; 168 } 169 if (Bflag > 0) 170 clearqueue(); 171 grep_close(f); 172 173 if (cflag) { 174 if (!hflag) 175 printf("%s%c", ln.file, nullflag ? '\0' : ':'); 176 printf("%llu%s\n", c, overflow ? "+" : ""); 177 } 178 if (lflag && c != 0) 179 printf("%s%c", fn, nullflag ? '\0' : '\n'); 180 if (Lflag && c == 0) 181 printf("%s%c", fn, nullflag ? '\0' : '\n'); 182 if (c && !cflag && !lflag && !Lflag && 183 binbehave == BIN_FILE_BIN && nottext && !qflag) 184 printf("Binary file %s matches\n", fn); 185 186 return overflow || c != 0; 187 } 188 189 190 /* 191 * Process an individual line in a file. Return non-zero if it matches. 192 */ 193 194 #define isword(x) (isalnum((unsigned char)x) || (x) == '_') 195 196 static int 197 procline(str_t *l, int nottext) 198 { 199 regmatch_t pmatch = { 0 }; 200 int c, i, r, counted; 201 regoff_t offset; 202 203 /* size_t will be converted to regoff_t. ssize_t is guaranteed to fit 204 * into regoff_t */ 205 if (l->len > SSIZE_MAX) { 206 errx(2, "Line is too big to process"); 207 } 208 209 c = 0; 210 i = 0; 211 counted = 0; 212 if (matchall) { 213 c = 1; 214 goto print; 215 } 216 if (mflag && mcount <= 0) 217 goto print; 218 219 for (i = 0; i < patterns; i++) { 220 offset = 0; 221 redo: 222 if (fg_pattern[i].pattern) { 223 int flags = 0; 224 if (offset) 225 flags |= REG_NOTBOL; 226 r = grep_search(&fg_pattern[i], l->dat + offset, 227 l->len - offset, &pmatch, flags); 228 pmatch.rm_so += offset; 229 pmatch.rm_eo += offset; 230 } else { 231 int flags = eflags; 232 if (offset) 233 flags |= REG_NOTBOL; 234 pmatch.rm_so = offset; 235 pmatch.rm_eo = l->len; 236 r = regexec(&r_pattern[i], l->dat, 1, &pmatch, flags); 237 } 238 if (r == 0 && xflag) { 239 if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len) 240 r = REG_NOMATCH; 241 } 242 if (r == 0) { 243 c = 1; 244 if (oflag && pmatch.rm_so != pmatch.rm_eo) 245 goto print; 246 break; 247 } 248 } 249 if (oflag) 250 return c; 251 print: 252 if (vflag) 253 c = !c; 254 255 /* Count the matches if there is a match limit (but only once). */ 256 if (mflag && !counted) { 257 mcount -= c; 258 counted = 1; 259 } 260 261 if (c && binbehave == BIN_FILE_BIN && nottext) 262 return c; /* Binary file */ 263 264 if ((tail > 0 || c) && !cflag && !qflag) { 265 if (c) { 266 if (first > 0 && tail == 0 && (Aflag || (Bflag && 267 Bflag < linesqueued))) 268 printf("--\n"); 269 first = 1; 270 tail = Aflag; 271 if (Bflag > 0) 272 printqueue(); 273 linesqueued = 0; 274 printline(l, ':', oflag ? &pmatch : NULL); 275 } else { 276 printline(l, '-', oflag ? &pmatch : NULL); 277 tail--; 278 } 279 } 280 if (oflag && !matchall) { 281 offset = pmatch.rm_eo; 282 goto redo; 283 } 284 return c; 285 } 286 287 #ifndef SMALL 288 void 289 fgrepcomp(fastgrep_t *fg, const unsigned char *pattern) 290 { 291 int i; 292 293 /* Initialize. */ 294 fg->patternLen = strlen(pattern); 295 fg->bol = 0; 296 fg->eol = 0; 297 fg->wmatch = wflag; 298 fg->reversedSearch = 0; 299 300 /* 301 * Make a copy and upper case it for later if in -i mode, 302 * else just copy the pointer. 303 */ 304 if (iflag) { 305 fg->pattern = grep_malloc(fg->patternLen + 1); 306 for (i = 0; i < fg->patternLen; i++) 307 fg->pattern[i] = toupper(pattern[i]); 308 fg->pattern[fg->patternLen] = '\0'; 309 } else 310 fg->pattern = (unsigned char *)pattern; /* really const */ 311 312 /* Preprocess pattern. */ 313 for (i = 0; i <= UCHAR_MAX; i++) 314 fg->qsBc[i] = fg->patternLen; 315 for (i = 1; i < fg->patternLen; i++) { 316 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 317 /* 318 * If case is ignored, make the jump apply to both upper and 319 * lower cased characters. As the pattern is stored in upper 320 * case, apply the same to the lower case equivalents. 321 */ 322 if (iflag) 323 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 324 } 325 } 326 #endif 327 328 /* 329 * Returns: -1 on failure, 0 on success 330 */ 331 int 332 fastcomp(fastgrep_t *fg, const char *pattern) 333 { 334 #ifdef SMALL 335 return -1; 336 #else 337 int i; 338 int bol = 0; 339 int eol = 0; 340 int shiftPatternLen; 341 int hasDot = 0; 342 int firstHalfDot = -1; 343 int firstLastHalfDot = -1; 344 int lastHalfDot = 0; 345 346 /* Initialize. */ 347 fg->patternLen = strlen(pattern); 348 fg->bol = 0; 349 fg->eol = 0; 350 fg->wmatch = 0; 351 fg->reversedSearch = 0; 352 353 /* Remove end-of-line character ('$'). */ 354 if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') { 355 eol++; 356 fg->eol = 1; 357 fg->patternLen--; 358 } 359 360 /* Remove beginning-of-line character ('^'). */ 361 if (pattern[0] == '^') { 362 bol++; 363 fg->bol = 1; 364 fg->patternLen--; 365 } 366 367 /* Remove enclosing [[:<:]] and [[:>:]] (word match). */ 368 if (wflag) { 369 /* basic re's use \( \), extended re's ( ) */ 370 int extra = Eflag ? 1 : 2; 371 fg->patternLen -= 14 + 2 * extra; 372 fg->wmatch = 7 + extra; 373 } else if (fg->patternLen >= 14 && 374 strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 && 375 strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) { 376 fg->patternLen -= 14; 377 fg->wmatch = 7; 378 } 379 380 /* 381 * Copy pattern minus '^' and '$' characters as well as word 382 * match character classes at the beginning and ending of the 383 * string respectively. 384 */ 385 fg->pattern = grep_malloc(fg->patternLen + 1); 386 memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen); 387 fg->pattern[fg->patternLen] = '\0'; 388 389 /* Look for ways to cheat...er...avoid the full regex engine. */ 390 for (i = 0; i < fg->patternLen; i++) 391 { 392 switch (fg->pattern[i]) { 393 case '.': 394 hasDot = i; 395 if (i < fg->patternLen / 2) { 396 if (firstHalfDot < 0) 397 /* Closest dot to the beginning */ 398 firstHalfDot = i; 399 } else { 400 /* Closest dot to the end of the pattern. */ 401 lastHalfDot = i; 402 if (firstLastHalfDot < 0) 403 firstLastHalfDot = i; 404 } 405 break; 406 case '(': case ')': 407 case '{': case '}': 408 /* Special in BRE if preceded by '\\' */ 409 case '?': 410 case '+': 411 case '|': 412 /* Not special in BRE. */ 413 if (!Eflag) 414 goto nonspecial; 415 case '\\': 416 case '*': 417 case '[': case ']': 418 /* Free memory and let others know this is empty. */ 419 free(fg->pattern); 420 fg->pattern = NULL; 421 return (-1); 422 default: 423 nonspecial: 424 if (iflag) 425 fg->pattern[i] = toupper(fg->pattern[i]); 426 break; 427 } 428 } 429 430 /* 431 * Determine if a reverse search would be faster based on the placement 432 * of the dots. 433 */ 434 if ((!(lflag || cflag || oflag)) && ((!(bol || eol)) && 435 ((lastHalfDot) && ((firstHalfDot < 0) || 436 ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) { 437 fg->reversedSearch = 1; 438 hasDot = fg->patternLen - (firstHalfDot < 0 ? 439 firstLastHalfDot : firstHalfDot) - 1; 440 grep_revstr(fg->pattern, fg->patternLen); 441 } 442 443 /* 444 * Normal Quick Search would require a shift based on the position the 445 * next character after the comparison is within the pattern. With 446 * wildcards, the position of the last dot effects the maximum shift 447 * distance. 448 * The closer to the end the wild card is the slower the search. A 449 * reverse version of this algorithm would be useful for wildcards near 450 * the end of the string. 451 * 452 * Examples: 453 * Pattern Max shift 454 * ------- --------- 455 * this 5 456 * .his 4 457 * t.is 3 458 * th.s 2 459 * thi. 1 460 */ 461 462 /* Adjust the shift based on location of the last dot ('.'). */ 463 shiftPatternLen = fg->patternLen - hasDot; 464 465 /* Preprocess pattern. */ 466 for (i = 0; i <= UCHAR_MAX; i++) 467 fg->qsBc[i] = shiftPatternLen; 468 for (i = hasDot + 1; i < fg->patternLen; i++) { 469 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 470 /* 471 * If case is ignored, make the jump apply to both upper and 472 * lower cased characters. As the pattern is stored in upper 473 * case, apply the same to the lower case equivalents. 474 */ 475 if (iflag) 476 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 477 } 478 479 /* 480 * Put pattern back to normal after pre-processing to allow for easy 481 * comparisons later. 482 */ 483 if (fg->reversedSearch) 484 grep_revstr(fg->pattern, fg->patternLen); 485 486 return (0); 487 #endif 488 } 489 490 /* 491 * Word boundaries using regular expressions are defined as the point 492 * of transition from a non-word char to a word char, or vice versa. 493 * This means that grep -w +a and grep -w a+ never match anything, 494 * because they lack a starting or ending transition, but grep -w a+b 495 * does match a line containing a+b. 496 */ 497 #define wmatch(d, l, s, e) \ 498 ((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \ 499 e > s && isword(d[s]) && isword(d[e-1])) 500 501 static int 502 grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch, 503 int flags) 504 { 505 #ifdef SMALL 506 return 0; 507 #else 508 regoff_t j; 509 int rtrnVal = REG_NOMATCH; 510 511 pmatch->rm_so = -1; 512 pmatch->rm_eo = -1; 513 514 /* No point in going farther if we do not have enough data. */ 515 if (dataLen < fg->patternLen) 516 return (rtrnVal); 517 518 /* Only try once at the beginning or ending of the line. */ 519 if (fg->bol || fg->eol) { 520 if (fg->bol && (flags & REG_NOTBOL)) 521 return 0; 522 /* Simple text comparison. */ 523 /* Verify data is >= pattern length before searching on it. */ 524 if (dataLen >= fg->patternLen) { 525 /* Determine where in data to start search at. */ 526 if (fg->eol) 527 j = dataLen - fg->patternLen; 528 else 529 j = 0; 530 if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen))) 531 if (grep_cmp(fg->pattern, data + j, 532 fg->patternLen)) { 533 pmatch->rm_so = j; 534 pmatch->rm_eo = j + fg->patternLen; 535 if (!fg->wmatch || wmatch(data, dataLen, 536 pmatch->rm_so, pmatch->rm_eo)) 537 rtrnVal = 0; 538 } 539 } 540 } else if (fg->reversedSearch) { 541 /* Quick Search algorithm. */ 542 j = dataLen; 543 do { 544 if (grep_cmp(fg->pattern, data + j - fg->patternLen, 545 fg->patternLen)) { 546 pmatch->rm_so = j - fg->patternLen; 547 pmatch->rm_eo = j; 548 if (!fg->wmatch || wmatch(data, dataLen, 549 pmatch->rm_so, pmatch->rm_eo)) { 550 rtrnVal = 0; 551 break; 552 } 553 } 554 /* Shift if within bounds, otherwise, we are done. */ 555 if (j == fg->patternLen) 556 break; 557 j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]]; 558 } while (j >= fg->patternLen); 559 } else { 560 /* Quick Search algorithm. */ 561 j = 0; 562 do { 563 if (grep_cmp(fg->pattern, data + j, fg->patternLen)) { 564 pmatch->rm_so = j; 565 pmatch->rm_eo = j + fg->patternLen; 566 if (fg->patternLen == 0 || !fg->wmatch || 567 wmatch(data, dataLen, pmatch->rm_so, 568 pmatch->rm_eo)) { 569 rtrnVal = 0; 570 break; 571 } 572 } 573 574 /* Shift if within bounds, otherwise, we are done. */ 575 if (j + fg->patternLen == dataLen) 576 break; 577 else 578 j += fg->qsBc[(unsigned char)data[j + fg->patternLen]]; 579 } while (j <= (dataLen - fg->patternLen)); 580 } 581 582 return (rtrnVal); 583 #endif 584 } 585 586 587 void * 588 grep_malloc(size_t size) 589 { 590 void *ptr; 591 592 if ((ptr = malloc(size)) == NULL) 593 err(2, "malloc"); 594 return ptr; 595 } 596 597 void * 598 grep_calloc(size_t nmemb, size_t size) 599 { 600 void *ptr; 601 602 if ((ptr = calloc(nmemb, size)) == NULL) 603 err(2, "calloc"); 604 return ptr; 605 } 606 607 void * 608 grep_realloc(void *ptr, size_t size) 609 { 610 if ((ptr = realloc(ptr, size)) == NULL) 611 err(2, "realloc"); 612 return ptr; 613 } 614 615 void * 616 grep_reallocarray(void *ptr, size_t nmemb, size_t size) 617 { 618 if ((ptr = reallocarray(ptr, nmemb, size)) == NULL) 619 err(2, "reallocarray"); 620 return ptr; 621 } 622 623 #ifndef SMALL 624 /* 625 * Returns: true on success, false on failure 626 */ 627 static bool 628 grep_cmp(const char *pattern, const char *data, size_t len) 629 { 630 size_t i; 631 632 for (i = 0; i < len; i++) { 633 if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.')) 634 || (iflag && pattern[i] == toupper((unsigned char)data[i]))) 635 continue; 636 return false; 637 } 638 639 return true; 640 } 641 642 static void 643 grep_revstr(unsigned char *str, int len) 644 { 645 int i; 646 char c; 647 648 for (i = 0; i < len / 2; i++) { 649 c = str[i]; 650 str[i] = str[len - i - 1]; 651 str[len - i - 1] = c; 652 } 653 } 654 #endif 655 656 void 657 printline(str_t *line, int sep, regmatch_t *pmatch) 658 { 659 int n; 660 661 n = 0; 662 if (!hflag) { 663 fputs(line->file, stdout); 664 if (nullflag) 665 putchar(0); 666 else 667 ++n; 668 } 669 if (nflag) { 670 if (n) 671 putchar(sep); 672 printf("%lld", line->line_no); 673 ++n; 674 } 675 if (bflag) { 676 if (n) 677 putchar(sep); 678 printf("%lld", (long long)line->off + 679 (pmatch ? pmatch->rm_so : 0)); 680 ++n; 681 } 682 if (n) 683 putchar(sep); 684 if (pmatch) 685 fwrite(line->dat + pmatch->rm_so, 686 pmatch->rm_eo - pmatch->rm_so, 1, stdout); 687 else 688 fwrite(line->dat, line->len, 1, stdout); 689 putchar('\n'); 690 } 691