1 /* $OpenBSD: util.c,v 1.61 2019/10/07 17:47:32 tedu Exp $ */ 2 3 /*- 4 * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/types.h> 30 #include <sys/stat.h> 31 32 #include <ctype.h> 33 #include <err.h> 34 #include <errno.h> 35 #include <fts.h> 36 #include <regex.h> 37 #include <stdbool.h> 38 #include <stdio.h> 39 #include <stdlib.h> 40 #include <string.h> 41 #include <unistd.h> 42 #include <zlib.h> 43 44 #include "grep.h" 45 46 /* 47 * Process a file line by line... 48 */ 49 50 static int linesqueued; 51 static int procline(str_t *l, int); 52 static int grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch, int); 53 #ifndef SMALL 54 static bool grep_cmp(const char *, const char *, size_t); 55 static void grep_revstr(unsigned char *, int); 56 #endif 57 58 int 59 grep_tree(char **argv) 60 { 61 FTS *fts; 62 FTSENT *p; 63 int c, fts_flags; 64 65 c = 0; 66 67 fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR; 68 69 if (!(fts = fts_open(argv, fts_flags, NULL))) 70 err(2, NULL); 71 while ((p = fts_read(fts)) != NULL) { 72 switch (p->fts_info) { 73 case FTS_DNR: 74 break; 75 case FTS_ERR: 76 file_err = 1; 77 if(!sflag) 78 warnc(p->fts_errno, "%s", p->fts_path); 79 break; 80 case FTS_D: 81 case FTS_DP: 82 break; 83 default: 84 c += procfile(p->fts_path); 85 break; 86 } 87 } 88 if (errno) 89 err(2, "fts_read"); 90 fts_close(fts); 91 return c; 92 } 93 94 int 95 procfile(char *fn) 96 { 97 str_t ln; 98 file_t *f; 99 int c, t, z, nottext; 100 101 mcount = mlimit; 102 103 if (fn == NULL) { 104 fn = "(standard input)"; 105 f = grep_fdopen(STDIN_FILENO); 106 } else { 107 f = grep_open(fn); 108 } 109 if (f == NULL) { 110 if (errno == EISDIR) 111 return 0; 112 file_err = 1; 113 if (!sflag) 114 warn("%s", fn); 115 return 0; 116 } 117 118 nottext = grep_bin_file(f); 119 if (nottext && binbehave == BIN_FILE_SKIP) { 120 grep_close(f); 121 return 0; 122 } 123 124 ln.file = fn; 125 if (labelname) 126 ln.file = (char *)labelname; 127 ln.line_no = 0; 128 ln.len = 0; 129 linesqueued = 0; 130 tail = 0; 131 ln.off = -1; 132 133 if (Bflag > 0) 134 initqueue(); 135 for (c = 0; c == 0 || !(lflag || qflag); ) { 136 if (mflag && mlimit == 0) 137 break; 138 ln.off += ln.len + 1; 139 if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL) 140 break; 141 if (ln.len > 0 && ln.dat[ln.len - 1] == '\n') 142 --ln.len; 143 ln.line_no++; 144 145 z = tail; 146 147 if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) { 148 enqueue(&ln); 149 linesqueued++; 150 } 151 c += t; 152 if (mflag && mcount <= 0) 153 break; 154 } 155 if (Bflag > 0) 156 clearqueue(); 157 grep_close(f); 158 159 if (cflag) { 160 if (!hflag) 161 printf("%s:", ln.file); 162 printf("%u\n", c); 163 } 164 if (lflag && c != 0) 165 printf("%s\n", fn); 166 if (Lflag && c == 0) 167 printf("%s\n", fn); 168 if (c && !cflag && !lflag && !Lflag && 169 binbehave == BIN_FILE_BIN && nottext && !qflag) 170 printf("Binary file %s matches\n", fn); 171 172 return c; 173 } 174 175 176 /* 177 * Process an individual line in a file. Return non-zero if it matches. 178 */ 179 180 #define isword(x) (isalnum((unsigned char)x) || (x) == '_') 181 182 static int 183 procline(str_t *l, int nottext) 184 { 185 regmatch_t pmatch = { 0 }; 186 int c, i, r; 187 regoff_t offset; 188 189 /* size_t will be converted to regoff_t. ssize_t is guaranteed to fit 190 * into regoff_t */ 191 if (l->len > SSIZE_MAX) { 192 errx(2, "Line is too big to process"); 193 } 194 195 c = 0; 196 i = 0; 197 if (matchall) { 198 c = 1; 199 goto print; 200 } 201 202 for (i = 0; i < patterns; i++) { 203 offset = 0; 204 redo: 205 if (fg_pattern[i].pattern) { 206 int flags = 0; 207 if (offset) 208 flags |= REG_NOTBOL; 209 r = grep_search(&fg_pattern[i], l->dat + offset, 210 l->len - offset, &pmatch, flags); 211 pmatch.rm_so += offset; 212 pmatch.rm_eo += offset; 213 } else { 214 int flags = eflags; 215 if (offset) 216 flags |= REG_NOTBOL; 217 pmatch.rm_so = offset; 218 pmatch.rm_eo = l->len; 219 r = regexec(&r_pattern[i], l->dat, 1, &pmatch, flags); 220 } 221 if (r == 0 && xflag) { 222 if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len) 223 r = REG_NOMATCH; 224 } 225 if (r == 0) { 226 c = 1; 227 if (oflag && pmatch.rm_so != pmatch.rm_eo) 228 goto print; 229 break; 230 } 231 } 232 if (oflag) 233 return c; 234 print: 235 if (vflag) 236 c = !c; 237 238 /* Count the matches if we have a match limit */ 239 if (mflag) 240 mcount -= c; 241 242 if (c && binbehave == BIN_FILE_BIN && nottext) 243 return c; /* Binary file */ 244 245 if ((tail > 0 || c) && !cflag && !qflag) { 246 if (c) { 247 if (first > 0 && tail == 0 && (Bflag < linesqueued) && 248 (Aflag || Bflag)) 249 printf("--\n"); 250 first = 1; 251 tail = Aflag; 252 if (Bflag > 0) 253 printqueue(); 254 linesqueued = 0; 255 printline(l, ':', oflag ? &pmatch : NULL); 256 } else { 257 printline(l, '-', oflag ? &pmatch : NULL); 258 tail--; 259 } 260 } 261 if (oflag && !matchall) { 262 offset = pmatch.rm_eo; 263 goto redo; 264 } 265 return c; 266 } 267 268 #ifndef SMALL 269 void 270 fgrepcomp(fastgrep_t *fg, const unsigned char *pattern) 271 { 272 int i; 273 274 /* Initialize. */ 275 fg->patternLen = strlen(pattern); 276 fg->bol = 0; 277 fg->eol = 0; 278 fg->wmatch = wflag; 279 fg->reversedSearch = 0; 280 281 /* 282 * Make a copy and upper case it for later if in -i mode, 283 * else just copy the pointer. 284 */ 285 if (iflag) { 286 fg->pattern = grep_malloc(fg->patternLen + 1); 287 for (i = 0; i < fg->patternLen; i++) 288 fg->pattern[i] = toupper(pattern[i]); 289 fg->pattern[fg->patternLen] = '\0'; 290 } else 291 fg->pattern = (unsigned char *)pattern; /* really const */ 292 293 /* Preprocess pattern. */ 294 for (i = 0; i <= UCHAR_MAX; i++) 295 fg->qsBc[i] = fg->patternLen; 296 for (i = 1; i < fg->patternLen; i++) { 297 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 298 /* 299 * If case is ignored, make the jump apply to both upper and 300 * lower cased characters. As the pattern is stored in upper 301 * case, apply the same to the lower case equivalents. 302 */ 303 if (iflag) 304 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 305 } 306 } 307 #endif 308 309 /* 310 * Returns: -1 on failure, 0 on success 311 */ 312 int 313 fastcomp(fastgrep_t *fg, const char *pattern) 314 { 315 #ifdef SMALL 316 return -1; 317 #else 318 int i; 319 int bol = 0; 320 int eol = 0; 321 int shiftPatternLen; 322 int hasDot = 0; 323 int firstHalfDot = -1; 324 int firstLastHalfDot = -1; 325 int lastHalfDot = 0; 326 327 /* Initialize. */ 328 fg->patternLen = strlen(pattern); 329 fg->bol = 0; 330 fg->eol = 0; 331 fg->wmatch = 0; 332 fg->reversedSearch = 0; 333 334 /* Remove end-of-line character ('$'). */ 335 if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') { 336 eol++; 337 fg->eol = 1; 338 fg->patternLen--; 339 } 340 341 /* Remove beginning-of-line character ('^'). */ 342 if (pattern[0] == '^') { 343 bol++; 344 fg->bol = 1; 345 fg->patternLen--; 346 } 347 348 /* Remove enclosing [[:<:]] and [[:>:]] (word match). */ 349 if (wflag) { 350 /* basic re's use \( \), extended re's ( ) */ 351 int extra = Eflag ? 1 : 2; 352 fg->patternLen -= 14 + 2 * extra; 353 fg->wmatch = 7 + extra; 354 } else if (fg->patternLen >= 14 && 355 strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 && 356 strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) { 357 fg->patternLen -= 14; 358 fg->wmatch = 7; 359 } 360 361 /* 362 * Copy pattern minus '^' and '$' characters as well as word 363 * match character classes at the beginning and ending of the 364 * string respectively. 365 */ 366 fg->pattern = grep_malloc(fg->patternLen + 1); 367 memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen); 368 fg->pattern[fg->patternLen] = '\0'; 369 370 /* Look for ways to cheat...er...avoid the full regex engine. */ 371 for (i = 0; i < fg->patternLen; i++) 372 { 373 switch (fg->pattern[i]) { 374 case '.': 375 hasDot = i; 376 if (i < fg->patternLen / 2) { 377 if (firstHalfDot < 0) 378 /* Closest dot to the beginning */ 379 firstHalfDot = i; 380 } else { 381 /* Closest dot to the end of the pattern. */ 382 lastHalfDot = i; 383 if (firstLastHalfDot < 0) 384 firstLastHalfDot = i; 385 } 386 break; 387 case '(': case ')': 388 case '{': case '}': 389 /* Special in BRE if preceded by '\\' */ 390 case '?': 391 case '+': 392 case '|': 393 /* Not special in BRE. */ 394 if (!Eflag) 395 goto nonspecial; 396 case '\\': 397 case '*': 398 case '[': case ']': 399 /* Free memory and let others know this is empty. */ 400 free(fg->pattern); 401 fg->pattern = NULL; 402 return (-1); 403 default: 404 nonspecial: 405 if (iflag) 406 fg->pattern[i] = toupper(fg->pattern[i]); 407 break; 408 } 409 } 410 411 /* 412 * Determine if a reverse search would be faster based on the placement 413 * of the dots. 414 */ 415 if ((!(lflag || cflag || oflag)) && ((!(bol || eol)) && 416 ((lastHalfDot) && ((firstHalfDot < 0) || 417 ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) { 418 fg->reversedSearch = 1; 419 hasDot = fg->patternLen - (firstHalfDot < 0 ? 420 firstLastHalfDot : firstHalfDot) - 1; 421 grep_revstr(fg->pattern, fg->patternLen); 422 } 423 424 /* 425 * Normal Quick Search would require a shift based on the position the 426 * next character after the comparison is within the pattern. With 427 * wildcards, the position of the last dot effects the maximum shift 428 * distance. 429 * The closer to the end the wild card is the slower the search. A 430 * reverse version of this algorithm would be useful for wildcards near 431 * the end of the string. 432 * 433 * Examples: 434 * Pattern Max shift 435 * ------- --------- 436 * this 5 437 * .his 4 438 * t.is 3 439 * th.s 2 440 * thi. 1 441 */ 442 443 /* Adjust the shift based on location of the last dot ('.'). */ 444 shiftPatternLen = fg->patternLen - hasDot; 445 446 /* Preprocess pattern. */ 447 for (i = 0; i <= UCHAR_MAX; i++) 448 fg->qsBc[i] = shiftPatternLen; 449 for (i = hasDot + 1; i < fg->patternLen; i++) { 450 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 451 /* 452 * If case is ignored, make the jump apply to both upper and 453 * lower cased characters. As the pattern is stored in upper 454 * case, apply the same to the lower case equivalents. 455 */ 456 if (iflag) 457 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 458 } 459 460 /* 461 * Put pattern back to normal after pre-processing to allow for easy 462 * comparisons later. 463 */ 464 if (fg->reversedSearch) 465 grep_revstr(fg->pattern, fg->patternLen); 466 467 return (0); 468 #endif 469 } 470 471 /* 472 * Word boundaries using regular expressions are defined as the point 473 * of transition from a non-word char to a word char, or vice versa. 474 * This means that grep -w +a and grep -w a+ never match anything, 475 * because they lack a starting or ending transition, but grep -w a+b 476 * does match a line containing a+b. 477 */ 478 #define wmatch(d, l, s, e) \ 479 ((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \ 480 e > s && isword(d[s]) && isword(d[e-1])) 481 482 static int 483 grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch, 484 int flags) 485 { 486 #ifdef SMALL 487 return 0; 488 #else 489 regoff_t j; 490 int rtrnVal = REG_NOMATCH; 491 492 pmatch->rm_so = -1; 493 pmatch->rm_eo = -1; 494 495 /* No point in going farther if we do not have enough data. */ 496 if (dataLen < fg->patternLen) 497 return (rtrnVal); 498 499 /* Only try once at the beginning or ending of the line. */ 500 if (fg->bol || fg->eol) { 501 if (fg->bol && (flags & REG_NOTBOL)) 502 return 0; 503 /* Simple text comparison. */ 504 /* Verify data is >= pattern length before searching on it. */ 505 if (dataLen >= fg->patternLen) { 506 /* Determine where in data to start search at. */ 507 if (fg->eol) 508 j = dataLen - fg->patternLen; 509 else 510 j = 0; 511 if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen))) 512 if (grep_cmp(fg->pattern, data + j, 513 fg->patternLen)) { 514 pmatch->rm_so = j; 515 pmatch->rm_eo = j + fg->patternLen; 516 if (!fg->wmatch || wmatch(data, dataLen, 517 pmatch->rm_so, pmatch->rm_eo)) 518 rtrnVal = 0; 519 } 520 } 521 } else if (fg->reversedSearch) { 522 /* Quick Search algorithm. */ 523 j = dataLen; 524 do { 525 if (grep_cmp(fg->pattern, data + j - fg->patternLen, 526 fg->patternLen)) { 527 pmatch->rm_so = j - fg->patternLen; 528 pmatch->rm_eo = j; 529 if (!fg->wmatch || wmatch(data, dataLen, 530 pmatch->rm_so, pmatch->rm_eo)) { 531 rtrnVal = 0; 532 break; 533 } 534 } 535 /* Shift if within bounds, otherwise, we are done. */ 536 if (j == fg->patternLen) 537 break; 538 j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]]; 539 } while (j >= fg->patternLen); 540 } else { 541 /* Quick Search algorithm. */ 542 j = 0; 543 do { 544 if (grep_cmp(fg->pattern, data + j, fg->patternLen)) { 545 pmatch->rm_so = j; 546 pmatch->rm_eo = j + fg->patternLen; 547 if (fg->patternLen == 0 || !fg->wmatch || 548 wmatch(data, dataLen, pmatch->rm_so, 549 pmatch->rm_eo)) { 550 rtrnVal = 0; 551 break; 552 } 553 } 554 555 /* Shift if within bounds, otherwise, we are done. */ 556 if (j + fg->patternLen == dataLen) 557 break; 558 else 559 j += fg->qsBc[(unsigned char)data[j + fg->patternLen]]; 560 } while (j <= (dataLen - fg->patternLen)); 561 } 562 563 return (rtrnVal); 564 #endif 565 } 566 567 568 void * 569 grep_malloc(size_t size) 570 { 571 void *ptr; 572 573 if ((ptr = malloc(size)) == NULL) 574 err(2, "malloc"); 575 return ptr; 576 } 577 578 void * 579 grep_calloc(size_t nmemb, size_t size) 580 { 581 void *ptr; 582 583 if ((ptr = calloc(nmemb, size)) == NULL) 584 err(2, "calloc"); 585 return ptr; 586 } 587 588 void * 589 grep_realloc(void *ptr, size_t size) 590 { 591 if ((ptr = realloc(ptr, size)) == NULL) 592 err(2, "realloc"); 593 return ptr; 594 } 595 596 void * 597 grep_reallocarray(void *ptr, size_t nmemb, size_t size) 598 { 599 if ((ptr = reallocarray(ptr, nmemb, size)) == NULL) 600 err(2, "reallocarray"); 601 return ptr; 602 } 603 604 #ifndef SMALL 605 /* 606 * Returns: true on success, false on failure 607 */ 608 static bool 609 grep_cmp(const char *pattern, const char *data, size_t len) 610 { 611 size_t i; 612 613 for (i = 0; i < len; i++) { 614 if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.')) 615 || (iflag && pattern[i] == toupper((unsigned char)data[i]))) 616 continue; 617 return false; 618 } 619 620 return true; 621 } 622 623 static void 624 grep_revstr(unsigned char *str, int len) 625 { 626 int i; 627 char c; 628 629 for (i = 0; i < len / 2; i++) { 630 c = str[i]; 631 str[i] = str[len - i - 1]; 632 str[len - i - 1] = c; 633 } 634 } 635 #endif 636 637 void 638 printline(str_t *line, int sep, regmatch_t *pmatch) 639 { 640 int n; 641 642 n = 0; 643 if (!hflag) { 644 fputs(line->file, stdout); 645 ++n; 646 } 647 if (nflag) { 648 if (n) 649 putchar(sep); 650 printf("%lld", line->line_no); 651 ++n; 652 } 653 if (bflag) { 654 if (n) 655 putchar(sep); 656 printf("%lld", (long long)line->off + 657 (pmatch ? pmatch->rm_so : 0)); 658 ++n; 659 } 660 if (n) 661 putchar(sep); 662 if (pmatch) 663 fwrite(line->dat + pmatch->rm_so, 664 pmatch->rm_eo - pmatch->rm_so, 1, stdout); 665 else 666 fwrite(line->dat, line->len, 1, stdout); 667 putchar('\n'); 668 } 669