1 #ifndef lint 2 static char sccsid[] = "@(#)egrep.c 5.13 (Berkeley) 02/27/91"; 3 #endif not lint 4 5 /* 6 Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0 7 table as in original paper (CACM, October, 1977). No delta1 or delta2. 8 According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of 9 minimal practical value. However, to improve for worst case input, 10 integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J. 11 Comput., Feb. 1986) deserves consideration. 12 13 Method: extract longest metacharacter-free string from expression. 14 this is done using a side-effect from henry spencer's regcomp(). 15 use boyer-moore to match such, then pass submatching lines 16 to either regexp() or standard 'egrep', depending on certain 17 criteria within execstrategy() below. [this tradeoff is due 18 to the general slowness of the regexp() nondeterministic 19 machine on complex expressions, as well as the startup time 20 of standard 'egrep' on short files.] alternatively, one may 21 change the vendor-supplied 'egrep' automaton to include 22 boyer-moore directly. see accompanying writeup for discussion 23 of kanji expression treatment. 24 25 late addition: apply trickbag for fast match of simple 26 alternations (sublinear, in common low-cardinality cases). 27 trap fgrep into this lair. 28 29 gnu additions: -f, newline as |, \< and \> [in regexec()], more 30 comments. inspire better dfa exec() strategy. 31 serious testing and help with special cases. 32 33 Algorithm amalgam summary: 34 35 dfa e?grep (aho/thompson) 36 ndfa regexp() (spencer/aho) 37 bmg (boyer/moore/gosper) 38 "superimposed" bmg (jaw) 39 fgrep (aho/corrasick) 40 41 sorry, but the knuth/morris/pratt machine, horspool's 42 "frequentist" code, and the rabin/karp matcher, however cute, 43 just don't cut it for this production. 44 45 James A. Woods Copyright (c) 1986 46 NASA Ames Research Center 47 */ 48 #include <sys/types.h> 49 #include <sys/stat.h> 50 #include <sys/file.h> 51 #include <regexp.h> /* must be henry spencer's version */ 52 #include <stdio.h> 53 #include <ctype.h> 54 #include "pathnames.h" 55 56 #define MIN(A, B) ((A) > (B) ? (B) : (A)) 57 58 #ifdef SLOWSYS 59 #define read xread 60 #endif 61 62 #define BUFSIZE 8192 /* make higher for cray */ 63 #define PATSIZE 6000 64 #define LARGE BUFSIZE + PATSIZE 65 66 #define NALT 7 /* tied to scanf() size in alternate() */ 67 #define NMUSH 6 /* loosely relates to expected alt length */ 68 69 #define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */ 70 #define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input 71 * was processed by regexp(), exec std egrep. */ 72 #define NL '\n' 73 #define EOS '\0' 74 #define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */ 75 #define META "\n^$.[]()?+*|\\" /* egrep meta-characters */ 76 #define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */ 77 #define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */ 78 79 extern char *optarg; 80 extern int optind; 81 char *progname; 82 83 int cflag, iflag, eflag, fflag, lflag, nflag; /* SVID flags */ 84 int sflag, hflag; /* v7, v8, bsd */ 85 86 int firstflag; /* Stop at first match */ 87 int grepflag; /* Called as "grep" */ 88 int fgrepflag; /* Called as "fgrep" */ 89 int altflag; /* Simple alternation in pattern */ 90 int boyonly; /* No regexp needed -- all simple */ 91 int flushflag; 92 int grepold, egrepold, fgrepold; 93 94 int nalt; /* Number of alternatives */ 95 int nsuccess; /* 1 for match, 2 for error */ 96 int altmin; /* Minimum length of all the alternate 97 * strings */ 98 int firstfile; /* argv index of first file argument */ 99 int patind; /* argv index of pattern */ 100 long nmatch; /* Number of matches in this file */ 101 long incount, counted; /* Amount of input consumed */ 102 long rxcount; /* Bytes of input processed by regexec() */ 103 int boyfound; /* accumulated partial matches (tripped by 104 * FIRSTFEW) */ 105 int prevmatch; /* next three lines aid fast -n */ 106 long nline, prevnline; 107 char *prevloc; 108 109 regexp *rspencer; 110 char *pattern; 111 char *patboy; /* Pattern for simple Boyer-Moore */ 112 char *patfile; /* Filename containing pattern(s) */ 113 114 int delta0[256]; /* Boyer-Moore algorithm core */ 115 char cmap[256]; /* Usually 0-255, but if -i, maps upper to 116 * lower case */ 117 char str[BUFSIZE + 2]; 118 int nleftover; 119 char linetemp[BUFSIZE]; 120 char *altpat[NALT]; /* alternation component storage */ 121 int altlen[NALT]; 122 short altset[NMUSH + 1][256]; 123 char preamble[200]; /* match prefix (filename, line no.) */ 124 125 int fd; 126 char * 127 strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc(); 128 char * 129 grepxlat(), *fold(), *pfile(), *alternate(), *isolate(); 130 char *gotamatch(), *kanji(), *linesave(), *submatch(); 131 char **args; 132 133 main(argc, argv) 134 int argc; 135 char *argv[]; 136 { 137 int c, oflag; 138 int errflag = 0; 139 140 args = argv; 141 142 if ((progname = strrchr(argv[0], '/')) != 0) 143 progname++; 144 else 145 progname = argv[0]; 146 if (strcmp(progname, "grep") == 0) 147 grepflag++; 148 else if (strcmp(progname, "fgrep") == 0) 149 fgrepflag++; 150 151 oflag = 0; 152 while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) { 153 switch (c) { 154 155 case 'f': 156 fflag++; 157 patfile = optarg; 158 continue; 159 case 'b': 160 case 'v': 161 egrepold++; /* boyer-moore of little help here */ 162 continue; 163 case 'c': 164 cflag++; 165 continue; 166 case 'e': 167 eflag++; 168 pattern = optarg; 169 continue; 170 case 'h': 171 hflag++; 172 continue; 173 case 'o': 174 oflag++; 175 continue; 176 case '1': /* Stop at very first match */ 177 firstflag++; /* spead freaks only */ 178 continue; 179 case 'i': 180 iflag++; 181 continue; 182 case 'l': 183 lflag++; 184 continue; 185 case 'n': 186 nflag++; 187 continue; 188 case 's': 189 sflag++; 190 continue; 191 case 'w': 192 case 'y': 193 if (!grepflag) 194 errflag++; 195 grepold++; 196 continue; 197 case 'x': /* needs more work, like -b above */ 198 if (!fgrepflag) 199 errflag++; 200 fgrepold++; 201 continue; 202 case '?': 203 errflag++; 204 } 205 } 206 if (errflag || ((argc <= optind) && !fflag && !eflag)) { 207 if (grepflag) 208 oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]"); 209 else if (fgrepflag) 210 oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]"); 211 else /* encourage SVID options, though we provide 212 * others */ 213 oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]"); 214 } 215 if (fflag) 216 pattern = pfile(patfile); 217 else if (!eflag) { 218 patind = optind; 219 pattern = argv[optind++]; 220 } 221 222 if (!oflag && (argc - optind) <= 1) /* Filename invisible given < 2 files */ 223 hflag++; 224 if (pattern[0] == EOS) 225 kernighan(argv); /* same as it ever was */ 226 /* 227 * 'grep/egrep' merger -- "old" grep is called to handle: tagged 228 * exprs \( \), word matches \< and \>, -w and -y options, char 229 * classes with '-' at end (egrep bug?), and patterns beginning with 230 * an asterisk (don't ask why). otherwise, characters meaningful to 231 * 'egrep' but not to 'grep' are escaped; the entire expr is then 232 * passed to 'egrep'. 233 */ 234 if (grepflag && !grepold) { 235 if (strindex(pattern, "\\(") >= 0 || 236 strindex(pattern, "\\<") >= 0 || 237 strindex(pattern, "\\>") >= 0 || 238 strindex(pattern, "-]") >= 0 || 239 pattern[0] == '*') /* grep bug */ 240 grepold++; 241 else 242 pattern = grepxlat(pattern); 243 } 244 if (grepold || egrepold || fgrepold) 245 kernighan(argv); 246 247 if (iflag) 248 strcpy(pattern, fold(pattern)); 249 /* 250 * If the pattern is a plain string, just run boyer-moore. If it 251 * consists of meta-free alternatives, run "superimposed" bmg. 252 * Otherwise, find best string, and compile pattern for regexec(). 253 */ 254 if (strpbrk(pattern, META) == NULL) { /* do boyer-moore only */ 255 boyonly++; 256 patboy = pattern; 257 } else { 258 if ((patboy = alternate(pattern)) != NULL) 259 boyonly++; 260 else { 261 if ((patboy = isolate(pattern)) == NULL) 262 kernighan(argv); /* expr too involved */ 263 #ifndef NOKANJI 264 for (c = 0; pattern[c] != EOS; c++) 265 if (pattern[c] & NONASCII) /* kanji + meta */ 266 kernighan(argv); 267 #endif 268 if ((rspencer = regcomp(pattern)) == NULL) 269 oops("regcomp failure"); 270 } 271 } 272 gosper(patboy); /* "pre-conditioning is wonderful" 273 * -- v. strassen */ 274 275 if ((firstfile = optind) >= argc) { 276 /* Grep standard input */ 277 if (lflag) /* We don't know its name! */ 278 exit(1); 279 egsecute((char *) NULL); 280 } else { 281 while (optind < argc) { 282 egsecute(argv[optind]); 283 optind++; 284 if (firstflag && (nsuccess == 1)) 285 break; 286 } 287 } 288 exit((nsuccess == 2) ? 2 : (nsuccess == 0)); 289 } 290 291 char * 292 pfile(pfname) /* absorb expression from file */ 293 char *pfname; 294 { 295 int fd; 296 struct stat patstat; 297 static char *pat; 298 299 if ((fd = open(pfname, O_RDONLY, 0)) < 0) 300 oops("can't read pattern file"); 301 if (fstat(fd, &patstat) != 0) 302 oops("can't stat pattern file"); 303 if (patstat.st_size > PATSIZE) { 304 if (fgrepflag) { /* defer to unix version */ 305 fgrepold++; 306 return "dummy"; 307 } else 308 oops("pattern file too big"); 309 } 310 if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL) 311 oops("out of memory to read pattern file"); 312 if (patstat.st_size != read(fd, pat, (int)patstat.st_size)) 313 oops("error reading pattern file"); 314 (void) close(fd); 315 316 pat[patstat.st_size] = EOS; 317 if (pat[patstat.st_size - 1] == NL) /* NOP for egrep; helps grep */ 318 pat[patstat.st_size - 1] = EOS; 319 320 if (nlcount(pat, &pat[patstat.st_size]) > NALT) { 321 if (fgrepflag) 322 fgrepold++; /* "what's it all about, alfie?" */ 323 else 324 egrepold++; 325 } 326 return (pat); 327 } 328 329 egsecute(file) 330 char *file; 331 { 332 extern int errno; 333 334 if (file == NULL) 335 fd = 0; 336 else if ((fd = open(file, O_RDONLY, 0)) <= 0) { 337 fprintf(stderr, 338 "%s: %s: %s\n", progname, file, strerror(errno)); 339 nsuccess = 2; 340 return; 341 } 342 chimaera(file, patboy); 343 344 if (!boyonly && !flushflag && file != NULL) 345 flushmatches(); 346 if (file != NULL) 347 close(fd); 348 } 349 350 chimaera(file, pat) /* "reach out and boyer-moore search someone" */ 351 char *file, *pat; /* -- soon-to-be-popular bumper sticker */ 352 { 353 register char *k, *strend, *s; 354 register int j, count; 355 register int *deltazero = delta0; 356 int patlen = altmin; 357 char *t; 358 359 nleftover = boyfound = flushflag = 0; 360 nline = 1L; 361 prevmatch = 0; 362 nmatch = counted = rxcount = 0L; 363 364 while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) { 365 366 counted += count; 367 strend = linesave(str, count); 368 369 for (k = str + patlen - 1; k < strend;) { 370 /* 371 * for a large class of patterns, upwards of 80% of 372 * match time is spent on the next line. we beat 373 * existing microcode (vax 'matchc') this way. 374 */ 375 while ((k += deltazero[*(unsigned char *) k]) < strend); 376 if (k < (str + LARGE)) 377 break; 378 k -= LARGE; 379 380 if (altflag) { 381 /* 382 * Parallel Boyer-Moore. Check whether each 383 * of the previous <altmin> chars COULD be 384 * from one of the alternative strings. 385 */ 386 s = k - 1; 387 j = altmin; 388 while (altset[--j][(unsigned char) 389 cmap[*(unsigned char *) s--]]); 390 /* 391 * quick test fails. in this life, compare 392 * 'em all. but, a "reverse trie" would 393 * attenuate worst case (linear w/delta2?). 394 */ 395 if (--j < 0) { 396 count = nalt - 1; 397 do { 398 s = k; 399 j = altlen[count]; 400 t = altpat[count]; 401 402 while 403 (cmap[*(unsigned char *) s--] 404 == t[--j]); 405 if (j < 0) 406 break; 407 } 408 while (count--); 409 } 410 } else { 411 /* One string -- check it */ 412 j = patlen - 1; 413 s = k - 1; 414 while (cmap[*(unsigned char *) s--] == pat[--j]); 415 } 416 /* 417 * delta-less shortcut for literati. short shrift for 418 * genetic engineers? 419 */ 420 if (j >= 0) { 421 k++; /* no match; restart next char */ 422 continue; 423 } 424 k = submatch(file, pat, str, strend, k, count); 425 if (k == NULL) 426 return; 427 } 428 if (nflag) { 429 if (prevmatch) 430 nline = prevnline + nlcount(prevloc, k); 431 else 432 nline = nline + nlcount(str, k); 433 prevmatch = 0; 434 } 435 strncpy(str, linetemp, nleftover); 436 } 437 if (cflag) { 438 /* Bug from old grep: -c overrides -h. We fix the bug. */ 439 if (!hflag) 440 printf("%s:", file); 441 printf("%ld\n", nmatch); 442 } 443 } 444 445 char * 446 linesave(str, count) /* accumulate partial line at end of buffer */ 447 char str[]; 448 register int count; 449 { 450 register int j; 451 452 count += nleftover; 453 if (count != BUFSIZE && fd != 0) 454 str[count++] = NL; /* insurance for broken last line */ 455 str[count] = EOS; 456 for (j = count - 1; str[j] != NL && j >= 0;) 457 j--; 458 /* 459 * break up these lines: long line (> BUFSIZE), last line of file, or 460 * short return from read(), as from tee(1) input 461 */ 462 if (j < 0 && (count == (BUFSIZE - nleftover))) { 463 str[count++] = NL; 464 str[count] = EOS; 465 linetemp[0] = EOS; 466 nleftover = 0; 467 return (str + count); 468 } else { 469 nleftover = count - j - 1; 470 strncpy(linetemp, str + j + 1, nleftover); 471 return (str + j); 472 } 473 } 474 475 /* 476 * Process partial match. First check for mis-aligned Kanji, then match line 477 * against full compiled r.e. if statistics do not warrant handing off to 478 * standard egrep. 479 */ 480 char * 481 submatch(file, pat, str, strend, k, altindex) 482 char file[], pat[], str[]; 483 register char *strend, *k; 484 int altindex; 485 { 486 register char *s; 487 char *t, c; 488 489 t = k; 490 s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1); 491 #ifndef NOKANJI 492 c = ((altflag) ? altpat[altindex][0] : pat[0]); 493 if (c & NONASCII) 494 if ((s = kanji(str, s, k)) == NULL) 495 return (++k); /* reject false kanji */ 496 #endif 497 do; 498 while (*s != NL && --s >= str); 499 k = s + 1; /* now at line start */ 500 501 if (boyonly) 502 return (gotamatch(file, k)); 503 504 incount = counted - (strend - k); 505 if (boyfound++ == FIRSTFEW) 506 execstrategy(file); 507 508 s = t; 509 do 510 rxcount++; 511 while (*s++ != NL); 512 *--s = EOS; 513 /* 514 * "quick henry -- the flit" (after theodor geisel) 515 */ 516 if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) { 517 *s = NL; 518 if (gotamatch(file, k) == NULL) 519 return (NULL); 520 } 521 *s = NL; 522 return (s + 1); 523 } 524 525 #ifndef NOKANJI 526 /* 527 * EUC code disambiguation -- scan backwards to first 7-bit code, while 528 * counting intervening 8-bit codes. If odd, reject unaligned Kanji pattern. 529 * SS2/3 checks are for intermixed Japanase Katakana or Kanji2. 530 */ 531 char * 532 kanji(str, s, k) 533 register char *str, *s, *k; 534 { 535 register int j = 0; 536 537 for (s--; s >= str; s--) { 538 if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0) 539 break; 540 j++; 541 } 542 #ifndef CHINESE 543 if (*s == SS2) 544 j -= 1; 545 #endif CHINESE 546 return ((j & 01) ? NULL : k); 547 } 548 #endif 549 550 /* 551 * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c] 552 */ 553 gosper(pattern) 554 char *pattern; /* ... HAKMEM lives ... */ 555 { 556 register int i, j; 557 unsigned char c; 558 559 /* Make one-string case look like simple alternatives case */ 560 if (!altflag) { 561 nalt = 1; 562 altmin = altlen[0] = strlen(pattern); 563 altpat[0] = pattern; 564 } 565 /* For chars that aren't in any string, skip by string length. */ 566 for (j = 0; j < 256; j++) { 567 delta0[j] = altmin; 568 cmap[j] = j; /* Sneak in initialization of cmap */ 569 } 570 571 /* For chars in a string, skip distance from char to end of string. */ 572 /* (If char appears more than once, skip minimum distance.) */ 573 for (i = 0; i < nalt; i++) 574 for (j = 0; j < altlen[i] - 1; j++) { 575 c = altpat[i][j]; 576 delta0[c] = MIN(delta0[c], altlen[i] - j - 1); 577 if (iflag && islower((int) c)) 578 delta0[toupper((int) c)] = delta0[c]; 579 } 580 581 /* For last char of each string, fall out of search loop. */ 582 for (i = 0; i < nalt; i++) { 583 c = altpat[i][altlen[i] - 1]; 584 delta0[c] = LARGE; 585 if (iflag && islower((int) c)) 586 delta0[toupper((int) c)] = LARGE; 587 } 588 if (iflag) 589 for (j = 'A'; j <= 'Z'; j++) 590 cmap[j] = tolower((int) j); 591 } 592 593 /* 594 * Print, count, or stop on full match. Result is either the location for 595 * continued search, or NULL to stop. 596 */ 597 char * 598 gotamatch(file, s) 599 register char *file, *s; 600 { 601 char *savematch(); 602 int squirrel = 0; /* nonzero to squirrel away FIRSTFEW matches */ 603 604 nmatch++; 605 nsuccess = 1; 606 if (!boyonly && boyfound <= FIRSTFEW && file != NULL) 607 squirrel = 1; 608 609 if (sflag) 610 return (NULL); /* -s usurps all flags (unlike some versions) */ 611 if (cflag) { /* -c overrides -l, we guess */ 612 do; 613 while (*s++ != NL); 614 } else if (lflag) { 615 puts(file); 616 return (NULL); 617 } else { 618 if (!hflag) 619 if (!squirrel) 620 printf("%s:", file); 621 else 622 (void)sprintf(preamble, "%s:", file); 623 if (nflag) { 624 if (prevmatch) 625 prevnline = prevnline + nlcount(prevloc, s); 626 else 627 prevnline = nline + nlcount(str, s); 628 prevmatch = 1; 629 630 if (!squirrel) 631 printf("%ld:", prevnline); 632 else 633 (void)sprintf(preamble + strlen(preamble), 634 "%ld:", prevnline); 635 } 636 if (!squirrel) { 637 do 638 putchar(*s); 639 while (*s++ != NL); 640 } else 641 s = savematch(s); 642 643 if (nflag) 644 prevloc = s - 1; 645 } 646 return ((firstflag && !cflag) ? NULL : s); 647 } 648 649 char * 650 fold(line) 651 char *line; 652 { 653 static char fline[BUFSIZE]; 654 register char *s, *t = fline; 655 656 for (s = line; *s != EOS; s++) 657 *t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s); 658 *t = EOS; 659 return (fline); 660 } 661 662 strindex(s, t) /* the easy way, as in K&P, p. 192 */ 663 char *s, *t; 664 { 665 int i, n; 666 667 n = strlen(t); 668 for (i = 0; s[i] != '\0'; i++) 669 if (strncmp(s + i, t, n) == 0) 670 return (i); 671 return (-1); 672 } 673 674 char * 675 grepxlat(pattern) /* grep pattern meta conversion */ 676 char *pattern; 677 { 678 register char *p, *s; 679 static char newpat[BUFSIZE]; 680 681 for (s = newpat, p = pattern; *p != EOS;) { 682 if (*p == '\\') { /* skip escapes ... */ 683 *s++ = *p++; 684 if (*p) 685 *s++ = *p++; 686 } else if (*p == '[') { /* ... and char classes */ 687 while (*p != EOS && *p != ']') 688 *s++ = *p++; 689 } else if (strchr("+?|()", *p) != NULL) { 690 *s++ = '\\'; /* insert protection */ 691 *s++ = *p++; 692 } else 693 *s++ = *p++; 694 } 695 *s = EOS; 696 grepflag = ((patind) ? 0 : 1); 697 return (newpat); 698 } 699 700 /* 701 * Test for simple alternation. Result is NULL if it's not so simple, or is 702 * a pointer to the first string if it is. Warning: sscanf size is a 703 * fixpoint, beyond which the speedup linearity starts to break down. In the 704 * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing 705 * altpat[] to arbitrary size is not useful. 706 */ 707 char * 708 alternate(regexpr) 709 char *regexpr; 710 { 711 register int i, j; 712 register char *start, *stop; 713 unsigned char c; 714 715 if (fgrepflag && strchr(regexpr, '|')) 716 return (NULL); 717 718 /* 719 * break pattern up into altpat array; delimit on newline, bar, 720 * or EOS. We know we won't overflow, we've already checked the 721 * number of patterns we're going to find against NALT. 722 * Also, set length of pattern and find minimum pattern length. 723 */ 724 nalt = 0; 725 altmin = NMUSH; 726 for (start = stop = regexpr;; ++stop) 727 if (!*stop || *stop == '|' || *stop == NL) { 728 altlen[nalt] = j = stop - start; 729 if (j < altmin) 730 altmin = j; 731 if (!(altpat[nalt] = malloc((u_int)(j + 1)))) 732 oops("out of memory"); 733 bcopy(start, altpat[nalt], j); 734 altpat[nalt][j] = EOS; 735 ++nalt; 736 if (!*stop) 737 break; 738 if (nalt == NALT) 739 return(NULL); 740 if (*stop == NL) 741 *stop = '|'; 742 start = stop + 1; 743 } 744 if (!fgrepflag) { 745 if (strchr(regexpr, '|') == NULL || regexpr[0] == '|') 746 return (NULL); 747 if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL 748 || strindex(regexpr, "||") >= 0) 749 return (NULL); 750 } 751 752 if (nalt > 1) { /* build superimposed "pre-match" sets per 753 * char */ 754 altflag++; 755 for (j = 0; j < nalt; j++) 756 for (i = 0; i < altmin; i++) { 757 c = altpat[j][altlen[j] - altmin + i]; 758 altset[i + 1][c] = 1; /* offset for sentinel */ 759 } 760 } 761 return (altpat[0]); 762 } 763 764 /* 765 * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to 766 * determine whether to use dfa-based egrep: We do FIRSTFEW matches with 767 * regexec(). If Boyer-Moore up to now matched more than PUNTPERCENT 768 * of the input, the r.e. is likely to be underspecified, so do old *grep, 769 * which is faster on complex patterns than regexp(). At FIRSTFEW, 770 * dump the saved matches collected by savematch(). They are saved 771 * so that a "PUNT" can "rewind" to ignore them. Stdin is problematic, 772 * since it's hard to rewind. 773 */ 774 775 execstrategy(file) 776 char *file; 777 { 778 int pctmatch; 779 780 pctmatch = (100 * rxcount) / incount; 781 if (pctmatch > PUNTPERCENT && file != NULL) 782 kernighan(args); 783 if (file != NULL) 784 flushmatches(); 785 } 786 787 nlcount(bstart, bstop) /* flail interval to totalize newlines. */ 788 char *bstart, *bstop; 789 { 790 register char *s = bstart; 791 register char *t = bstop; 792 register int count = 0; 793 794 do { /* loop unroll for older architectures */ 795 if (*t == NL) /* ... ask ames!jaw for sample code */ 796 count++; 797 } while (t-- > s); 798 799 return (count); 800 } 801 802 char * 803 isolate(regexpr) /* isolate longest metacharacter-free string */ 804 char *regexpr; 805 { 806 char *dummyexpr; 807 808 /* 809 * We add (.)* because Henry's regcomp only figures regmust if it 810 * sees a leading * pattern. Foo! 811 */ 812 dummyexpr = malloc((unsigned) strlen(regexpr) + 5); 813 (void)sprintf(dummyexpr, "(.)*%s", regexpr); 814 if ((rspencer = regcomp(dummyexpr)) == NULL) 815 kernighan(args); 816 return (rspencer->regmust); 817 } 818 819 char *matches[FIRSTFEW]; 820 static int mcount = 0; 821 822 char * 823 savematch(s) /* horde matches during statistics gathering */ 824 register char *s; 825 { 826 char *p; 827 char *start = s; 828 int msize = 0; 829 int psize = strlen(preamble); 830 831 while (*s++ != NL) 832 msize++; 833 *--s = EOS; 834 835 p = malloc((unsigned) msize + 1 + psize); 836 strcpy(p, preamble); 837 strcpy(p + psize, start); 838 matches[mcount++] = p; 839 840 preamble[0] = 0; 841 *s = NL; 842 return (s); 843 } 844 845 flushmatches() 846 { 847 int n; 848 849 flushflag = 1; 850 for (n = 0; n < mcount; n++) 851 printf("%s\n", matches[n]); 852 mcount = 0; 853 } 854 855 oops(message) 856 char *message; 857 { 858 fprintf(stderr, "%s: %s\n", progname, message); 859 exit(2); 860 } 861 862 kernighan(args) /* "let others do the hard part ..." */ 863 char *args[]; 864 { 865 /* 866 * We may have already run grep on some of the files; remove them 867 * from the arg list we pass on. Note that we can't delete them 868 * totally because the number of file names affects the output 869 * (automatic -h). 870 */ 871 /* better would be fork/exec per punted file -- jaw */ 872 873 while (firstfile && optind > firstfile) 874 args[firstfile++] = _PATH_DEVNULL; 875 if (patind) 876 args[patind] = pattern; 877 (void) fflush(stdout); 878 879 if (grepflag) 880 execvp(_PATH_GREPSTD, args), oops("can't exec old 'grep'"); 881 else if (fgrepflag) 882 execvp(_PATH_FGREPSTD, args), oops("can't exec old 'fgrep'"); 883 else 884 execvp(_PATH_EGREPSTD, args), oops("can't exec old 'egrep'"); 885 } 886