1 #ifndef lint 2 static char sccsid[] = "@(#)egrep.c 5.9 (Berkeley) 10/22/87"; 3 #endif not lint 4 5 /* 6 Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0 7 table as in original paper (CACM, October, 1977). No delta1 or delta2. 8 According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of 9 minimal practical value. However, to improve for worst case input, 10 integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J. 11 Comput., Feb. 1986) deserves consideration. 12 13 Method: extract longest metacharacter-free string from expression. 14 this is done using a side-effect from henry spencer's regcomp(). 15 use boyer-moore to match such, then pass submatching lines 16 to either regexp() or standard 'egrep', depending on certain 17 criteria within execstrategy() below. [this tradeoff is due 18 to the general slowness of the regexp() nondeterministic 19 machine on complex expressions, as well as the startup time 20 of standard 'egrep' on short files.] alternatively, one may 21 change the vendor-supplied 'egrep' automaton to include 22 boyer-moore directly. see accompanying writeup for discussion 23 of kanji expression treatment. 24 25 late addition: apply trickbag for fast match of simple 26 alternations (sublinear, in common low-cardinality cases). 27 trap fgrep into this lair. 28 29 gnu additions: -f, newline as |, \< and \> [in regexec()], more 30 comments. inspire better dfa exec() strategy. 31 serious testing and help with special cases. 32 33 Algorithm amalgam summary: 34 35 dfa e?grep (aho/thompson) 36 ndfa regexp() (spencer/aho) 37 bmg (boyer/moore/gosper) 38 "superimposed" bmg (jaw) 39 fgrep (aho/corrasick) 40 41 sorry, but the knuth/morris/pratt machine, horspool's 42 "frequentist" code, and the rabin/karp matcher, however cute, 43 just don't cut it for this production. 44 45 James A. Woods Copyright (c) 1986 46 NASA Ames Research Center 47 */ 48 #include <stdio.h> 49 #include <ctype.h> 50 #include <sys/types.h> 51 #include <sys/stat.h> 52 #include <sys/file.h> 53 #include <regexp.h> /* must be henry spencer's version */ 54 55 #define MIN(A, B) ((A) > (B) ? (B) : (A)) 56 57 #ifdef SLOWSYS 58 #define read xread 59 #endif 60 /* 61 * define existing [ef]?grep program locations below for use by execvp(). 62 * [execlp() would be used were it not for the possibility of 63 * installation-dependent recursion.] 64 */ 65 #ifndef EGREPSTD 66 #define EGREPSTD "/usr/bin/egrep" 67 #endif 68 #ifndef GREPSTD 69 #define GREPSTD "/bin/grep" 70 #endif 71 #ifndef FGREPSTD 72 #define FGREPSTD "/usr/bin/fgrep" 73 #endif 74 75 #define BUFSIZE 8192 /* make higher for cray */ 76 #define PATSIZE 6000 77 #define LARGE BUFSIZE + PATSIZE 78 79 #define NALT 7 /* tied to scanf() size in alternate() */ 80 #define NMUSH 6 /* loosely relates to expected alt length */ 81 82 #define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */ 83 #define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input 84 * was processed by regexp(), exec std egrep. */ 85 #define NL '\n' 86 #define EOS '\0' 87 #define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */ 88 #define META "\n^$.[]()?+*|\\" /* egrep meta-characters */ 89 #define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */ 90 #define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */ 91 92 extern char *optarg; 93 extern int optind; 94 char *progname; 95 96 int cflag, iflag, eflag, fflag, lflag, nflag; /* SVID flags */ 97 int sflag, hflag; /* v7, v8, bsd */ 98 99 int firstflag; /* Stop at first match */ 100 int grepflag; /* Called as "grep" */ 101 int fgrepflag; /* Called as "fgrep" */ 102 int altflag; /* Simple alternation in pattern */ 103 int boyonly; /* No regexp needed -- all simple */ 104 int flushflag; 105 int grepold, egrepold, fgrepold; 106 107 int nalt; /* Number of alternatives */ 108 int nsuccess; /* 1 for match, 2 for error */ 109 int altmin; /* Minimum length of all the alternate 110 * strings */ 111 int firstfile; /* argv index of first file argument */ 112 int patind; /* argv index of pattern */ 113 long nmatch; /* Number of matches in this file */ 114 long incount, counted; /* Amount of input consumed */ 115 long rxcount; /* Bytes of input processed by regexec() */ 116 int boyfound; /* accumulated partial matches (tripped by 117 * FIRSTFEW) */ 118 int prevmatch; /* next three lines aid fast -n */ 119 long nline, prevnline; 120 char *prevloc; 121 122 regexp *rspencer; 123 char *pattern; 124 char *patboy; /* Pattern for simple Boyer-Moore */ 125 char *patfile; /* Filename containing pattern(s) */ 126 127 int delta0[256]; /* Boyer-Moore algorithm core */ 128 char cmap[256]; /* Usually 0-255, but if -i, maps upper to 129 * lower case */ 130 char str[BUFSIZE + 2]; 131 int nleftover; 132 char linetemp[BUFSIZE]; 133 char *altpat[NALT]; /* alternation component storage */ 134 int altlen[NALT]; 135 short altset[NMUSH + 1][256]; 136 char preamble[200]; /* match prefix (filename, line no.) */ 137 138 int fd; 139 char * 140 strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc(); 141 char * 142 grepxlat(), *fold(), *pfile(), *alternate(), *isolate(); 143 char **args; 144 145 main(argc, argv) 146 int argc; 147 char *argv[]; 148 { 149 int c, oflag; 150 int errflag = 0; 151 152 args = argv; 153 154 if ((progname = strrchr(argv[0], '/')) != 0) 155 progname++; 156 else 157 progname = argv[0]; 158 if (strcmp(progname, "grep") == 0) 159 grepflag++; 160 else if (strcmp(progname, "fgrep") == 0) 161 fgrepflag++; 162 163 oflag = 0; 164 while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) { 165 switch (c) { 166 167 case 'f': 168 fflag++; 169 patfile = optarg; 170 continue; 171 case 'b': 172 case 'v': 173 egrepold++; /* boyer-moore of little help here */ 174 continue; 175 case 'c': 176 cflag++; 177 continue; 178 case 'e': 179 eflag++; 180 pattern = optarg; 181 continue; 182 case 'h': 183 hflag++; 184 continue; 185 case 'o': 186 oflag++; 187 continue; 188 case '1': /* Stop at very first match */ 189 firstflag++; /* spead freaks only */ 190 continue; 191 case 'i': 192 iflag++; 193 continue; 194 case 'l': 195 lflag++; 196 continue; 197 case 'n': 198 nflag++; 199 continue; 200 case 's': 201 sflag++; 202 continue; 203 case 'w': 204 case 'y': 205 if (!grepflag) 206 errflag++; 207 grepold++; 208 continue; 209 case 'x': /* needs more work, like -b above */ 210 if (!fgrepflag) 211 errflag++; 212 fgrepold++; 213 continue; 214 case '?': 215 errflag++; 216 } 217 } 218 if (errflag || ((argc <= optind) && !fflag && !eflag)) { 219 if (grepflag) 220 oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]"); 221 else if (fgrepflag) 222 oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]"); 223 else /* encourage SVID options, though we provide 224 * others */ 225 oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]"); 226 } 227 patind = optind; 228 if (fflag) 229 pattern = pfile(patfile), patind = 0; 230 else if (!eflag) 231 pattern = argv[optind++]; 232 233 if (!oflag && (argc - optind) <= 1) /* Filename invisible given < 2 files */ 234 hflag++; 235 if (pattern[0] == EOS) 236 kernighan(argv); /* same as it ever was */ 237 /* 238 * 'grep/egrep' merger -- "old" grep is called to handle: tagged 239 * exprs \( \), word matches \< and \>, -w and -y options, char 240 * classes with '-' at end (egrep bug?), and patterns beginning with 241 * an asterisk (don't ask why). otherwise, characters meaningful to 242 * 'egrep' but not to 'grep' are escaped; the entire expr is then 243 * passed to 'egrep'. 244 */ 245 if (grepflag && !grepold) { 246 if (strindex(pattern, "\\(") >= 0 || 247 strindex(pattern, "\\<") >= 0 || 248 strindex(pattern, "\\>") >= 0 || 249 strindex(pattern, "-]") >= 0 || 250 pattern[0] == '*') /* grep bug */ 251 grepold++; 252 else 253 pattern = grepxlat(pattern); 254 } 255 if (grepold || egrepold || fgrepold) 256 kernighan(argv); 257 258 if (iflag) 259 strcpy(pattern, fold(pattern)); 260 /* 261 * If the pattern is a plain string, just run boyer-moore. If it 262 * consists of meta-free alternatives, run "superimposed" bmg. 263 * Otherwise, find best string, and compile pattern for regexec(). 264 */ 265 if (strpbrk(pattern, META) == NULL) { /* do boyer-moore only */ 266 boyonly++; 267 patboy = pattern; 268 } else { 269 if ((patboy = alternate(pattern)) != NULL) 270 boyonly++; 271 else { 272 if ((patboy = isolate(pattern)) == NULL) 273 kernighan(argv); /* expr too involved */ 274 #ifndef NOKANJI 275 for (c = 0; pattern[c] != EOS; c++) 276 if (pattern[c] & NONASCII) /* kanji + meta */ 277 kernighan(argv); 278 #endif 279 if ((rspencer = regcomp(pattern)) == NULL) 280 oops("regcomp failure"); 281 } 282 } 283 gosper(patboy); /* "pre-conditioning is wonderful" 284 * -- v. strassen */ 285 286 if ((firstfile = optind) >= argc) { 287 /* Grep standard input */ 288 if (lflag) /* We don't know its name! */ 289 exit(1); 290 egsecute((char *) NULL); 291 } else { 292 while (optind < argc) { 293 egsecute(argv[optind]); 294 optind++; 295 if (firstflag && (nsuccess == 1)) 296 break; 297 } 298 } 299 exit((nsuccess == 2) ? 2 : (nsuccess == 0)); 300 } 301 302 char * 303 pfile(pfname) /* absorb expression from file */ 304 char *pfname; 305 { 306 int fd; 307 struct stat patstat; 308 static char *pat; 309 310 if ((fd = open(pfname, O_RDONLY, 0)) < 0) 311 oops("can't read pattern file"); 312 if (fstat(fd, &patstat) != 0) 313 oops("can't stat pattern file"); 314 if (patstat.st_size > PATSIZE) { 315 if (fgrepflag) { /* defer to unix version */ 316 fgrepold++; 317 return "dummy"; 318 } else 319 oops("pattern file too big"); 320 } 321 if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL) 322 oops("out of memory to read pattern file"); 323 if (patstat.st_size != read(fd, pat, (int)patstat.st_size)) 324 oops("error reading pattern file"); 325 (void) close(fd); 326 327 pat[patstat.st_size] = EOS; 328 if (pat[patstat.st_size - 1] == NL) /* NOP for egrep; helps grep */ 329 pat[patstat.st_size - 1] = EOS; 330 331 if (nlcount(pat, &pat[patstat.st_size]) > NALT) { 332 if (fgrepflag) 333 fgrepold++; /* "what's it all about, alfie?" */ 334 else 335 egrepold++; 336 } 337 return (pat); 338 } 339 340 egsecute(file) 341 char *file; 342 { 343 if (file == NULL) 344 fd = 0; 345 else if ((fd = open(file, O_RDONLY, 0)) <= 0) { 346 fprintf(stderr, "%s: can't open %s\n", progname, file); 347 nsuccess = 2; 348 return; 349 } 350 chimaera(file, patboy); 351 352 if (!boyonly && !flushflag && file != NULL) 353 flushmatches(); 354 if (file != NULL) 355 close(fd); 356 } 357 358 chimaera(file, pat) /* "reach out and boyer-moore search someone" */ 359 char *file, *pat; /* -- soon-to-be-popular bumper sticker */ 360 { 361 register char *k, *strend, *s; 362 register int j, count; 363 register int *deltazero = delta0; 364 int patlen = altmin; 365 char *t; 366 char *gotamatch(), *kanji(), *linesave(), *submatch(); 367 368 nleftover = boyfound = flushflag = 0; 369 nline = 1L; 370 prevmatch = 0; 371 nmatch = counted = rxcount = 0L; 372 373 while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) { 374 375 counted += count; 376 strend = linesave(str, count); 377 378 for (k = str + patlen - 1; k < strend;) { 379 /* 380 * for a large class of patterns, upwards of 80% of 381 * match time is spent on the next line. we beat 382 * existing microcode (vax 'matchc') this way. 383 */ 384 while ((k += deltazero[*(unsigned char *) k]) < strend); 385 if (k < (str + LARGE)) 386 break; 387 k -= LARGE; 388 389 if (altflag) { 390 /* 391 * Parallel Boyer-Moore. Check whether each 392 * of the previous <altmin> chars COULD be 393 * from one of the alternative strings. 394 */ 395 s = k - 1; 396 j = altmin; 397 while (altset[--j][(unsigned char) 398 cmap[*(unsigned char *) s--]]); 399 /* 400 * quick test fails. in this life, compare 401 * 'em all. but, a "reverse trie" would 402 * attenuate worst case (linear w/delta2?). 403 */ 404 if (--j < 0) { 405 count = nalt - 1; 406 do { 407 s = k; 408 j = altlen[count]; 409 t = altpat[count]; 410 411 while 412 (cmap[*(unsigned char *) s--] 413 == t[--j]); 414 if (j < 0) 415 break; 416 } 417 while (count--); 418 } 419 } else { 420 /* One string -- check it */ 421 j = patlen - 1; 422 s = k - 1; 423 while (cmap[*(unsigned char *) s--] == pat[--j]); 424 } 425 /* 426 * delta-less shortcut for literati. short shrift for 427 * genetic engineers? 428 */ 429 if (j >= 0) { 430 k++; /* no match; restart next char */ 431 continue; 432 } 433 k = submatch(file, pat, str, strend, k, count); 434 if (k == NULL) 435 return; 436 } 437 if (nflag) { 438 if (prevmatch) 439 nline = prevnline + nlcount(prevloc, k); 440 else 441 nline = nline + nlcount(str, k); 442 prevmatch = 0; 443 } 444 strncpy(str, linetemp, nleftover); 445 } 446 if (cflag) { 447 /* Bug from old grep: -c overrides -h. We fix the bug. */ 448 if (!hflag) 449 printf("%s:", file); 450 printf("%ld\n", nmatch); 451 } 452 } 453 454 char * 455 linesave(str, count) /* accumulate partial line at end of buffer */ 456 char str[]; 457 register int count; 458 { 459 register int j; 460 461 count += nleftover; 462 if (count != BUFSIZE && fd != 0) 463 str[count++] = NL; /* insurance for broken last line */ 464 str[count] = EOS; 465 for (j = count - 1; str[j] != NL && j >= 0;) 466 j--; 467 /* 468 * break up these lines: long line (> BUFSIZE), last line of file, or 469 * short return from read(), as from tee(1) input 470 */ 471 if (j < 0 && (count == (BUFSIZE - nleftover))) { 472 str[count++] = NL; 473 str[count] = EOS; 474 linetemp[0] = EOS; 475 nleftover = 0; 476 return (str + count); 477 } else { 478 nleftover = count - j - 1; 479 strncpy(linetemp, str + j + 1, nleftover); 480 return (str + j); 481 } 482 } 483 484 /* 485 * Process partial match. First check for mis-aligned Kanji, then match line 486 * against full compiled r.e. if statistics do not warrant handing off to 487 * standard egrep. 488 */ 489 char * 490 submatch(file, pat, str, strend, k, altindex) 491 char file[], pat[], str[]; 492 register char *strend, *k; 493 int altindex; 494 { 495 register char *s; 496 char *t, c; 497 498 t = k; 499 s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1); 500 #ifndef NOKANJI 501 c = ((altflag) ? altpat[altindex][0] : pat[0]); 502 if (c & NONASCII) 503 if ((s = kanji(str, s, k)) == NULL) 504 return (++k); /* reject false kanji */ 505 #endif 506 do; 507 while (*s != NL && --s >= str); 508 k = s + 1; /* now at line start */ 509 510 if (boyonly) 511 return (gotamatch(file, k)); 512 513 incount = counted - (strend - k); 514 if (boyfound++ == FIRSTFEW) 515 execstrategy(file); 516 517 s = t; 518 do 519 rxcount++; 520 while (*s++ != NL); 521 *--s = EOS; 522 /* 523 * "quick henry -- the flit" (after theodor geisel) 524 */ 525 if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) { 526 *s = NL; 527 if (gotamatch(file, k) == NULL) 528 return (NULL); 529 } 530 *s = NL; 531 return (s + 1); 532 } 533 534 #ifndef NOKANJI 535 /* 536 * EUC code disambiguation -- scan backwards to first 7-bit code, while 537 * counting intervening 8-bit codes. If odd, reject unaligned Kanji pattern. 538 * SS2/3 checks are for intermixed Japanase Katakana or Kanji2. 539 */ 540 char * 541 kanji(str, s, k) 542 register char *str, *s, *k; 543 { 544 register int j = 0; 545 546 for (s--; s >= str; s--) { 547 if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0) 548 break; 549 j++; 550 } 551 #ifndef CHINESE 552 if (*s == SS2) 553 j -= 1; 554 #endif CHINESE 555 return ((j & 01) ? NULL : k); 556 } 557 #endif 558 559 /* 560 * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c] 561 */ 562 gosper(pattern) 563 char *pattern; /* ... HAKMEM lives ... */ 564 { 565 register int i, j; 566 unsigned char c; 567 568 /* Make one-string case look like simple alternatives case */ 569 if (!altflag) { 570 nalt = 1; 571 altmin = altlen[0] = strlen(pattern); 572 altpat[0] = pattern; 573 } 574 /* For chars that aren't in any string, skip by string length. */ 575 for (j = 0; j < 256; j++) { 576 delta0[j] = altmin; 577 cmap[j] = j; /* Sneak in initialization of cmap */ 578 } 579 580 /* For chars in a string, skip distance from char to end of string. */ 581 /* (If char appears more than once, skip minimum distance.) */ 582 for (i = 0; i < nalt; i++) 583 for (j = 0; j < altlen[i] - 1; j++) { 584 c = altpat[i][j]; 585 delta0[c] = MIN(delta0[c], altlen[i] - j - 1); 586 if (iflag && islower((int) c)) 587 delta0[toupper((int) c)] = delta0[c]; 588 } 589 590 /* For last char of each string, fall out of search loop. */ 591 for (i = 0; i < nalt; i++) { 592 c = altpat[i][altlen[i] - 1]; 593 delta0[c] = LARGE; 594 if (iflag && islower((int) c)) 595 delta0[toupper((int) c)] = LARGE; 596 } 597 if (iflag) 598 for (j = 'A'; j <= 'Z'; j++) 599 cmap[j] = tolower((int) j); 600 } 601 602 /* 603 * Print, count, or stop on full match. Result is either the location for 604 * continued search, or NULL to stop. 605 */ 606 char * 607 gotamatch(file, s) 608 register char *file, *s; 609 { 610 char *savematch(); 611 int squirrel = 0; /* nonzero to squirrel away FIRSTFEW matches */ 612 613 nmatch++; 614 nsuccess = 1; 615 if (!boyonly && boyfound <= FIRSTFEW && file != NULL) 616 squirrel = 1; 617 618 if (sflag) 619 return (NULL); /* -s usurps all flags (unlike some versions) */ 620 if (cflag) { /* -c overrides -l, we guess */ 621 do; 622 while (*s++ != NL); 623 } else if (lflag) { 624 puts(file); 625 return (NULL); 626 } else { 627 if (!hflag) 628 if (!squirrel) 629 printf("%s:", file); 630 else 631 (void)sprintf(preamble, "%s:", file); 632 if (nflag) { 633 if (prevmatch) 634 prevnline = prevnline + nlcount(prevloc, s); 635 else 636 prevnline = nline + nlcount(str, s); 637 prevmatch = 1; 638 639 if (!squirrel) 640 printf("%ld:", prevnline); 641 else 642 (void)sprintf(preamble + strlen(preamble), 643 "%ld:", prevnline); 644 } 645 if (!squirrel) { 646 do 647 putchar(*s); 648 while (*s++ != NL); 649 } else 650 s = savematch(s); 651 652 if (nflag) 653 prevloc = s - 1; 654 } 655 return ((firstflag && !cflag) ? NULL : s); 656 } 657 658 char * 659 fold(line) 660 char *line; 661 { 662 static char fline[BUFSIZE]; 663 register char *s, *t = fline; 664 665 for (s = line; *s != EOS; s++) 666 *t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s); 667 *t = EOS; 668 return (fline); 669 } 670 671 strindex(s, t) /* the easy way, as in K&P, p. 192 */ 672 char *s, *t; 673 { 674 int i, n; 675 676 n = strlen(t); 677 for (i = 0; s[i] != '\0'; i++) 678 if (strncmp(s + i, t, n) == 0) 679 return (i); 680 return (-1); 681 } 682 683 char * 684 grepxlat(pattern) /* grep pattern meta conversion */ 685 char *pattern; 686 { 687 register char *p, *s; 688 static char newpat[BUFSIZE]; 689 690 for (s = newpat, p = pattern; *p != EOS;) { 691 if (*p == '\\') { /* skip escapes ... */ 692 *s++ = *p++; 693 if (*p) 694 *s++ = *p++; 695 } else if (*p == '[') { /* ... and char classes */ 696 while (*p != EOS && *p != ']') 697 *s++ = *p++; 698 } else if (strchr("+?|()", *p) != NULL) { 699 *s++ = '\\'; /* insert protection */ 700 *s++ = *p++; 701 } else 702 *s++ = *p++; 703 } 704 *s = EOS; 705 grepflag = ((patind) ? 0 : 1); 706 return (newpat); 707 } 708 709 /* 710 * Test for simple alternation. Result is NULL if it's not so simple, or is 711 * a pointer to the first string if it is. Warning: sscanf size is a 712 * fixpoint, beyond which the speedup linearity starts to break down. In the 713 * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing 714 * altpat[] to arbitrary size is not useful. 715 */ 716 char * 717 alternate(regexpr) 718 char *regexpr; 719 { 720 register int i, j; 721 register char *start, *stop; 722 unsigned char c; 723 724 if (fgrepflag && strchr(regexpr, '|')) 725 return (NULL); 726 727 /* 728 * break pattern up into altpat array; delimit on newline, bar, 729 * or EOS. We know we won't overflow, we've already checked the 730 * number of patterns we're going to find against NALT. 731 * Also, set length of pattern and find minimum pattern length. 732 */ 733 nalt = 0; 734 altmin = NMUSH; 735 for (start = stop = regexpr;; ++stop) 736 if (!*stop || *stop == '|' || *stop == NL) { 737 altlen[nalt] = j = stop - start; 738 if (j < altmin) 739 altmin = j; 740 if (!(altpat[nalt] = malloc((u_int)(j + 1)))) 741 oops("out of memory"); 742 bcopy(start, altpat[nalt], j); 743 altpat[nalt][j] = EOS; 744 ++nalt; 745 if (!*stop) 746 break; 747 if (nalt == NALT) 748 return(NULL); 749 if (*stop == NL) 750 *stop = '|'; 751 start = stop + 1; 752 } 753 if (!fgrepflag) { 754 if (strchr(regexpr, '|') == NULL || regexpr[0] == '|') 755 return (NULL); 756 if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL 757 || strindex(regexpr, "||") >= 0) 758 return (NULL); 759 } 760 761 if (nalt > 1) { /* build superimposed "pre-match" sets per 762 * char */ 763 altflag++; 764 for (j = 0; j < nalt; j++) 765 for (i = 0; i < altmin; i++) { 766 c = altpat[j][altlen[j] - altmin + i]; 767 altset[i + 1][c] = 1; /* offset for sentinel */ 768 } 769 } 770 return (altpat[0]); 771 } 772 773 /* 774 * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to 775 * determine whether to use dfa-based egrep: We do FIRSTFEW matches with 776 * regexec(). If Boyer-Moore up to now matched more than PUNTPERCENT 777 * of the input, the r.e. is likely to be underspecified, so do old *grep, 778 * which is faster on complex patterns than regexp(). At FIRSTFEW, 779 * dump the saved matches collected by savematch(). They are saved 780 * so that a "PUNT" can "rewind" to ignore them. Stdin is problematic, 781 * since it's hard to rewind. 782 */ 783 784 execstrategy(file) 785 char *file; 786 { 787 int pctmatch; 788 789 pctmatch = (100 * rxcount) / incount; 790 if (pctmatch > PUNTPERCENT && file != NULL) 791 kernighan(args); 792 if (file != NULL) 793 flushmatches(); 794 } 795 796 nlcount(bstart, bstop) /* flail interval to totalize newlines. */ 797 char *bstart, *bstop; 798 { 799 register char *s = bstart; 800 register char *t = bstop; 801 register int count = 0; 802 803 do { /* loop unroll for older architectures */ 804 if (*t == NL) /* ... ask ames!jaw for sample code */ 805 count++; 806 } while (t-- > s); 807 808 return (count); 809 } 810 811 char * 812 isolate(regexpr) /* isolate longest metacharacter-free string */ 813 char *regexpr; 814 { 815 char *dummyexpr; 816 817 /* 818 * We add (.)* because Henry's regcomp only figures regmust if it 819 * sees a leading * pattern. Foo! 820 */ 821 dummyexpr = malloc((unsigned) strlen(regexpr) + 5); 822 (void)sprintf(dummyexpr, "(.)*%s", regexpr); 823 if ((rspencer = regcomp(dummyexpr)) == NULL) 824 kernighan(args); 825 return (rspencer->regmust); 826 } 827 828 char *matches[FIRSTFEW]; 829 static int mcount = 0; 830 831 char * 832 savematch(s) /* horde matches during statistics gathering */ 833 register char *s; 834 { 835 char *p; 836 char *start = s; 837 int msize = 0; 838 int psize = strlen(preamble); 839 840 while (*s++ != NL) 841 msize++; 842 *--s = EOS; 843 844 p = malloc((unsigned) msize + 1 + psize); 845 strcpy(p, preamble); 846 strcpy(p + psize, start); 847 matches[mcount++] = p; 848 849 preamble[0] = 0; 850 *s = NL; 851 return (s); 852 } 853 854 flushmatches() 855 { 856 int n; 857 858 flushflag = 1; 859 for (n = 0; n < mcount; n++) 860 printf("%s\n", matches[n]); 861 mcount = 0; 862 } 863 864 oops(message) 865 char *message; 866 { 867 fprintf(stderr, "%s: %s\n", progname, message); 868 exit(2); 869 } 870 871 kernighan(args) /* "let others do the hard part ..." */ 872 char *args[]; 873 { 874 /* 875 * We may have already run grep on some of the files; remove them 876 * from the arg list we pass on. Note that we can't delete them 877 * totally because the number of file names affects the output 878 * (automatic -h). 879 */ 880 /* better would be fork/exec per punted file -- jaw */ 881 882 while (firstfile && optind > firstfile) 883 args[firstfile++] = "/dev/null"; 884 if (patind) 885 args[patind] = pattern; 886 (void) fflush(stdout); 887 888 if (grepflag) 889 execvp(GREPSTD, args), oops("can't exec old 'grep'"); 890 else if (fgrepflag) 891 execvp(FGREPSTD, args), oops("can't exec old 'fgrep'"); 892 else 893 execvp(EGREPSTD, args), oops("can't exec old 'egrep'"); 894 } 895