1 /*- 2 * Copyright (c) 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * %sccs.include.redist.c% 6 */ 7 8 #ifndef lint 9 static char copyright[] = 10 "@(#) Copyright (c) 1991, 1993\n\ 11 The Regents of the University of California. All rights reserved.\n"; 12 #endif /* not lint */ 13 14 #ifndef lint 15 static char sccsid[] = "@(#)egrep.c 8.1 (Berkeley) 06/06/93"; 16 #endif /* not lint */ 17 18 /* 19 Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0 20 table as in original paper (CACM, October, 1977). No delta1 or delta2. 21 According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of 22 minimal practical value. However, to improve for worst case input, 23 integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J. 24 Comput., Feb. 1986) deserves consideration. 25 26 Method: extract longest metacharacter-free string from expression. 27 this is done using a side-effect from henry spencer's regcomp(). 28 use boyer-moore to match such, then pass submatching lines 29 to either regexp() or standard 'egrep', depending on certain 30 criteria within execstrategy() below. [this tradeoff is due 31 to the general slowness of the regexp() nondeterministic 32 machine on complex expressions, as well as the startup time 33 of standard 'egrep' on short files.] alternatively, one may 34 change the vendor-supplied 'egrep' automaton to include 35 boyer-moore directly. see accompanying writeup for discussion 36 of kanji expression treatment. 37 38 late addition: apply trickbag for fast match of simple 39 alternations (sublinear, in common low-cardinality cases). 40 trap fgrep into this lair. 41 42 gnu additions: -f, newline as |, \< and \> [in regexec()], more 43 comments. inspire better dfa exec() strategy. 44 serious testing and help with special cases. 45 46 Algorithm amalgam summary: 47 48 dfa e?grep (aho/thompson) 49 ndfa regexp() (spencer/aho) 50 bmg (boyer/moore/gosper) 51 "superimposed" bmg (jaw) 52 fgrep (aho/corrasick) 53 54 sorry, but the knuth/morris/pratt machine, horspool's 55 "frequentist" code, and the rabin/karp matcher, however cute, 56 just don't cut it for this production. 57 58 James A. Woods Copyright (c) 1986 59 NASA Ames Research Center 60 */ 61 #include <sys/types.h> 62 #include <sys/stat.h> 63 #include <sys/file.h> 64 #include <regexp.h> /* must be henry spencer's version */ 65 #include <stdio.h> 66 #include <ctype.h> 67 #include "pathnames.h" 68 69 #define MIN(A, B) ((A) > (B) ? (B) : (A)) 70 71 #ifdef SLOWSYS 72 #define read xread 73 #endif 74 75 #define BUFSIZE 8192 /* make higher for cray */ 76 #define PATSIZE 6000 77 #define LARGE BUFSIZE + PATSIZE 78 79 #define NALT 7 /* tied to scanf() size in alternate() */ 80 #define NMUSH 6 /* loosely relates to expected alt length */ 81 82 #define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */ 83 #define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input 84 * was processed by regexp(), exec std egrep. */ 85 #define NL '\n' 86 #define EOS '\0' 87 #define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */ 88 #define META "\n^$.[]()?+*|\\" /* egrep meta-characters */ 89 #define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */ 90 #define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */ 91 92 extern char *optarg; 93 extern int optind; 94 char *progname; 95 96 int cflag, iflag, eflag, fflag, lflag, nflag; /* SVID flags */ 97 int sflag, hflag; /* v7, v8, bsd */ 98 99 int firstflag; /* Stop at first match */ 100 int grepflag; /* Called as "grep" */ 101 int fgrepflag; /* Called as "fgrep" */ 102 int altflag; /* Simple alternation in pattern */ 103 int boyonly; /* No regexp needed -- all simple */ 104 int flushflag; 105 int grepold, egrepold, fgrepold; 106 107 int nalt; /* Number of alternatives */ 108 int nsuccess; /* 1 for match, 2 for error */ 109 int altmin; /* Minimum length of all the alternate 110 * strings */ 111 int firstfile; /* argv index of first file argument */ 112 int patind; /* argv index of pattern */ 113 long nmatch; /* Number of matches in this file */ 114 long incount, counted; /* Amount of input consumed */ 115 long rxcount; /* Bytes of input processed by regexec() */ 116 int boyfound; /* accumulated partial matches (tripped by 117 * FIRSTFEW) */ 118 int prevmatch; /* next three lines aid fast -n */ 119 long nline, prevnline; 120 char *prevloc; 121 122 regexp *rspencer; 123 char *pattern; 124 char *patboy; /* Pattern for simple Boyer-Moore */ 125 char *patfile; /* Filename containing pattern(s) */ 126 127 int delta0[256]; /* Boyer-Moore algorithm core */ 128 char cmap[256]; /* Usually 0-255, but if -i, maps upper to 129 * lower case */ 130 char str[BUFSIZE + 2]; 131 int nleftover; 132 char linetemp[BUFSIZE]; 133 char *altpat[NALT]; /* alternation component storage */ 134 int altlen[NALT]; 135 short altset[NMUSH + 1][256]; 136 char preamble[200]; /* match prefix (filename, line no.) */ 137 138 int fd; 139 char * 140 strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc(); 141 char * 142 grepxlat(), *fold(), *pfile(), *alternate(), *isolate(); 143 char *gotamatch(), *kanji(), *linesave(), *submatch(); 144 char **args; 145 146 main(argc, argv) 147 int argc; 148 char *argv[]; 149 { 150 int c, oflag; 151 int errflag = 0; 152 153 args = argv; 154 155 if ((progname = strrchr(argv[0], '/')) != 0) 156 progname++; 157 else 158 progname = argv[0]; 159 if (strcmp(progname, "grep") == 0) 160 grepflag++; 161 else if (strcmp(progname, "fgrep") == 0) 162 fgrepflag++; 163 164 oflag = 0; 165 while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) { 166 switch (c) { 167 168 case 'f': 169 fflag++; 170 patfile = optarg; 171 continue; 172 case 'b': 173 case 'v': 174 egrepold++; /* boyer-moore of little help here */ 175 continue; 176 case 'c': 177 cflag++; 178 continue; 179 case 'e': 180 eflag++; 181 pattern = optarg; 182 continue; 183 case 'h': 184 hflag++; 185 continue; 186 case 'o': 187 oflag++; 188 continue; 189 case '1': /* Stop at very first match */ 190 firstflag++; /* spead freaks only */ 191 continue; 192 case 'i': 193 iflag++; 194 continue; 195 case 'l': 196 lflag++; 197 continue; 198 case 'n': 199 nflag++; 200 continue; 201 case 's': 202 sflag++; 203 continue; 204 case 'w': 205 case 'y': 206 if (!grepflag) 207 errflag++; 208 grepold++; 209 continue; 210 case 'x': /* needs more work, like -b above */ 211 if (!fgrepflag) 212 errflag++; 213 fgrepold++; 214 continue; 215 case '?': 216 errflag++; 217 } 218 } 219 if (errflag || ((argc <= optind) && !fflag && !eflag)) { 220 if (grepflag) 221 oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]"); 222 else if (fgrepflag) 223 oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]"); 224 else /* encourage SVID options, though we provide 225 * others */ 226 oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]"); 227 } 228 if (fflag) 229 pattern = pfile(patfile); 230 else if (!eflag) { 231 patind = optind; 232 pattern = argv[optind++]; 233 } 234 235 if (!oflag && (argc - optind) <= 1) /* Filename invisible given < 2 files */ 236 hflag++; 237 if (pattern[0] == EOS) 238 kernighan(argv); /* same as it ever was */ 239 /* 240 * 'grep/egrep' merger -- "old" grep is called to handle: tagged 241 * exprs \( \), word matches \< and \>, -w and -y options, char 242 * classes with '-' at end (egrep bug?), and patterns beginning with 243 * an asterisk (don't ask why). otherwise, characters meaningful to 244 * 'egrep' but not to 'grep' are escaped; the entire expr is then 245 * passed to 'egrep'. 246 */ 247 if (grepflag && !grepold) { 248 if (strindex(pattern, "\\(") >= 0 || 249 strindex(pattern, "\\<") >= 0 || 250 strindex(pattern, "\\>") >= 0 || 251 strindex(pattern, "-]") >= 0 || 252 pattern[0] == '*') /* grep bug */ 253 grepold++; 254 else 255 pattern = grepxlat(pattern); 256 } 257 if (grepold || egrepold || fgrepold) 258 kernighan(argv); 259 260 if (iflag) 261 strcpy(pattern, fold(pattern)); 262 /* 263 * If the pattern is a plain string, just run boyer-moore. If it 264 * consists of meta-free alternatives, run "superimposed" bmg. 265 * Otherwise, find best string, and compile pattern for regexec(). 266 */ 267 if (strpbrk(pattern, META) == NULL) { /* do boyer-moore only */ 268 boyonly++; 269 patboy = pattern; 270 } else { 271 if ((patboy = alternate(pattern)) != NULL) 272 boyonly++; 273 else { 274 if ((patboy = isolate(pattern)) == NULL) 275 kernighan(argv); /* expr too involved */ 276 #ifndef NOKANJI 277 for (c = 0; pattern[c] != EOS; c++) 278 if (pattern[c] & NONASCII) /* kanji + meta */ 279 kernighan(argv); 280 #endif 281 if ((rspencer = regcomp(pattern)) == NULL) 282 oops("regcomp failure"); 283 } 284 } 285 gosper(patboy); /* "pre-conditioning is wonderful" 286 * -- v. strassen */ 287 288 if ((firstfile = optind) >= argc) { 289 /* Grep standard input */ 290 if (lflag) /* We don't know its name! */ 291 exit(1); 292 egsecute((char *) NULL); 293 } else { 294 while (optind < argc) { 295 egsecute(argv[optind]); 296 optind++; 297 if (firstflag && (nsuccess == 1)) 298 break; 299 } 300 } 301 exit((nsuccess == 2) ? 2 : (nsuccess == 0)); 302 } 303 304 char * 305 pfile(pfname) /* absorb expression from file */ 306 char *pfname; 307 { 308 int fd; 309 struct stat patstat; 310 static char *pat; 311 312 if ((fd = open(pfname, O_RDONLY, 0)) < 0) 313 oops("can't read pattern file"); 314 if (fstat(fd, &patstat) != 0) 315 oops("can't stat pattern file"); 316 if (patstat.st_size > PATSIZE) { 317 if (fgrepflag) { /* defer to unix version */ 318 fgrepold++; 319 return "dummy"; 320 } else 321 oops("pattern file too big"); 322 } 323 if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL) 324 oops("out of memory to read pattern file"); 325 if (patstat.st_size != read(fd, pat, (int)patstat.st_size)) 326 oops("error reading pattern file"); 327 (void) close(fd); 328 329 pat[patstat.st_size] = EOS; 330 if (pat[patstat.st_size - 1] == NL) /* NOP for egrep; helps grep */ 331 pat[patstat.st_size - 1] = EOS; 332 333 if (nlcount(pat, &pat[patstat.st_size]) > NALT) { 334 if (fgrepflag) 335 fgrepold++; /* "what's it all about, alfie?" */ 336 else 337 egrepold++; 338 } 339 return (pat); 340 } 341 342 egsecute(file) 343 char *file; 344 { 345 extern int errno; 346 347 if (file == NULL) 348 fd = 0; 349 else if ((fd = open(file, O_RDONLY, 0)) <= 0) { 350 fprintf(stderr, 351 "%s: %s: %s\n", progname, file, strerror(errno)); 352 nsuccess = 2; 353 return; 354 } 355 chimaera(file, patboy); 356 357 if (!boyonly && !flushflag && file != NULL) 358 flushmatches(); 359 if (file != NULL) 360 close(fd); 361 } 362 363 chimaera(file, pat) /* "reach out and boyer-moore search someone" */ 364 char *file, *pat; /* -- soon-to-be-popular bumper sticker */ 365 { 366 register char *k, *strend, *s; 367 register int j, count; 368 register int *deltazero = delta0; 369 int patlen = altmin; 370 char *t; 371 372 nleftover = boyfound = flushflag = 0; 373 nline = 1L; 374 prevmatch = 0; 375 nmatch = counted = rxcount = 0L; 376 377 while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) { 378 379 counted += count; 380 strend = linesave(str, count); 381 382 for (k = str + patlen - 1; k < strend;) { 383 /* 384 * for a large class of patterns, upwards of 80% of 385 * match time is spent on the next line. we beat 386 * existing microcode (vax 'matchc') this way. 387 */ 388 while ((k += deltazero[*(unsigned char *) k]) < strend); 389 if (k < (str + LARGE)) 390 break; 391 k -= LARGE; 392 393 if (altflag) { 394 /* 395 * Parallel Boyer-Moore. Check whether each 396 * of the previous <altmin> chars COULD be 397 * from one of the alternative strings. 398 */ 399 s = k - 1; 400 j = altmin; 401 while (altset[--j][(unsigned char) 402 cmap[*(unsigned char *) s--]]); 403 /* 404 * quick test fails. in this life, compare 405 * 'em all. but, a "reverse trie" would 406 * attenuate worst case (linear w/delta2?). 407 */ 408 if (--j < 0) { 409 count = nalt - 1; 410 do { 411 s = k; 412 j = altlen[count]; 413 t = altpat[count]; 414 415 while 416 (cmap[*(unsigned char *) s--] 417 == t[--j]); 418 if (j < 0) 419 break; 420 } 421 while (count--); 422 } 423 } else { 424 /* One string -- check it */ 425 j = patlen - 1; 426 s = k - 1; 427 while (cmap[*(unsigned char *) s--] == pat[--j]); 428 } 429 /* 430 * delta-less shortcut for literati. short shrift for 431 * genetic engineers? 432 */ 433 if (j >= 0) { 434 k++; /* no match; restart next char */ 435 continue; 436 } 437 k = submatch(file, pat, str, strend, k, count); 438 if (k == NULL) 439 return; 440 } 441 if (nflag) { 442 if (prevmatch) 443 nline = prevnline + nlcount(prevloc, k); 444 else 445 nline = nline + nlcount(str, k); 446 prevmatch = 0; 447 } 448 strncpy(str, linetemp, nleftover); 449 } 450 if (cflag) { 451 /* Bug from old grep: -c overrides -h. We fix the bug. */ 452 if (!hflag) 453 printf("%s:", file); 454 printf("%ld\n", nmatch); 455 } 456 } 457 458 char * 459 linesave(str, count) /* accumulate partial line at end of buffer */ 460 char str[]; 461 register int count; 462 { 463 register int j; 464 465 count += nleftover; 466 if (count != BUFSIZE && fd != 0) 467 str[count++] = NL; /* insurance for broken last line */ 468 str[count] = EOS; 469 for (j = count - 1; str[j] != NL && j >= 0;) 470 j--; 471 /* 472 * break up these lines: long line (> BUFSIZE), last line of file, or 473 * short return from read(), as from tee(1) input 474 */ 475 if (j < 0 && (count == (BUFSIZE - nleftover))) { 476 str[count++] = NL; 477 str[count] = EOS; 478 linetemp[0] = EOS; 479 nleftover = 0; 480 return (str + count); 481 } else { 482 nleftover = count - j - 1; 483 strncpy(linetemp, str + j + 1, nleftover); 484 return (str + j); 485 } 486 } 487 488 /* 489 * Process partial match. First check for mis-aligned Kanji, then match line 490 * against full compiled r.e. if statistics do not warrant handing off to 491 * standard egrep. 492 */ 493 char * 494 submatch(file, pat, str, strend, k, altindex) 495 char file[], pat[], str[]; 496 register char *strend, *k; 497 int altindex; 498 { 499 register char *s; 500 char *t, c; 501 502 t = k; 503 s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1); 504 #ifndef NOKANJI 505 c = ((altflag) ? altpat[altindex][0] : pat[0]); 506 if (c & NONASCII) 507 if ((s = kanji(str, s, k)) == NULL) 508 return (++k); /* reject false kanji */ 509 #endif 510 do; 511 while (*s != NL && --s >= str); 512 k = s + 1; /* now at line start */ 513 514 if (boyonly) 515 return (gotamatch(file, k)); 516 517 incount = counted - (strend - k); 518 if (boyfound++ == FIRSTFEW) 519 execstrategy(file); 520 521 s = t; 522 do 523 rxcount++; 524 while (*s++ != NL); 525 *--s = EOS; 526 /* 527 * "quick henry -- the flit" (after theodor geisel) 528 */ 529 if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) { 530 *s = NL; 531 if (gotamatch(file, k) == NULL) 532 return (NULL); 533 } 534 *s = NL; 535 return (s + 1); 536 } 537 538 #ifndef NOKANJI 539 /* 540 * EUC code disambiguation -- scan backwards to first 7-bit code, while 541 * counting intervening 8-bit codes. If odd, reject unaligned Kanji pattern. 542 * SS2/3 checks are for intermixed Japanase Katakana or Kanji2. 543 */ 544 char * 545 kanji(str, s, k) 546 register char *str, *s, *k; 547 { 548 register int j = 0; 549 550 for (s--; s >= str; s--) { 551 if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0) 552 break; 553 j++; 554 } 555 #ifndef CHINESE 556 if (*s == SS2) 557 j -= 1; 558 #endif CHINESE 559 return ((j & 01) ? NULL : k); 560 } 561 #endif 562 563 /* 564 * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c] 565 */ 566 gosper(pattern) 567 char *pattern; /* ... HAKMEM lives ... */ 568 { 569 register int i, j; 570 unsigned char c; 571 572 /* Make one-string case look like simple alternatives case */ 573 if (!altflag) { 574 nalt = 1; 575 altmin = altlen[0] = strlen(pattern); 576 altpat[0] = pattern; 577 } 578 /* For chars that aren't in any string, skip by string length. */ 579 for (j = 0; j < 256; j++) { 580 delta0[j] = altmin; 581 cmap[j] = j; /* Sneak in initialization of cmap */ 582 } 583 584 /* For chars in a string, skip distance from char to end of string. */ 585 /* (If char appears more than once, skip minimum distance.) */ 586 for (i = 0; i < nalt; i++) 587 for (j = 0; j < altlen[i] - 1; j++) { 588 c = altpat[i][j]; 589 delta0[c] = MIN(delta0[c], altlen[i] - j - 1); 590 if (iflag && islower((int) c)) 591 delta0[toupper((int) c)] = delta0[c]; 592 } 593 594 /* For last char of each string, fall out of search loop. */ 595 for (i = 0; i < nalt; i++) { 596 c = altpat[i][altlen[i] - 1]; 597 delta0[c] = LARGE; 598 if (iflag && islower((int) c)) 599 delta0[toupper((int) c)] = LARGE; 600 } 601 if (iflag) 602 for (j = 'A'; j <= 'Z'; j++) 603 cmap[j] = tolower((int) j); 604 } 605 606 /* 607 * Print, count, or stop on full match. Result is either the location for 608 * continued search, or NULL to stop. 609 */ 610 char * 611 gotamatch(file, s) 612 register char *file, *s; 613 { 614 char *savematch(); 615 int squirrel = 0; /* nonzero to squirrel away FIRSTFEW matches */ 616 617 nmatch++; 618 nsuccess = 1; 619 if (!boyonly && boyfound <= FIRSTFEW && file != NULL) 620 squirrel = 1; 621 622 if (sflag) 623 return (NULL); /* -s usurps all flags (unlike some versions) */ 624 if (cflag) { /* -c overrides -l, we guess */ 625 do; 626 while (*s++ != NL); 627 } else if (lflag) { 628 puts(file); 629 return (NULL); 630 } else { 631 if (!hflag) 632 if (!squirrel) 633 printf("%s:", file); 634 else 635 (void)sprintf(preamble, "%s:", file); 636 if (nflag) { 637 if (prevmatch) 638 prevnline = prevnline + nlcount(prevloc, s); 639 else 640 prevnline = nline + nlcount(str, s); 641 prevmatch = 1; 642 643 if (!squirrel) 644 printf("%ld:", prevnline); 645 else 646 (void)sprintf(preamble + strlen(preamble), 647 "%ld:", prevnline); 648 } 649 if (!squirrel) { 650 do 651 putchar(*s); 652 while (*s++ != NL); 653 } else 654 s = savematch(s); 655 656 if (nflag) 657 prevloc = s - 1; 658 } 659 return ((firstflag && !cflag) ? NULL : s); 660 } 661 662 char * 663 fold(line) 664 char *line; 665 { 666 static char fline[BUFSIZE]; 667 register char *s, *t = fline; 668 669 for (s = line; *s != EOS; s++) 670 *t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s); 671 *t = EOS; 672 return (fline); 673 } 674 675 strindex(s, t) /* the easy way, as in K&P, p. 192 */ 676 char *s, *t; 677 { 678 int i, n; 679 680 n = strlen(t); 681 for (i = 0; s[i] != '\0'; i++) 682 if (strncmp(s + i, t, n) == 0) 683 return (i); 684 return (-1); 685 } 686 687 char * 688 grepxlat(pattern) /* grep pattern meta conversion */ 689 char *pattern; 690 { 691 register char *p, *s; 692 static char newpat[BUFSIZE]; 693 694 for (s = newpat, p = pattern; *p != EOS;) { 695 if (*p == '\\') { /* skip escapes ... */ 696 *s++ = *p++; 697 if (*p) 698 *s++ = *p++; 699 } else if (*p == '[') { /* ... and char classes */ 700 while (*p != EOS && *p != ']') 701 *s++ = *p++; 702 } else if (strchr("+?|()", *p) != NULL) { 703 *s++ = '\\'; /* insert protection */ 704 *s++ = *p++; 705 } else 706 *s++ = *p++; 707 } 708 *s = EOS; 709 grepflag = ((patind) ? 0 : 1); 710 return (newpat); 711 } 712 713 /* 714 * Test for simple alternation. Result is NULL if it's not so simple, or is 715 * a pointer to the first string if it is. Warning: sscanf size is a 716 * fixpoint, beyond which the speedup linearity starts to break down. In the 717 * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing 718 * altpat[] to arbitrary size is not useful. 719 */ 720 char * 721 alternate(regexpr) 722 char *regexpr; 723 { 724 register int i, j; 725 register char *start, *stop; 726 unsigned char c; 727 728 if (fgrepflag && strchr(regexpr, '|')) 729 return (NULL); 730 731 /* 732 * break pattern up into altpat array; delimit on newline, bar, 733 * or EOS. We know we won't overflow, we've already checked the 734 * number of patterns we're going to find against NALT. 735 * Also, set length of pattern and find minimum pattern length. 736 */ 737 nalt = 0; 738 altmin = NMUSH; 739 for (start = stop = regexpr;; ++stop) 740 if (!*stop || *stop == '|' || *stop == NL) { 741 altlen[nalt] = j = stop - start; 742 if (j < altmin) 743 altmin = j; 744 if (!(altpat[nalt] = malloc((u_int)(j + 1)))) 745 oops("out of memory"); 746 bcopy(start, altpat[nalt], j); 747 altpat[nalt][j] = EOS; 748 ++nalt; 749 if (!*stop) 750 break; 751 if (nalt == NALT) 752 return(NULL); 753 if (*stop == NL) 754 *stop = '|'; 755 start = stop + 1; 756 } 757 if (!fgrepflag) { 758 if (strchr(regexpr, '|') == NULL || regexpr[0] == '|') 759 return (NULL); 760 if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL 761 || strindex(regexpr, "||") >= 0) 762 return (NULL); 763 } 764 765 if (nalt > 1) { /* build superimposed "pre-match" sets per 766 * char */ 767 altflag++; 768 for (j = 0; j < nalt; j++) 769 for (i = 0; i < altmin; i++) { 770 c = altpat[j][altlen[j] - altmin + i]; 771 altset[i + 1][c] = 1; /* offset for sentinel */ 772 } 773 } 774 return (altpat[0]); 775 } 776 777 /* 778 * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to 779 * determine whether to use dfa-based egrep: We do FIRSTFEW matches with 780 * regexec(). If Boyer-Moore up to now matched more than PUNTPERCENT 781 * of the input, the r.e. is likely to be underspecified, so do old *grep, 782 * which is faster on complex patterns than regexp(). At FIRSTFEW, 783 * dump the saved matches collected by savematch(). They are saved 784 * so that a "PUNT" can "rewind" to ignore them. Stdin is problematic, 785 * since it's hard to rewind. 786 */ 787 788 execstrategy(file) 789 char *file; 790 { 791 int pctmatch; 792 793 pctmatch = (100 * rxcount) / incount; 794 if (pctmatch > PUNTPERCENT && file != NULL) 795 kernighan(args); 796 if (file != NULL) 797 flushmatches(); 798 } 799 800 nlcount(bstart, bstop) /* flail interval to totalize newlines. */ 801 char *bstart, *bstop; 802 { 803 register char *s = bstart; 804 register char *t = bstop; 805 register int count = 0; 806 807 do { /* loop unroll for older architectures */ 808 if (*t == NL) /* ... ask ames!jaw for sample code */ 809 count++; 810 } while (t-- > s); 811 812 return (count); 813 } 814 815 char * 816 isolate(regexpr) /* isolate longest metacharacter-free string */ 817 char *regexpr; 818 { 819 char *dummyexpr; 820 821 /* 822 * We add (.)* because Henry's regcomp only figures regmust if it 823 * sees a leading * pattern. Foo! 824 */ 825 dummyexpr = malloc((unsigned) strlen(regexpr) + 5); 826 (void)sprintf(dummyexpr, "(.)*%s", regexpr); 827 if ((rspencer = regcomp(dummyexpr)) == NULL) 828 kernighan(args); 829 return (rspencer->regmust); 830 } 831 832 char *matches[FIRSTFEW]; 833 static int mcount = 0; 834 835 char * 836 savematch(s) /* horde matches during statistics gathering */ 837 register char *s; 838 { 839 char *p; 840 char *start = s; 841 int msize = 0; 842 int psize = strlen(preamble); 843 844 while (*s++ != NL) 845 msize++; 846 *--s = EOS; 847 848 p = malloc((unsigned) msize + 1 + psize); 849 strcpy(p, preamble); 850 strcpy(p + psize, start); 851 matches[mcount++] = p; 852 853 preamble[0] = 0; 854 *s = NL; 855 return (s); 856 } 857 858 flushmatches() 859 { 860 int n; 861 862 flushflag = 1; 863 for (n = 0; n < mcount; n++) 864 printf("%s\n", matches[n]); 865 mcount = 0; 866 } 867 868 oops(message) 869 char *message; 870 { 871 fprintf(stderr, "%s: %s\n", progname, message); 872 exit(2); 873 } 874 875 kernighan(args) /* "let others do the hard part ..." */ 876 char *args[]; 877 { 878 /* 879 * We may have already run grep on some of the files; remove them 880 * from the arg list we pass on. Note that we can't delete them 881 * totally because the number of file names affects the output 882 * (automatic -h). 883 */ 884 /* better would be fork/exec per punted file -- jaw */ 885 886 while (firstfile && optind > firstfile) 887 args[firstfile++] = _PATH_DEVNULL; 888 if (patind) 889 args[patind] = pattern; 890 (void) fflush(stdout); 891 892 if (grepflag) 893 execvp(_PATH_GREPSTD, args), oops("can't exec old 'grep'"); 894 else if (fgrepflag) 895 execvp(_PATH_FGREPSTD, args), oops("can't exec old 'fgrep'"); 896 else 897 execvp(_PATH_EGREPSTD, args), oops("can't exec old 'egrep'"); 898 } 899