1 /*-
2 * Copyright (c) 1991, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * %sccs.include.redist.c%
6 */
7
8 #ifndef lint
9 static char copyright[] =
10 "@(#) Copyright (c) 1991, 1993\n\
11 The Regents of the University of California. All rights reserved.\n";
12 #endif /* not lint */
13
14 #ifndef lint
15 static char sccsid[] = "@(#)egrep.c 8.1 (Berkeley) 06/06/93";
16 #endif /* not lint */
17
18 /*
19 Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0
20 table as in original paper (CACM, October, 1977). No delta1 or delta2.
21 According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of
22 minimal practical value. However, to improve for worst case input,
23 integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J.
24 Comput., Feb. 1986) deserves consideration.
25
26 Method: extract longest metacharacter-free string from expression.
27 this is done using a side-effect from henry spencer's regcomp().
28 use boyer-moore to match such, then pass submatching lines
29 to either regexp() or standard 'egrep', depending on certain
30 criteria within execstrategy() below. [this tradeoff is due
31 to the general slowness of the regexp() nondeterministic
32 machine on complex expressions, as well as the startup time
33 of standard 'egrep' on short files.] alternatively, one may
34 change the vendor-supplied 'egrep' automaton to include
35 boyer-moore directly. see accompanying writeup for discussion
36 of kanji expression treatment.
37
38 late addition: apply trickbag for fast match of simple
39 alternations (sublinear, in common low-cardinality cases).
40 trap fgrep into this lair.
41
42 gnu additions: -f, newline as |, \< and \> [in regexec()], more
43 comments. inspire better dfa exec() strategy.
44 serious testing and help with special cases.
45
46 Algorithm amalgam summary:
47
48 dfa e?grep (aho/thompson)
49 ndfa regexp() (spencer/aho)
50 bmg (boyer/moore/gosper)
51 "superimposed" bmg (jaw)
52 fgrep (aho/corrasick)
53
54 sorry, but the knuth/morris/pratt machine, horspool's
55 "frequentist" code, and the rabin/karp matcher, however cute,
56 just don't cut it for this production.
57
58 James A. Woods Copyright (c) 1986
59 NASA Ames Research Center
60 */
61 #include <sys/types.h>
62 #include <sys/stat.h>
63 #include <sys/file.h>
64 #include <regexp.h> /* must be henry spencer's version */
65 #include <stdio.h>
66 #include <ctype.h>
67 #include "pathnames.h"
68
69 #define MIN(A, B) ((A) > (B) ? (B) : (A))
70
71 #ifdef SLOWSYS
72 #define read xread
73 #endif
74
75 #define BUFSIZE 8192 /* make higher for cray */
76 #define PATSIZE 6000
77 #define LARGE BUFSIZE + PATSIZE
78
79 #define NALT 7 /* tied to scanf() size in alternate() */
80 #define NMUSH 6 /* loosely relates to expected alt length */
81
82 #define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */
83 #define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input
84 * was processed by regexp(), exec std egrep. */
85 #define NL '\n'
86 #define EOS '\0'
87 #define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */
88 #define META "\n^$.[]()?+*|\\" /* egrep meta-characters */
89 #define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */
90 #define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */
91
92 extern char *optarg;
93 extern int optind;
94 char *progname;
95
96 int cflag, iflag, eflag, fflag, lflag, nflag; /* SVID flags */
97 int sflag, hflag; /* v7, v8, bsd */
98
99 int firstflag; /* Stop at first match */
100 int grepflag; /* Called as "grep" */
101 int fgrepflag; /* Called as "fgrep" */
102 int altflag; /* Simple alternation in pattern */
103 int boyonly; /* No regexp needed -- all simple */
104 int flushflag;
105 int grepold, egrepold, fgrepold;
106
107 int nalt; /* Number of alternatives */
108 int nsuccess; /* 1 for match, 2 for error */
109 int altmin; /* Minimum length of all the alternate
110 * strings */
111 int firstfile; /* argv index of first file argument */
112 int patind; /* argv index of pattern */
113 long nmatch; /* Number of matches in this file */
114 long incount, counted; /* Amount of input consumed */
115 long rxcount; /* Bytes of input processed by regexec() */
116 int boyfound; /* accumulated partial matches (tripped by
117 * FIRSTFEW) */
118 int prevmatch; /* next three lines aid fast -n */
119 long nline, prevnline;
120 char *prevloc;
121
122 regexp *rspencer;
123 char *pattern;
124 char *patboy; /* Pattern for simple Boyer-Moore */
125 char *patfile; /* Filename containing pattern(s) */
126
127 int delta0[256]; /* Boyer-Moore algorithm core */
128 char cmap[256]; /* Usually 0-255, but if -i, maps upper to
129 * lower case */
130 char str[BUFSIZE + 2];
131 int nleftover;
132 char linetemp[BUFSIZE];
133 char *altpat[NALT]; /* alternation component storage */
134 int altlen[NALT];
135 short altset[NMUSH + 1][256];
136 char preamble[200]; /* match prefix (filename, line no.) */
137
138 int fd;
139 char *
140 strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc();
141 char *
142 grepxlat(), *fold(), *pfile(), *alternate(), *isolate();
143 char *gotamatch(), *kanji(), *linesave(), *submatch();
144 char **args;
145
main(argc,argv)146 main(argc, argv)
147 int argc;
148 char *argv[];
149 {
150 int c, oflag;
151 int errflag = 0;
152
153 args = argv;
154
155 if ((progname = strrchr(argv[0], '/')) != 0)
156 progname++;
157 else
158 progname = argv[0];
159 if (strcmp(progname, "grep") == 0)
160 grepflag++;
161 else if (strcmp(progname, "fgrep") == 0)
162 fgrepflag++;
163
164 oflag = 0;
165 while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) {
166 switch (c) {
167
168 case 'f':
169 fflag++;
170 patfile = optarg;
171 continue;
172 case 'b':
173 case 'v':
174 egrepold++; /* boyer-moore of little help here */
175 continue;
176 case 'c':
177 cflag++;
178 continue;
179 case 'e':
180 eflag++;
181 pattern = optarg;
182 continue;
183 case 'h':
184 hflag++;
185 continue;
186 case 'o':
187 oflag++;
188 continue;
189 case '1': /* Stop at very first match */
190 firstflag++; /* spead freaks only */
191 continue;
192 case 'i':
193 iflag++;
194 continue;
195 case 'l':
196 lflag++;
197 continue;
198 case 'n':
199 nflag++;
200 continue;
201 case 's':
202 sflag++;
203 continue;
204 case 'w':
205 case 'y':
206 if (!grepflag)
207 errflag++;
208 grepold++;
209 continue;
210 case 'x': /* needs more work, like -b above */
211 if (!fgrepflag)
212 errflag++;
213 fgrepold++;
214 continue;
215 case '?':
216 errflag++;
217 }
218 }
219 if (errflag || ((argc <= optind) && !fflag && !eflag)) {
220 if (grepflag)
221 oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]");
222 else if (fgrepflag)
223 oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]");
224 else /* encourage SVID options, though we provide
225 * others */
226 oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]");
227 }
228 if (fflag)
229 pattern = pfile(patfile);
230 else if (!eflag) {
231 patind = optind;
232 pattern = argv[optind++];
233 }
234
235 if (!oflag && (argc - optind) <= 1) /* Filename invisible given < 2 files */
236 hflag++;
237 if (pattern[0] == EOS)
238 kernighan(argv); /* same as it ever was */
239 /*
240 * 'grep/egrep' merger -- "old" grep is called to handle: tagged
241 * exprs \( \), word matches \< and \>, -w and -y options, char
242 * classes with '-' at end (egrep bug?), and patterns beginning with
243 * an asterisk (don't ask why). otherwise, characters meaningful to
244 * 'egrep' but not to 'grep' are escaped; the entire expr is then
245 * passed to 'egrep'.
246 */
247 if (grepflag && !grepold) {
248 if (strindex(pattern, "\\(") >= 0 ||
249 strindex(pattern, "\\<") >= 0 ||
250 strindex(pattern, "\\>") >= 0 ||
251 strindex(pattern, "-]") >= 0 ||
252 pattern[0] == '*') /* grep bug */
253 grepold++;
254 else
255 pattern = grepxlat(pattern);
256 }
257 if (grepold || egrepold || fgrepold)
258 kernighan(argv);
259
260 if (iflag)
261 strcpy(pattern, fold(pattern));
262 /*
263 * If the pattern is a plain string, just run boyer-moore. If it
264 * consists of meta-free alternatives, run "superimposed" bmg.
265 * Otherwise, find best string, and compile pattern for regexec().
266 */
267 if (strpbrk(pattern, META) == NULL) { /* do boyer-moore only */
268 boyonly++;
269 patboy = pattern;
270 } else {
271 if ((patboy = alternate(pattern)) != NULL)
272 boyonly++;
273 else {
274 if ((patboy = isolate(pattern)) == NULL)
275 kernighan(argv); /* expr too involved */
276 #ifndef NOKANJI
277 for (c = 0; pattern[c] != EOS; c++)
278 if (pattern[c] & NONASCII) /* kanji + meta */
279 kernighan(argv);
280 #endif
281 if ((rspencer = regcomp(pattern)) == NULL)
282 oops("regcomp failure");
283 }
284 }
285 gosper(patboy); /* "pre-conditioning is wonderful"
286 * -- v. strassen */
287
288 if ((firstfile = optind) >= argc) {
289 /* Grep standard input */
290 if (lflag) /* We don't know its name! */
291 exit(1);
292 egsecute((char *) NULL);
293 } else {
294 while (optind < argc) {
295 egsecute(argv[optind]);
296 optind++;
297 if (firstflag && (nsuccess == 1))
298 break;
299 }
300 }
301 exit((nsuccess == 2) ? 2 : (nsuccess == 0));
302 }
303
304 char *
pfile(pfname)305 pfile(pfname) /* absorb expression from file */
306 char *pfname;
307 {
308 int fd;
309 struct stat patstat;
310 static char *pat;
311
312 if ((fd = open(pfname, O_RDONLY, 0)) < 0)
313 oops("can't read pattern file");
314 if (fstat(fd, &patstat) != 0)
315 oops("can't stat pattern file");
316 if (patstat.st_size > PATSIZE) {
317 if (fgrepflag) { /* defer to unix version */
318 fgrepold++;
319 return "dummy";
320 } else
321 oops("pattern file too big");
322 }
323 if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL)
324 oops("out of memory to read pattern file");
325 if (patstat.st_size != read(fd, pat, (int)patstat.st_size))
326 oops("error reading pattern file");
327 (void) close(fd);
328
329 pat[patstat.st_size] = EOS;
330 if (pat[patstat.st_size - 1] == NL) /* NOP for egrep; helps grep */
331 pat[patstat.st_size - 1] = EOS;
332
333 if (nlcount(pat, &pat[patstat.st_size]) > NALT) {
334 if (fgrepflag)
335 fgrepold++; /* "what's it all about, alfie?" */
336 else
337 egrepold++;
338 }
339 return (pat);
340 }
341
egsecute(file)342 egsecute(file)
343 char *file;
344 {
345 extern int errno;
346
347 if (file == NULL)
348 fd = 0;
349 else if ((fd = open(file, O_RDONLY, 0)) <= 0) {
350 fprintf(stderr,
351 "%s: %s: %s\n", progname, file, strerror(errno));
352 nsuccess = 2;
353 return;
354 }
355 chimaera(file, patboy);
356
357 if (!boyonly && !flushflag && file != NULL)
358 flushmatches();
359 if (file != NULL)
360 close(fd);
361 }
362
chimaera(file,pat)363 chimaera(file, pat) /* "reach out and boyer-moore search someone" */
364 char *file, *pat; /* -- soon-to-be-popular bumper sticker */
365 {
366 register char *k, *strend, *s;
367 register int j, count;
368 register int *deltazero = delta0;
369 int patlen = altmin;
370 char *t;
371
372 nleftover = boyfound = flushflag = 0;
373 nline = 1L;
374 prevmatch = 0;
375 nmatch = counted = rxcount = 0L;
376
377 while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) {
378
379 counted += count;
380 strend = linesave(str, count);
381
382 for (k = str + patlen - 1; k < strend;) {
383 /*
384 * for a large class of patterns, upwards of 80% of
385 * match time is spent on the next line. we beat
386 * existing microcode (vax 'matchc') this way.
387 */
388 while ((k += deltazero[*(unsigned char *) k]) < strend);
389 if (k < (str + LARGE))
390 break;
391 k -= LARGE;
392
393 if (altflag) {
394 /*
395 * Parallel Boyer-Moore. Check whether each
396 * of the previous <altmin> chars COULD be
397 * from one of the alternative strings.
398 */
399 s = k - 1;
400 j = altmin;
401 while (altset[--j][(unsigned char)
402 cmap[*(unsigned char *) s--]]);
403 /*
404 * quick test fails. in this life, compare
405 * 'em all. but, a "reverse trie" would
406 * attenuate worst case (linear w/delta2?).
407 */
408 if (--j < 0) {
409 count = nalt - 1;
410 do {
411 s = k;
412 j = altlen[count];
413 t = altpat[count];
414
415 while
416 (cmap[*(unsigned char *) s--]
417 == t[--j]);
418 if (j < 0)
419 break;
420 }
421 while (count--);
422 }
423 } else {
424 /* One string -- check it */
425 j = patlen - 1;
426 s = k - 1;
427 while (cmap[*(unsigned char *) s--] == pat[--j]);
428 }
429 /*
430 * delta-less shortcut for literati. short shrift for
431 * genetic engineers?
432 */
433 if (j >= 0) {
434 k++; /* no match; restart next char */
435 continue;
436 }
437 k = submatch(file, pat, str, strend, k, count);
438 if (k == NULL)
439 return;
440 }
441 if (nflag) {
442 if (prevmatch)
443 nline = prevnline + nlcount(prevloc, k);
444 else
445 nline = nline + nlcount(str, k);
446 prevmatch = 0;
447 }
448 strncpy(str, linetemp, nleftover);
449 }
450 if (cflag) {
451 /* Bug from old grep: -c overrides -h. We fix the bug. */
452 if (!hflag)
453 printf("%s:", file);
454 printf("%ld\n", nmatch);
455 }
456 }
457
458 char *
linesave(str,count)459 linesave(str, count) /* accumulate partial line at end of buffer */
460 char str[];
461 register int count;
462 {
463 register int j;
464
465 count += nleftover;
466 if (count != BUFSIZE && fd != 0)
467 str[count++] = NL; /* insurance for broken last line */
468 str[count] = EOS;
469 for (j = count - 1; str[j] != NL && j >= 0;)
470 j--;
471 /*
472 * break up these lines: long line (> BUFSIZE), last line of file, or
473 * short return from read(), as from tee(1) input
474 */
475 if (j < 0 && (count == (BUFSIZE - nleftover))) {
476 str[count++] = NL;
477 str[count] = EOS;
478 linetemp[0] = EOS;
479 nleftover = 0;
480 return (str + count);
481 } else {
482 nleftover = count - j - 1;
483 strncpy(linetemp, str + j + 1, nleftover);
484 return (str + j);
485 }
486 }
487
488 /*
489 * Process partial match. First check for mis-aligned Kanji, then match line
490 * against full compiled r.e. if statistics do not warrant handing off to
491 * standard egrep.
492 */
493 char *
submatch(file,pat,str,strend,k,altindex)494 submatch(file, pat, str, strend, k, altindex)
495 char file[], pat[], str[];
496 register char *strend, *k;
497 int altindex;
498 {
499 register char *s;
500 char *t, c;
501
502 t = k;
503 s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1);
504 #ifndef NOKANJI
505 c = ((altflag) ? altpat[altindex][0] : pat[0]);
506 if (c & NONASCII)
507 if ((s = kanji(str, s, k)) == NULL)
508 return (++k); /* reject false kanji */
509 #endif
510 do;
511 while (*s != NL && --s >= str);
512 k = s + 1; /* now at line start */
513
514 if (boyonly)
515 return (gotamatch(file, k));
516
517 incount = counted - (strend - k);
518 if (boyfound++ == FIRSTFEW)
519 execstrategy(file);
520
521 s = t;
522 do
523 rxcount++;
524 while (*s++ != NL);
525 *--s = EOS;
526 /*
527 * "quick henry -- the flit" (after theodor geisel)
528 */
529 if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) {
530 *s = NL;
531 if (gotamatch(file, k) == NULL)
532 return (NULL);
533 }
534 *s = NL;
535 return (s + 1);
536 }
537
538 #ifndef NOKANJI
539 /*
540 * EUC code disambiguation -- scan backwards to first 7-bit code, while
541 * counting intervening 8-bit codes. If odd, reject unaligned Kanji pattern.
542 * SS2/3 checks are for intermixed Japanase Katakana or Kanji2.
543 */
544 char *
kanji(str,s,k)545 kanji(str, s, k)
546 register char *str, *s, *k;
547 {
548 register int j = 0;
549
550 for (s--; s >= str; s--) {
551 if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0)
552 break;
553 j++;
554 }
555 #ifndef CHINESE
556 if (*s == SS2)
557 j -= 1;
558 #endif CHINESE
559 return ((j & 01) ? NULL : k);
560 }
561 #endif
562
563 /*
564 * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c]
565 */
gosper(pattern)566 gosper(pattern)
567 char *pattern; /* ... HAKMEM lives ... */
568 {
569 register int i, j;
570 unsigned char c;
571
572 /* Make one-string case look like simple alternatives case */
573 if (!altflag) {
574 nalt = 1;
575 altmin = altlen[0] = strlen(pattern);
576 altpat[0] = pattern;
577 }
578 /* For chars that aren't in any string, skip by string length. */
579 for (j = 0; j < 256; j++) {
580 delta0[j] = altmin;
581 cmap[j] = j; /* Sneak in initialization of cmap */
582 }
583
584 /* For chars in a string, skip distance from char to end of string. */
585 /* (If char appears more than once, skip minimum distance.) */
586 for (i = 0; i < nalt; i++)
587 for (j = 0; j < altlen[i] - 1; j++) {
588 c = altpat[i][j];
589 delta0[c] = MIN(delta0[c], altlen[i] - j - 1);
590 if (iflag && islower((int) c))
591 delta0[toupper((int) c)] = delta0[c];
592 }
593
594 /* For last char of each string, fall out of search loop. */
595 for (i = 0; i < nalt; i++) {
596 c = altpat[i][altlen[i] - 1];
597 delta0[c] = LARGE;
598 if (iflag && islower((int) c))
599 delta0[toupper((int) c)] = LARGE;
600 }
601 if (iflag)
602 for (j = 'A'; j <= 'Z'; j++)
603 cmap[j] = tolower((int) j);
604 }
605
606 /*
607 * Print, count, or stop on full match. Result is either the location for
608 * continued search, or NULL to stop.
609 */
610 char *
gotamatch(file,s)611 gotamatch(file, s)
612 register char *file, *s;
613 {
614 char *savematch();
615 int squirrel = 0; /* nonzero to squirrel away FIRSTFEW matches */
616
617 nmatch++;
618 nsuccess = 1;
619 if (!boyonly && boyfound <= FIRSTFEW && file != NULL)
620 squirrel = 1;
621
622 if (sflag)
623 return (NULL); /* -s usurps all flags (unlike some versions) */
624 if (cflag) { /* -c overrides -l, we guess */
625 do;
626 while (*s++ != NL);
627 } else if (lflag) {
628 puts(file);
629 return (NULL);
630 } else {
631 if (!hflag)
632 if (!squirrel)
633 printf("%s:", file);
634 else
635 (void)sprintf(preamble, "%s:", file);
636 if (nflag) {
637 if (prevmatch)
638 prevnline = prevnline + nlcount(prevloc, s);
639 else
640 prevnline = nline + nlcount(str, s);
641 prevmatch = 1;
642
643 if (!squirrel)
644 printf("%ld:", prevnline);
645 else
646 (void)sprintf(preamble + strlen(preamble),
647 "%ld:", prevnline);
648 }
649 if (!squirrel) {
650 do
651 putchar(*s);
652 while (*s++ != NL);
653 } else
654 s = savematch(s);
655
656 if (nflag)
657 prevloc = s - 1;
658 }
659 return ((firstflag && !cflag) ? NULL : s);
660 }
661
662 char *
fold(line)663 fold(line)
664 char *line;
665 {
666 static char fline[BUFSIZE];
667 register char *s, *t = fline;
668
669 for (s = line; *s != EOS; s++)
670 *t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s);
671 *t = EOS;
672 return (fline);
673 }
674
strindex(s,t)675 strindex(s, t) /* the easy way, as in K&P, p. 192 */
676 char *s, *t;
677 {
678 int i, n;
679
680 n = strlen(t);
681 for (i = 0; s[i] != '\0'; i++)
682 if (strncmp(s + i, t, n) == 0)
683 return (i);
684 return (-1);
685 }
686
687 char *
grepxlat(pattern)688 grepxlat(pattern) /* grep pattern meta conversion */
689 char *pattern;
690 {
691 register char *p, *s;
692 static char newpat[BUFSIZE];
693
694 for (s = newpat, p = pattern; *p != EOS;) {
695 if (*p == '\\') { /* skip escapes ... */
696 *s++ = *p++;
697 if (*p)
698 *s++ = *p++;
699 } else if (*p == '[') { /* ... and char classes */
700 while (*p != EOS && *p != ']')
701 *s++ = *p++;
702 } else if (strchr("+?|()", *p) != NULL) {
703 *s++ = '\\'; /* insert protection */
704 *s++ = *p++;
705 } else
706 *s++ = *p++;
707 }
708 *s = EOS;
709 grepflag = ((patind) ? 0 : 1);
710 return (newpat);
711 }
712
713 /*
714 * Test for simple alternation. Result is NULL if it's not so simple, or is
715 * a pointer to the first string if it is. Warning: sscanf size is a
716 * fixpoint, beyond which the speedup linearity starts to break down. In the
717 * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing
718 * altpat[] to arbitrary size is not useful.
719 */
720 char *
alternate(regexpr)721 alternate(regexpr)
722 char *regexpr;
723 {
724 register int i, j;
725 register char *start, *stop;
726 unsigned char c;
727
728 if (fgrepflag && strchr(regexpr, '|'))
729 return (NULL);
730
731 /*
732 * break pattern up into altpat array; delimit on newline, bar,
733 * or EOS. We know we won't overflow, we've already checked the
734 * number of patterns we're going to find against NALT.
735 * Also, set length of pattern and find minimum pattern length.
736 */
737 nalt = 0;
738 altmin = NMUSH;
739 for (start = stop = regexpr;; ++stop)
740 if (!*stop || *stop == '|' || *stop == NL) {
741 altlen[nalt] = j = stop - start;
742 if (j < altmin)
743 altmin = j;
744 if (!(altpat[nalt] = malloc((u_int)(j + 1))))
745 oops("out of memory");
746 bcopy(start, altpat[nalt], j);
747 altpat[nalt][j] = EOS;
748 ++nalt;
749 if (!*stop)
750 break;
751 if (nalt == NALT)
752 return(NULL);
753 if (*stop == NL)
754 *stop = '|';
755 start = stop + 1;
756 }
757 if (!fgrepflag) {
758 if (strchr(regexpr, '|') == NULL || regexpr[0] == '|')
759 return (NULL);
760 if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL
761 || strindex(regexpr, "||") >= 0)
762 return (NULL);
763 }
764
765 if (nalt > 1) { /* build superimposed "pre-match" sets per
766 * char */
767 altflag++;
768 for (j = 0; j < nalt; j++)
769 for (i = 0; i < altmin; i++) {
770 c = altpat[j][altlen[j] - altmin + i];
771 altset[i + 1][c] = 1; /* offset for sentinel */
772 }
773 }
774 return (altpat[0]);
775 }
776
777 /*
778 * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to
779 * determine whether to use dfa-based egrep: We do FIRSTFEW matches with
780 * regexec(). If Boyer-Moore up to now matched more than PUNTPERCENT
781 * of the input, the r.e. is likely to be underspecified, so do old *grep,
782 * which is faster on complex patterns than regexp(). At FIRSTFEW,
783 * dump the saved matches collected by savematch(). They are saved
784 * so that a "PUNT" can "rewind" to ignore them. Stdin is problematic,
785 * since it's hard to rewind.
786 */
787
execstrategy(file)788 execstrategy(file)
789 char *file;
790 {
791 int pctmatch;
792
793 pctmatch = (100 * rxcount) / incount;
794 if (pctmatch > PUNTPERCENT && file != NULL)
795 kernighan(args);
796 if (file != NULL)
797 flushmatches();
798 }
799
nlcount(bstart,bstop)800 nlcount(bstart, bstop) /* flail interval to totalize newlines. */
801 char *bstart, *bstop;
802 {
803 register char *s = bstart;
804 register char *t = bstop;
805 register int count = 0;
806
807 do { /* loop unroll for older architectures */
808 if (*t == NL) /* ... ask ames!jaw for sample code */
809 count++;
810 } while (t-- > s);
811
812 return (count);
813 }
814
815 char *
isolate(regexpr)816 isolate(regexpr) /* isolate longest metacharacter-free string */
817 char *regexpr;
818 {
819 char *dummyexpr;
820
821 /*
822 * We add (.)* because Henry's regcomp only figures regmust if it
823 * sees a leading * pattern. Foo!
824 */
825 dummyexpr = malloc((unsigned) strlen(regexpr) + 5);
826 (void)sprintf(dummyexpr, "(.)*%s", regexpr);
827 if ((rspencer = regcomp(dummyexpr)) == NULL)
828 kernighan(args);
829 return (rspencer->regmust);
830 }
831
832 char *matches[FIRSTFEW];
833 static int mcount = 0;
834
835 char *
savematch(s)836 savematch(s) /* horde matches during statistics gathering */
837 register char *s;
838 {
839 char *p;
840 char *start = s;
841 int msize = 0;
842 int psize = strlen(preamble);
843
844 while (*s++ != NL)
845 msize++;
846 *--s = EOS;
847
848 p = malloc((unsigned) msize + 1 + psize);
849 strcpy(p, preamble);
850 strcpy(p + psize, start);
851 matches[mcount++] = p;
852
853 preamble[0] = 0;
854 *s = NL;
855 return (s);
856 }
857
flushmatches()858 flushmatches()
859 {
860 int n;
861
862 flushflag = 1;
863 for (n = 0; n < mcount; n++)
864 printf("%s\n", matches[n]);
865 mcount = 0;
866 }
867
oops(message)868 oops(message)
869 char *message;
870 {
871 fprintf(stderr, "%s: %s\n", progname, message);
872 exit(2);
873 }
874
kernighan(args)875 kernighan(args) /* "let others do the hard part ..." */
876 char *args[];
877 {
878 /*
879 * We may have already run grep on some of the files; remove them
880 * from the arg list we pass on. Note that we can't delete them
881 * totally because the number of file names affects the output
882 * (automatic -h).
883 */
884 /* better would be fork/exec per punted file -- jaw */
885
886 while (firstfile && optind > firstfile)
887 args[firstfile++] = _PATH_DEVNULL;
888 if (patind)
889 args[patind] = pattern;
890 (void) fflush(stdout);
891
892 if (grepflag)
893 execvp(_PATH_GREPSTD, args), oops("can't exec old 'grep'");
894 else if (fgrepflag)
895 execvp(_PATH_FGREPSTD, args), oops("can't exec old 'fgrep'");
896 else
897 execvp(_PATH_EGREPSTD, args), oops("can't exec old 'egrep'");
898 }
899