xref: /original-bsd/usr.bin/grep/egrep/egrep.c (revision 95a66346)
1 #ifndef lint
2 static char sccsid[] = "@(#)egrep.c	5.13 (Berkeley) 02/27/91";
3 #endif not lint
4 
5 /*
6      Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0
7      table as in original paper (CACM, October, 1977).  No delta1 or delta2.
8      According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of
9      minimal practical value.  However, to improve for worst case input,
10      integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J.
11      Comput., Feb. 1986) deserves consideration.
12 
13      Method: 	extract longest metacharacter-free string from expression.
14 		this is done using a side-effect from henry spencer's regcomp().
15 		use boyer-moore to match such, then pass submatching lines
16 		to either regexp() or standard 'egrep', depending on certain
17 		criteria within execstrategy() below.  [this tradeoff is due
18 		to the general slowness of the regexp() nondeterministic
19 		machine on complex expressions, as well as the startup time
20 		of standard 'egrep' on short files.]  alternatively, one may
21 		change the vendor-supplied 'egrep' automaton to include
22 		boyer-moore directly.  see accompanying writeup for discussion
23 		of kanji expression treatment.
24 
25 		late addition:  apply trickbag for fast match of simple
26 		alternations (sublinear, in common low-cardinality cases).
27 		trap fgrep into this lair.
28 
29 		gnu additions:  -f, newline as |, \< and \> [in regexec()], more
30 				comments.  inspire better dfa exec() strategy.
31 				serious testing and help with special cases.
32 
33      Algorithm amalgam summary:
34 
35 		dfa e?grep 		(aho/thompson)
36 		ndfa regexp() 		(spencer/aho)
37 		bmg			(boyer/moore/gosper)
38 		"superimposed" bmg   	(jaw)
39 		fgrep			(aho/corrasick)
40 
41 		sorry, but the knuth/morris/pratt machine, horspool's
42 		"frequentist" code, and the rabin/karp matcher, however cute,
43 		just don't cut it for this production.
44 
45      James A. Woods				Copyright (c) 1986
46      NASA Ames Research Center
47 */
48 #include <sys/types.h>
49 #include <sys/stat.h>
50 #include <sys/file.h>
51 #include <regexp.h>		/* must be henry spencer's version */
52 #include <stdio.h>
53 #include <ctype.h>
54 #include "pathnames.h"
55 
56 #define	MIN(A, B)	((A) > (B) ? (B) : (A))
57 
58 #ifdef	SLOWSYS
59 #define read	xread
60 #endif
61 
62 #define BUFSIZE	8192		/* make higher for cray */
63 #define PATSIZE 6000
64 #define LARGE 	BUFSIZE + PATSIZE
65 
66 #define NALT	7		/* tied to scanf() size in alternate() */
67 #define NMUSH	6		/* loosely relates to expected alt length */
68 
69 #define	FIRSTFEW	33 	/* Always do FIRSTFEW matches with regexec() */
70 #define	PUNTPERCENT	10	/* After FIRSTFEW, if PUNTPERCENT of the input
71 				 * was processed by regexp(), exec std egrep. */
72 #define NL	'\n'
73 #define	EOS	'\0'
74 #define	NONASCII	0200	/* Bit mask for Kanji non-ascii chars */
75 #define META	"\n^$.[]()?+*|\\"	/* egrep meta-characters */
76 #define SS2	'\216'		/* EUC Katakana (or Chinese2) prefix */
77 #define SS3	'\217'		/* EUC Kanji2 (or Chinese3) prefix */
78 
79 extern char *optarg;
80 extern int optind;
81 char *progname;
82 
83 int cflag, iflag, eflag, fflag, lflag, nflag;	/* SVID flags */
84 int sflag, hflag;		/* v7, v8, bsd */
85 
86 int firstflag;			/* Stop at first match */
87 int grepflag;			/* Called as "grep" */
88 int fgrepflag;			/* Called as "fgrep" */
89 int altflag;			/* Simple alternation in pattern */
90 int boyonly;			/* No regexp needed -- all simple */
91 int flushflag;
92 int grepold, egrepold, fgrepold;
93 
94 int nalt;			/* Number of alternatives */
95 int nsuccess;			/* 1 for match, 2 for error */
96 int altmin;			/* Minimum length of all the alternate
97 				 * strings */
98 int firstfile;			/* argv index of first file argument */
99 int patind;			/* argv index of pattern */
100 long nmatch;			/* Number of matches in this file */
101 long incount, counted;		/* Amount of input consumed */
102 long rxcount;			/* Bytes of input processed by regexec() */
103 int boyfound;			/* accumulated partial matches (tripped by
104 				 * FIRSTFEW) */
105 int prevmatch;			/* next three lines aid fast -n */
106 long nline, prevnline;
107 char *prevloc;
108 
109 regexp *rspencer;
110 char *pattern;
111 char *patboy;			/* Pattern for simple Boyer-Moore */
112 char *patfile;			/* Filename containing pattern(s) */
113 
114 int delta0[256];		/* Boyer-Moore algorithm core */
115 char cmap[256];			/* Usually 0-255, but if -i, maps upper to
116 				 * lower case */
117 char str[BUFSIZE + 2];
118 int nleftover;
119 char linetemp[BUFSIZE];
120 char *altpat[NALT];		/* alternation component storage */
121 int altlen[NALT];
122 short altset[NMUSH + 1][256];
123 char preamble[200];		/* match prefix (filename, line no.) */
124 
125 int fd;
126 char *
127 strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc();
128 char *
129 grepxlat(), *fold(), *pfile(), *alternate(), *isolate();
130 char *gotamatch(), *kanji(), *linesave(), *submatch();
131 char **args;
132 
133 main(argc, argv)
134 	int argc;
135 	char *argv[];
136 {
137 	int c, oflag;
138 	int errflag = 0;
139 
140 	args = argv;
141 
142 	if ((progname = strrchr(argv[0], '/')) != 0)
143 		progname++;
144 	else
145 		progname = argv[0];
146 	if (strcmp(progname, "grep") == 0)
147 		grepflag++;
148 	else if (strcmp(progname, "fgrep") == 0)
149 		fgrepflag++;
150 
151 	oflag = 0;
152 	while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) {
153 		switch (c) {
154 
155 		case 'f':
156 			fflag++;
157 			patfile = optarg;
158 			continue;
159 		case 'b':
160 		case 'v':
161 			egrepold++;	/* boyer-moore of little help here */
162 			continue;
163 		case 'c':
164 			cflag++;
165 			continue;
166 		case 'e':
167 			eflag++;
168 			pattern = optarg;
169 			continue;
170 		case 'h':
171 			hflag++;
172 			continue;
173 		case 'o':
174 			oflag++;
175 			continue;
176 		case '1':	/* Stop at very first match */
177 			firstflag++;	/* spead freaks only */
178 			continue;
179 		case 'i':
180 			iflag++;
181 			continue;
182 		case 'l':
183 			lflag++;
184 			continue;
185 		case 'n':
186 			nflag++;
187 			continue;
188 		case 's':
189 			sflag++;
190 			continue;
191 		case 'w':
192 		case 'y':
193 			if (!grepflag)
194 				errflag++;
195 			grepold++;
196 			continue;
197 		case 'x':	/* needs more work, like -b above */
198 			if (!fgrepflag)
199 				errflag++;
200 			fgrepold++;
201 			continue;
202 		case '?':
203 			errflag++;
204 		}
205 	}
206 	if (errflag || ((argc <= optind) && !fflag && !eflag)) {
207 		if (grepflag)
208 oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]");
209 		else if (fgrepflag)
210 oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]");
211 		else		/* encourage SVID options, though we provide
212 				 * others */
213 oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]");
214 	}
215 	if (fflag)
216 		pattern = pfile(patfile);
217 	else if (!eflag) {
218 		patind = optind;
219 		pattern = argv[optind++];
220 	}
221 
222 	if (!oflag && (argc - optind) <= 1)	/* Filename invisible given < 2 files */
223 		hflag++;
224 	if (pattern[0] == EOS)
225 		kernighan(argv);	/* same as it ever was */
226 	/*
227 	 * 'grep/egrep' merger -- "old" grep is called to handle: tagged
228 	 * exprs \( \), word matches \< and \>, -w and -y options, char
229 	 * classes with '-' at end (egrep bug?), and patterns beginning with
230 	 * an asterisk (don't ask why). otherwise, characters meaningful to
231 	 * 'egrep' but not to 'grep' are escaped; the entire expr is then
232 	 * passed to 'egrep'.
233 	 */
234 	if (grepflag && !grepold) {
235 		if (strindex(pattern, "\\(") >= 0 ||
236 		    strindex(pattern, "\\<") >= 0 ||
237 		    strindex(pattern, "\\>") >= 0 ||
238 		    strindex(pattern, "-]") >= 0 ||
239 		    pattern[0] == '*')	/* grep bug */
240 			grepold++;
241 		else
242 			pattern = grepxlat(pattern);
243 	}
244 	if (grepold || egrepold || fgrepold)
245 		kernighan(argv);
246 
247 	if (iflag)
248 		strcpy(pattern, fold(pattern));
249 	/*
250 	 * If the pattern is a plain string, just run boyer-moore. If it
251 	 * consists of meta-free alternatives, run "superimposed" bmg.
252 	 * Otherwise, find best string, and compile pattern for regexec().
253 	 */
254 	if (strpbrk(pattern, META) == NULL) {	/* do boyer-moore only */
255 		boyonly++;
256 		patboy = pattern;
257 	} else {
258 		if ((patboy = alternate(pattern)) != NULL)
259 			boyonly++;
260 		else {
261 			if ((patboy = isolate(pattern)) == NULL)
262 				kernighan(argv);	/* expr too involved */
263 #ifndef NOKANJI
264 			for (c = 0; pattern[c] != EOS; c++)
265 				if (pattern[c] & NONASCII)	/* kanji + meta */
266 					kernighan(argv);
267 #endif
268 			if ((rspencer = regcomp(pattern)) == NULL)
269 				oops("regcomp failure");
270 		}
271 	}
272 	gosper(patboy);		/* "pre-conditioning is wonderful"
273 				 * -- v. strassen */
274 
275 	if ((firstfile = optind) >= argc) {
276 		/* Grep standard input */
277 		if (lflag)	/* We don't know its name! */
278 			exit(1);
279 		egsecute((char *) NULL);
280 	} else {
281 		while (optind < argc) {
282 			egsecute(argv[optind]);
283 			optind++;
284 			if (firstflag && (nsuccess == 1))
285 				break;
286 		}
287 	}
288 	exit((nsuccess == 2) ? 2 : (nsuccess == 0));
289 }
290 
291 char *
292 pfile(pfname)			/* absorb expression from file */
293 	char *pfname;
294 {
295 	int fd;
296 	struct stat patstat;
297 	static char *pat;
298 
299 	if ((fd = open(pfname, O_RDONLY, 0)) < 0)
300 		oops("can't read pattern file");
301 	if (fstat(fd, &patstat) != 0)
302 		oops("can't stat pattern file");
303 	if (patstat.st_size > PATSIZE) {
304 		if (fgrepflag) {	/* defer to unix version */
305 			fgrepold++;
306 			return "dummy";
307 		} else
308 			oops("pattern file too big");
309 	}
310 	if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL)
311 		oops("out of memory to read pattern file");
312 	if (patstat.st_size != read(fd, pat, (int)patstat.st_size))
313 		oops("error reading pattern file");
314 	(void) close(fd);
315 
316 	pat[patstat.st_size] = EOS;
317 	if (pat[patstat.st_size - 1] == NL)	/* NOP for egrep; helps grep */
318 		pat[patstat.st_size - 1] = EOS;
319 
320 	if (nlcount(pat, &pat[patstat.st_size]) > NALT) {
321 		if (fgrepflag)
322 			fgrepold++;	/* "what's it all about, alfie?" */
323 		else
324 			egrepold++;
325 	}
326 	return (pat);
327 }
328 
329 egsecute(file)
330 	char *file;
331 {
332 	extern int errno;
333 
334 	if (file == NULL)
335 		fd = 0;
336 	else if ((fd = open(file, O_RDONLY, 0)) <= 0) {
337 		fprintf(stderr,
338 		    "%s: %s: %s\n", progname, file, strerror(errno));
339 		nsuccess = 2;
340 		return;
341 	}
342 	chimaera(file, patboy);
343 
344 	if (!boyonly && !flushflag && file != NULL)
345 		flushmatches();
346 	if (file != NULL)
347 		close(fd);
348 }
349 
350 chimaera(file, pat)		/* "reach out and boyer-moore search someone" */
351 	char *file, *pat;	/* -- soon-to-be-popular bumper sticker */
352 {
353 	register char *k, *strend, *s;
354 	register int j, count;
355 	register int *deltazero = delta0;
356 	int patlen = altmin;
357 	char *t;
358 
359 	nleftover = boyfound = flushflag = 0;
360 	nline = 1L;
361 	prevmatch = 0;
362 	nmatch = counted = rxcount = 0L;
363 
364 	while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) {
365 
366 		counted += count;
367 		strend = linesave(str, count);
368 
369 		for (k = str + patlen - 1; k < strend;) {
370 			/*
371 			 * for a large class of patterns, upwards of 80% of
372 			 * match time is spent on the next line.  we beat
373 			 * existing microcode (vax 'matchc') this way.
374 			 */
375 			while ((k += deltazero[*(unsigned char *) k]) < strend);
376 			if (k < (str + LARGE))
377 				break;
378 			k -= LARGE;
379 
380 			if (altflag) {
381 				/*
382 				 * Parallel Boyer-Moore.  Check whether each
383 				 * of the previous <altmin> chars COULD be
384 				 * from one of the alternative strings.
385 				 */
386 				s = k - 1;
387 				j = altmin;
388 				while (altset[--j][(unsigned char)
389 					     cmap[*(unsigned char *) s--]]);
390 				/*
391 				 * quick test fails. in this life, compare
392 				 * 'em all.  but, a "reverse trie" would
393 				 * attenuate worst case (linear w/delta2?).
394 				 */
395 				if (--j < 0) {
396 					count = nalt - 1;
397 					do {
398 						s = k;
399 						j = altlen[count];
400 						t = altpat[count];
401 
402 						while
403 							(cmap[*(unsigned char *) s--]
404 							 == t[--j]);
405 						if (j < 0)
406 							break;
407 					}
408 					while (count--);
409 				}
410 			} else {
411 				/* One string -- check it */
412 				j = patlen - 1;
413 				s = k - 1;
414 				while (cmap[*(unsigned char *) s--] == pat[--j]);
415 			}
416 			/*
417 			 * delta-less shortcut for literati. short shrift for
418 			 * genetic engineers?
419 			 */
420 			if (j >= 0) {
421 				k++;	/* no match; restart next char */
422 				continue;
423 			}
424 			k = submatch(file, pat, str, strend, k, count);
425 			if (k == NULL)
426 				return;
427 		}
428 		if (nflag) {
429 			if (prevmatch)
430 				nline = prevnline + nlcount(prevloc, k);
431 			else
432 				nline = nline + nlcount(str, k);
433 			prevmatch = 0;
434 		}
435 		strncpy(str, linetemp, nleftover);
436 	}
437 	if (cflag) {
438 		/* Bug from old grep: -c overrides -h.  We fix the bug. */
439 		if (!hflag)
440 			printf("%s:", file);
441 		printf("%ld\n", nmatch);
442 	}
443 }
444 
445 char *
446 linesave(str, count)		/* accumulate partial line at end of buffer */
447 	char str[];
448 	register int count;
449 {
450 	register int j;
451 
452 	count += nleftover;
453 	if (count != BUFSIZE && fd != 0)
454 		str[count++] = NL;	/* insurance for broken last line */
455 	str[count] = EOS;
456 	for (j = count - 1; str[j] != NL && j >= 0;)
457 		j--;
458 	/*
459 	 * break up these lines: long line (> BUFSIZE), last line of file, or
460 	 * short return from read(), as from tee(1) input
461 	 */
462 	if (j < 0 && (count == (BUFSIZE - nleftover))) {
463 		str[count++] = NL;
464 		str[count] = EOS;
465 		linetemp[0] = EOS;
466 		nleftover = 0;
467 		return (str + count);
468 	} else {
469 		nleftover = count - j - 1;
470 		strncpy(linetemp, str + j + 1, nleftover);
471 		return (str + j);
472 	}
473 }
474 
475 /*
476  * Process partial match. First check for mis-aligned Kanji, then match line
477  * against full compiled r.e. if statistics do not warrant handing off to
478  * standard egrep.
479  */
480 char *
481 submatch(file, pat, str, strend, k, altindex)
482 	char file[], pat[], str[];
483 	register char *strend, *k;
484 	int altindex;
485 {
486 	register char *s;
487 	char *t, c;
488 
489 	t = k;
490 	s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1);
491 #ifndef NOKANJI
492 	c = ((altflag) ? altpat[altindex][0] : pat[0]);
493 	if (c & NONASCII)
494 		if ((s = kanji(str, s, k)) == NULL)
495 			return (++k);	/* reject false kanji */
496 #endif
497 	do;
498 	while (*s != NL && --s >= str);
499 	k = s + 1;		/* now at line start */
500 
501 	if (boyonly)
502 		return (gotamatch(file, k));
503 
504 	incount = counted - (strend - k);
505 	if (boyfound++ == FIRSTFEW)
506 		execstrategy(file);
507 
508 	s = t;
509 	do
510 		rxcount++;
511 	while (*s++ != NL);
512 	*--s = EOS;
513 	/*
514 	 * "quick henry -- the flit" (after theodor geisel)
515 	 */
516 	if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) {
517 		*s = NL;
518 		if (gotamatch(file, k) == NULL)
519 			return (NULL);
520 	}
521 	*s = NL;
522 	return (s + 1);
523 }
524 
525 #ifndef NOKANJI
526 /*
527  * EUC code disambiguation -- scan backwards to first 7-bit code, while
528  * counting intervening 8-bit codes.  If odd, reject unaligned Kanji pattern.
529  * SS2/3 checks are for intermixed Japanase Katakana or Kanji2.
530  */
531 char *
532 kanji(str, s, k)
533 	register char *str, *s, *k;
534 {
535 	register int j = 0;
536 
537 	for (s--; s >= str; s--) {
538 		if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0)
539 			break;
540 		j++;
541 	}
542 #ifndef CHINESE
543 	if (*s == SS2)
544 		j -= 1;
545 #endif  CHINESE
546 	return ((j & 01) ? NULL : k);
547 }
548 #endif
549 
550 /*
551  * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c]
552  */
553 gosper(pattern)
554 	char *pattern;		/* ... HAKMEM lives ... */
555 {
556 	register int i, j;
557 	unsigned char c;
558 
559 	/* Make one-string case look like simple alternatives case */
560 	if (!altflag) {
561 		nalt = 1;
562 		altmin = altlen[0] = strlen(pattern);
563 		altpat[0] = pattern;
564 	}
565 	/* For chars that aren't in any string, skip by string length. */
566 	for (j = 0; j < 256; j++) {
567 		delta0[j] = altmin;
568 		cmap[j] = j;	/* Sneak in initialization of cmap */
569 	}
570 
571 	/* For chars in a string, skip distance from char to end of string. */
572 	/* (If char appears more than once, skip minimum distance.) */
573 	for (i = 0; i < nalt; i++)
574 		for (j = 0; j < altlen[i] - 1; j++) {
575 			c = altpat[i][j];
576 			delta0[c] = MIN(delta0[c], altlen[i] - j - 1);
577 			if (iflag && islower((int) c))
578 				delta0[toupper((int) c)] = delta0[c];
579 		}
580 
581 	/* For last char of each string, fall out of search loop. */
582 	for (i = 0; i < nalt; i++) {
583 		c = altpat[i][altlen[i] - 1];
584 		delta0[c] = LARGE;
585 		if (iflag && islower((int) c))
586 			delta0[toupper((int) c)] = LARGE;
587 	}
588 	if (iflag)
589 		for (j = 'A'; j <= 'Z'; j++)
590 			cmap[j] = tolower((int) j);
591 }
592 
593 /*
594  * Print, count, or stop on full match. Result is either the location for
595  * continued search, or NULL to stop.
596  */
597 char *
598 gotamatch(file, s)
599 	register char *file, *s;
600 {
601 	char *savematch();
602 	int squirrel = 0;	/* nonzero to squirrel away FIRSTFEW matches */
603 
604 	nmatch++;
605 	nsuccess = 1;
606 	if (!boyonly && boyfound <= FIRSTFEW && file != NULL)
607 		squirrel = 1;
608 
609 	if (sflag)
610 		return (NULL);	/* -s usurps all flags (unlike some versions) */
611 	if (cflag) {		/* -c overrides -l, we guess */
612 		do;
613 		while (*s++ != NL);
614 	} else if (lflag) {
615 		puts(file);
616 		return (NULL);
617 	} else {
618 		if (!hflag)
619 			if (!squirrel)
620 				printf("%s:", file);
621 			else
622 				(void)sprintf(preamble, "%s:", file);
623 		if (nflag) {
624 			if (prevmatch)
625 				prevnline = prevnline + nlcount(prevloc, s);
626 			else
627 				prevnline = nline + nlcount(str, s);
628 			prevmatch = 1;
629 
630 			if (!squirrel)
631 				printf("%ld:", prevnline);
632 			else
633 				(void)sprintf(preamble + strlen(preamble),
634 					"%ld:", prevnline);
635 		}
636 		if (!squirrel) {
637 			do
638 				putchar(*s);
639 			while (*s++ != NL);
640 		} else
641 			s = savematch(s);
642 
643 		if (nflag)
644 			prevloc = s - 1;
645 	}
646 	return ((firstflag && !cflag) ? NULL : s);
647 }
648 
649 char *
650 fold(line)
651 	char *line;
652 {
653 	static char fline[BUFSIZE];
654 	register char *s, *t = fline;
655 
656 	for (s = line; *s != EOS; s++)
657 		*t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s);
658 	*t = EOS;
659 	return (fline);
660 }
661 
662 strindex(s, t)			/* the easy way, as in K&P, p. 192 */
663 	char *s, *t;
664 {
665 	int i, n;
666 
667 	n = strlen(t);
668 	for (i = 0; s[i] != '\0'; i++)
669 		if (strncmp(s + i, t, n) == 0)
670 			return (i);
671 	return (-1);
672 }
673 
674 char *
675 grepxlat(pattern)		/* grep pattern meta conversion */
676 	char *pattern;
677 {
678 	register char *p, *s;
679 	static char newpat[BUFSIZE];
680 
681 	for (s = newpat, p = pattern; *p != EOS;) {
682 		if (*p == '\\') {	/* skip escapes ... */
683 			*s++ = *p++;
684 			if (*p)
685 				*s++ = *p++;
686 		} else if (*p == '[') {	/* ... and char classes */
687 			while (*p != EOS && *p != ']')
688 				*s++ = *p++;
689 		} else if (strchr("+?|()", *p) != NULL) {
690 			*s++ = '\\';	/* insert protection */
691 			*s++ = *p++;
692 		} else
693 			*s++ = *p++;
694 	}
695 	*s = EOS;
696 	grepflag = ((patind) ? 0 : 1);
697 	return (newpat);
698 }
699 
700 /*
701  * Test for simple alternation.  Result is NULL if it's not so simple, or is
702  * a pointer to the first string if it is. Warning:  sscanf size is a
703  * fixpoint, beyond which the speedup linearity starts to break down.  In the
704  * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing
705  * altpat[] to arbitrary size is not useful.
706  */
707 char *
708 alternate(regexpr)
709 	char *regexpr;
710 {
711 	register int i, j;
712 	register char *start, *stop;
713 	unsigned char c;
714 
715 	if (fgrepflag && strchr(regexpr, '|'))
716 			return (NULL);
717 
718 	/*
719 	 * break pattern up into altpat array; delimit on newline, bar,
720 	 * or EOS.  We know we won't overflow, we've already checked the
721 	 * number of patterns we're going to find against NALT.
722 	 * Also, set length of pattern and find minimum pattern length.
723 	 */
724 	nalt = 0;
725 	altmin = NMUSH;
726 	for (start = stop = regexpr;; ++stop)
727 		if (!*stop || *stop == '|' || *stop == NL) {
728 			altlen[nalt] = j = stop - start;
729 			if (j < altmin)
730 				altmin = j;
731 			if (!(altpat[nalt] = malloc((u_int)(j + 1))))
732 				oops("out of memory");
733 			bcopy(start, altpat[nalt], j);
734 			altpat[nalt][j] = EOS;
735 			++nalt;
736 			if (!*stop)
737 				break;
738 			if (nalt == NALT)
739 				return(NULL);
740 			if (*stop == NL)
741 				*stop = '|';
742 			start = stop + 1;
743 		}
744 	if (!fgrepflag) {
745 		if (strchr(regexpr, '|') == NULL || regexpr[0] == '|')
746 			return (NULL);
747 		if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL
748 		    || strindex(regexpr, "||") >= 0)
749 			return (NULL);
750 	}
751 
752 	if (nalt > 1) {		/* build superimposed "pre-match" sets per
753 				 * char */
754 		altflag++;
755 		for (j = 0; j < nalt; j++)
756 			for (i = 0; i < altmin; i++) {
757 				c = altpat[j][altlen[j] - altmin + i];
758 				altset[i + 1][c] = 1;	/* offset for sentinel */
759 			}
760 	}
761 	return (altpat[0]);
762 }
763 
764 /*
765  * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to
766  * determine whether to use dfa-based egrep:  We do FIRSTFEW matches with
767  * regexec().  If Boyer-Moore up to now matched more than PUNTPERCENT
768  * of the input, the r.e. is likely to be underspecified, so do old *grep,
769  * which is faster on complex patterns than regexp().  At FIRSTFEW,
770  * dump the saved matches collected by savematch(). They are saved
771  * so that a "PUNT" can "rewind" to ignore them.  Stdin is problematic,
772  * since it's hard to rewind.
773  */
774 
775 execstrategy(file)
776 	char *file;
777 {
778 	int pctmatch;
779 
780 	pctmatch = (100 * rxcount) / incount;
781 	if (pctmatch > PUNTPERCENT && file != NULL)
782 		kernighan(args);
783 	if (file != NULL)
784 		flushmatches();
785 }
786 
787 nlcount(bstart, bstop)		/* flail interval to totalize newlines. */
788 	char *bstart, *bstop;
789 {
790 	register char *s = bstart;
791 	register char *t = bstop;
792 	register int count = 0;
793 
794 	do {			/* loop unroll for older architectures */
795 		if (*t == NL)	/* ... ask ames!jaw for sample code */
796 			count++;
797 	} while (t-- > s);
798 
799 	return (count);
800 }
801 
802 char *
803 isolate(regexpr)		/* isolate longest metacharacter-free string */
804 	char *regexpr;
805 {
806 	char *dummyexpr;
807 
808 	/*
809 	 * We add (.)* because Henry's regcomp only figures regmust if it
810 	 * sees a leading * pattern.  Foo!
811 	 */
812 	dummyexpr = malloc((unsigned) strlen(regexpr) + 5);
813 	(void)sprintf(dummyexpr, "(.)*%s", regexpr);
814 	if ((rspencer = regcomp(dummyexpr)) == NULL)
815 		kernighan(args);
816 	return (rspencer->regmust);
817 }
818 
819 char *matches[FIRSTFEW];
820 static int mcount = 0;
821 
822 char *
823 savematch(s)			/* horde matches during statistics gathering */
824 	register char *s;
825 {
826 	char *p;
827 	char *start = s;
828 	int msize = 0;
829 	int psize = strlen(preamble);
830 
831 	while (*s++ != NL)
832 		msize++;
833 	*--s = EOS;
834 
835 	p = malloc((unsigned) msize + 1 + psize);
836 	strcpy(p, preamble);
837 	strcpy(p + psize, start);
838 	matches[mcount++] = p;
839 
840 	preamble[0] = 0;
841 	*s = NL;
842 	return (s);
843 }
844 
845 flushmatches()
846 {
847 	int n;
848 
849 	flushflag = 1;
850 	for (n = 0; n < mcount; n++)
851 		printf("%s\n", matches[n]);
852 	mcount = 0;
853 }
854 
855 oops(message)
856 	char *message;
857 {
858 	fprintf(stderr, "%s: %s\n", progname, message);
859 	exit(2);
860 }
861 
862 kernighan(args)			/* "let others do the hard part ..." */
863 	char *args[];
864 {
865 	/*
866 	 * We may have already run grep on some of the files; remove them
867 	 * from the arg list we pass on.  Note that we can't delete them
868 	 * totally because the number of file names affects the output
869 	 * (automatic -h).
870 	 */
871 	/* better would be fork/exec per punted file -- jaw */
872 
873 	while (firstfile && optind > firstfile)
874 		args[firstfile++] = _PATH_DEVNULL;
875 	if (patind)
876 		args[patind] = pattern;
877 	(void) fflush(stdout);
878 
879 	if (grepflag)
880 		execvp(_PATH_GREPSTD, args), oops("can't exec old 'grep'");
881 	else if (fgrepflag)
882 		execvp(_PATH_FGREPSTD, args), oops("can't exec old 'fgrep'");
883 	else
884 		execvp(_PATH_EGREPSTD, args), oops("can't exec old 'egrep'");
885 }
886