xref: /original-bsd/usr.bin/grep/egrep/egrep.c (revision 81287ac5)
1 #ifndef lint
2 static char sccsid[] = "@(#)egrep.c	5.9 (Berkeley) 10/22/87";
3 #endif not lint
4 
5 /*
6      Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0
7      table as in original paper (CACM, October, 1977).  No delta1 or delta2.
8      According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of
9      minimal practical value.  However, to improve for worst case input,
10      integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J.
11      Comput., Feb. 1986) deserves consideration.
12 
13      Method: 	extract longest metacharacter-free string from expression.
14 		this is done using a side-effect from henry spencer's regcomp().
15 		use boyer-moore to match such, then pass submatching lines
16 		to either regexp() or standard 'egrep', depending on certain
17 		criteria within execstrategy() below.  [this tradeoff is due
18 		to the general slowness of the regexp() nondeterministic
19 		machine on complex expressions, as well as the startup time
20 		of standard 'egrep' on short files.]  alternatively, one may
21 		change the vendor-supplied 'egrep' automaton to include
22 		boyer-moore directly.  see accompanying writeup for discussion
23 		of kanji expression treatment.
24 
25 		late addition:  apply trickbag for fast match of simple
26 		alternations (sublinear, in common low-cardinality cases).
27 		trap fgrep into this lair.
28 
29 		gnu additions:  -f, newline as |, \< and \> [in regexec()], more
30 				comments.  inspire better dfa exec() strategy.
31 				serious testing and help with special cases.
32 
33      Algorithm amalgam summary:
34 
35 		dfa e?grep 		(aho/thompson)
36 		ndfa regexp() 		(spencer/aho)
37 		bmg			(boyer/moore/gosper)
38 		"superimposed" bmg   	(jaw)
39 		fgrep			(aho/corrasick)
40 
41 		sorry, but the knuth/morris/pratt machine, horspool's
42 		"frequentist" code, and the rabin/karp matcher, however cute,
43 		just don't cut it for this production.
44 
45      James A. Woods				Copyright (c) 1986
46      NASA Ames Research Center
47 */
48 #include <stdio.h>
49 #include <ctype.h>
50 #include <sys/types.h>
51 #include <sys/stat.h>
52 #include <sys/file.h>
53 #include <regexp.h>		/* must be henry spencer's version */
54 
55 #define	MIN(A, B)	((A) > (B) ? (B) : (A))
56 
57 #ifdef	SLOWSYS
58 #define read	xread
59 #endif
60 /*
61  * define existing [ef]?grep program locations below for use by execvp().
62  * [execlp() would be used were it not for the possibility of
63  * installation-dependent recursion.]
64  */
65 #ifndef EGREPSTD
66 #define	EGREPSTD	"/usr/bin/egrep"
67 #endif
68 #ifndef GREPSTD
69 #define	GREPSTD		"/bin/grep"
70 #endif
71 #ifndef FGREPSTD
72 #define	FGREPSTD	"/usr/bin/fgrep"
73 #endif
74 
75 #define BUFSIZE	8192		/* make higher for cray */
76 #define PATSIZE 6000
77 #define LARGE 	BUFSIZE + PATSIZE
78 
79 #define NALT	7		/* tied to scanf() size in alternate() */
80 #define NMUSH	6		/* loosely relates to expected alt length */
81 
82 #define	FIRSTFEW	33 	/* Always do FIRSTFEW matches with regexec() */
83 #define	PUNTPERCENT	10	/* After FIRSTFEW, if PUNTPERCENT of the input
84 				 * was processed by regexp(), exec std egrep. */
85 #define NL	'\n'
86 #define	EOS	'\0'
87 #define	NONASCII	0200	/* Bit mask for Kanji non-ascii chars */
88 #define META	"\n^$.[]()?+*|\\"	/* egrep meta-characters */
89 #define SS2	'\216'		/* EUC Katakana (or Chinese2) prefix */
90 #define SS3	'\217'		/* EUC Kanji2 (or Chinese3) prefix */
91 
92 extern char *optarg;
93 extern int optind;
94 char *progname;
95 
96 int cflag, iflag, eflag, fflag, lflag, nflag;	/* SVID flags */
97 int sflag, hflag;		/* v7, v8, bsd */
98 
99 int firstflag;			/* Stop at first match */
100 int grepflag;			/* Called as "grep" */
101 int fgrepflag;			/* Called as "fgrep" */
102 int altflag;			/* Simple alternation in pattern */
103 int boyonly;			/* No regexp needed -- all simple */
104 int flushflag;
105 int grepold, egrepold, fgrepold;
106 
107 int nalt;			/* Number of alternatives */
108 int nsuccess;			/* 1 for match, 2 for error */
109 int altmin;			/* Minimum length of all the alternate
110 				 * strings */
111 int firstfile;			/* argv index of first file argument */
112 int patind;			/* argv index of pattern */
113 long nmatch;			/* Number of matches in this file */
114 long incount, counted;		/* Amount of input consumed */
115 long rxcount;			/* Bytes of input processed by regexec() */
116 int boyfound;			/* accumulated partial matches (tripped by
117 				 * FIRSTFEW) */
118 int prevmatch;			/* next three lines aid fast -n */
119 long nline, prevnline;
120 char *prevloc;
121 
122 regexp *rspencer;
123 char *pattern;
124 char *patboy;			/* Pattern for simple Boyer-Moore */
125 char *patfile;			/* Filename containing pattern(s) */
126 
127 int delta0[256];		/* Boyer-Moore algorithm core */
128 char cmap[256];			/* Usually 0-255, but if -i, maps upper to
129 				 * lower case */
130 char str[BUFSIZE + 2];
131 int nleftover;
132 char linetemp[BUFSIZE];
133 char *altpat[NALT];		/* alternation component storage */
134 int altlen[NALT];
135 short altset[NMUSH + 1][256];
136 char preamble[200];		/* match prefix (filename, line no.) */
137 
138 int fd;
139 char *
140 strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc();
141 char *
142 grepxlat(), *fold(), *pfile(), *alternate(), *isolate();
143 char **args;
144 
145 main(argc, argv)
146 	int argc;
147 	char *argv[];
148 {
149 	int c, oflag;
150 	int errflag = 0;
151 
152 	args = argv;
153 
154 	if ((progname = strrchr(argv[0], '/')) != 0)
155 		progname++;
156 	else
157 		progname = argv[0];
158 	if (strcmp(progname, "grep") == 0)
159 		grepflag++;
160 	else if (strcmp(progname, "fgrep") == 0)
161 		fgrepflag++;
162 
163 	oflag = 0;
164 	while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) {
165 		switch (c) {
166 
167 		case 'f':
168 			fflag++;
169 			patfile = optarg;
170 			continue;
171 		case 'b':
172 		case 'v':
173 			egrepold++;	/* boyer-moore of little help here */
174 			continue;
175 		case 'c':
176 			cflag++;
177 			continue;
178 		case 'e':
179 			eflag++;
180 			pattern = optarg;
181 			continue;
182 		case 'h':
183 			hflag++;
184 			continue;
185 		case 'o':
186 			oflag++;
187 			continue;
188 		case '1':	/* Stop at very first match */
189 			firstflag++;	/* spead freaks only */
190 			continue;
191 		case 'i':
192 			iflag++;
193 			continue;
194 		case 'l':
195 			lflag++;
196 			continue;
197 		case 'n':
198 			nflag++;
199 			continue;
200 		case 's':
201 			sflag++;
202 			continue;
203 		case 'w':
204 		case 'y':
205 			if (!grepflag)
206 				errflag++;
207 			grepold++;
208 			continue;
209 		case 'x':	/* needs more work, like -b above */
210 			if (!fgrepflag)
211 				errflag++;
212 			fgrepold++;
213 			continue;
214 		case '?':
215 			errflag++;
216 		}
217 	}
218 	if (errflag || ((argc <= optind) && !fflag && !eflag)) {
219 		if (grepflag)
220 oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]");
221 		else if (fgrepflag)
222 oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]");
223 		else		/* encourage SVID options, though we provide
224 				 * others */
225 oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]");
226 	}
227 	patind = optind;
228 	if (fflag)
229 		pattern = pfile(patfile), patind = 0;
230 	else if (!eflag)
231 		pattern = argv[optind++];
232 
233 	if (!oflag && (argc - optind) <= 1)	/* Filename invisible given < 2 files */
234 		hflag++;
235 	if (pattern[0] == EOS)
236 		kernighan(argv);	/* same as it ever was */
237 	/*
238 	 * 'grep/egrep' merger -- "old" grep is called to handle: tagged
239 	 * exprs \( \), word matches \< and \>, -w and -y options, char
240 	 * classes with '-' at end (egrep bug?), and patterns beginning with
241 	 * an asterisk (don't ask why). otherwise, characters meaningful to
242 	 * 'egrep' but not to 'grep' are escaped; the entire expr is then
243 	 * passed to 'egrep'.
244 	 */
245 	if (grepflag && !grepold) {
246 		if (strindex(pattern, "\\(") >= 0 ||
247 		    strindex(pattern, "\\<") >= 0 ||
248 		    strindex(pattern, "\\>") >= 0 ||
249 		    strindex(pattern, "-]") >= 0 ||
250 		    pattern[0] == '*')	/* grep bug */
251 			grepold++;
252 		else
253 			pattern = grepxlat(pattern);
254 	}
255 	if (grepold || egrepold || fgrepold)
256 		kernighan(argv);
257 
258 	if (iflag)
259 		strcpy(pattern, fold(pattern));
260 	/*
261 	 * If the pattern is a plain string, just run boyer-moore. If it
262 	 * consists of meta-free alternatives, run "superimposed" bmg.
263 	 * Otherwise, find best string, and compile pattern for regexec().
264 	 */
265 	if (strpbrk(pattern, META) == NULL) {	/* do boyer-moore only */
266 		boyonly++;
267 		patboy = pattern;
268 	} else {
269 		if ((patboy = alternate(pattern)) != NULL)
270 			boyonly++;
271 		else {
272 			if ((patboy = isolate(pattern)) == NULL)
273 				kernighan(argv);	/* expr too involved */
274 #ifndef NOKANJI
275 			for (c = 0; pattern[c] != EOS; c++)
276 				if (pattern[c] & NONASCII)	/* kanji + meta */
277 					kernighan(argv);
278 #endif
279 			if ((rspencer = regcomp(pattern)) == NULL)
280 				oops("regcomp failure");
281 		}
282 	}
283 	gosper(patboy);		/* "pre-conditioning is wonderful"
284 				 * -- v. strassen */
285 
286 	if ((firstfile = optind) >= argc) {
287 		/* Grep standard input */
288 		if (lflag)	/* We don't know its name! */
289 			exit(1);
290 		egsecute((char *) NULL);
291 	} else {
292 		while (optind < argc) {
293 			egsecute(argv[optind]);
294 			optind++;
295 			if (firstflag && (nsuccess == 1))
296 				break;
297 		}
298 	}
299 	exit((nsuccess == 2) ? 2 : (nsuccess == 0));
300 }
301 
302 char *
303 pfile(pfname)			/* absorb expression from file */
304 	char *pfname;
305 {
306 	int fd;
307 	struct stat patstat;
308 	static char *pat;
309 
310 	if ((fd = open(pfname, O_RDONLY, 0)) < 0)
311 		oops("can't read pattern file");
312 	if (fstat(fd, &patstat) != 0)
313 		oops("can't stat pattern file");
314 	if (patstat.st_size > PATSIZE) {
315 		if (fgrepflag) {	/* defer to unix version */
316 			fgrepold++;
317 			return "dummy";
318 		} else
319 			oops("pattern file too big");
320 	}
321 	if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL)
322 		oops("out of memory to read pattern file");
323 	if (patstat.st_size != read(fd, pat, (int)patstat.st_size))
324 		oops("error reading pattern file");
325 	(void) close(fd);
326 
327 	pat[patstat.st_size] = EOS;
328 	if (pat[patstat.st_size - 1] == NL)	/* NOP for egrep; helps grep */
329 		pat[patstat.st_size - 1] = EOS;
330 
331 	if (nlcount(pat, &pat[patstat.st_size]) > NALT) {
332 		if (fgrepflag)
333 			fgrepold++;	/* "what's it all about, alfie?" */
334 		else
335 			egrepold++;
336 	}
337 	return (pat);
338 }
339 
340 egsecute(file)
341 	char *file;
342 {
343 	if (file == NULL)
344 		fd = 0;
345 	else if ((fd = open(file, O_RDONLY, 0)) <= 0) {
346 		fprintf(stderr, "%s: can't open %s\n", progname, file);
347 		nsuccess = 2;
348 		return;
349 	}
350 	chimaera(file, patboy);
351 
352 	if (!boyonly && !flushflag && file != NULL)
353 		flushmatches();
354 	if (file != NULL)
355 		close(fd);
356 }
357 
358 chimaera(file, pat)		/* "reach out and boyer-moore search someone" */
359 	char *file, *pat;	/* -- soon-to-be-popular bumper sticker */
360 {
361 	register char *k, *strend, *s;
362 	register int j, count;
363 	register int *deltazero = delta0;
364 	int patlen = altmin;
365 	char *t;
366 	char *gotamatch(), *kanji(), *linesave(), *submatch();
367 
368 	nleftover = boyfound = flushflag = 0;
369 	nline = 1L;
370 	prevmatch = 0;
371 	nmatch = counted = rxcount = 0L;
372 
373 	while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) {
374 
375 		counted += count;
376 		strend = linesave(str, count);
377 
378 		for (k = str + patlen - 1; k < strend;) {
379 			/*
380 			 * for a large class of patterns, upwards of 80% of
381 			 * match time is spent on the next line.  we beat
382 			 * existing microcode (vax 'matchc') this way.
383 			 */
384 			while ((k += deltazero[*(unsigned char *) k]) < strend);
385 			if (k < (str + LARGE))
386 				break;
387 			k -= LARGE;
388 
389 			if (altflag) {
390 				/*
391 				 * Parallel Boyer-Moore.  Check whether each
392 				 * of the previous <altmin> chars COULD be
393 				 * from one of the alternative strings.
394 				 */
395 				s = k - 1;
396 				j = altmin;
397 				while (altset[--j][(unsigned char)
398 					     cmap[*(unsigned char *) s--]]);
399 				/*
400 				 * quick test fails. in this life, compare
401 				 * 'em all.  but, a "reverse trie" would
402 				 * attenuate worst case (linear w/delta2?).
403 				 */
404 				if (--j < 0) {
405 					count = nalt - 1;
406 					do {
407 						s = k;
408 						j = altlen[count];
409 						t = altpat[count];
410 
411 						while
412 							(cmap[*(unsigned char *) s--]
413 							 == t[--j]);
414 						if (j < 0)
415 							break;
416 					}
417 					while (count--);
418 				}
419 			} else {
420 				/* One string -- check it */
421 				j = patlen - 1;
422 				s = k - 1;
423 				while (cmap[*(unsigned char *) s--] == pat[--j]);
424 			}
425 			/*
426 			 * delta-less shortcut for literati. short shrift for
427 			 * genetic engineers?
428 			 */
429 			if (j >= 0) {
430 				k++;	/* no match; restart next char */
431 				continue;
432 			}
433 			k = submatch(file, pat, str, strend, k, count);
434 			if (k == NULL)
435 				return;
436 		}
437 		if (nflag) {
438 			if (prevmatch)
439 				nline = prevnline + nlcount(prevloc, k);
440 			else
441 				nline = nline + nlcount(str, k);
442 			prevmatch = 0;
443 		}
444 		strncpy(str, linetemp, nleftover);
445 	}
446 	if (cflag) {
447 		/* Bug from old grep: -c overrides -h.  We fix the bug. */
448 		if (!hflag)
449 			printf("%s:", file);
450 		printf("%ld\n", nmatch);
451 	}
452 }
453 
454 char *
455 linesave(str, count)		/* accumulate partial line at end of buffer */
456 	char str[];
457 	register int count;
458 {
459 	register int j;
460 
461 	count += nleftover;
462 	if (count != BUFSIZE && fd != 0)
463 		str[count++] = NL;	/* insurance for broken last line */
464 	str[count] = EOS;
465 	for (j = count - 1; str[j] != NL && j >= 0;)
466 		j--;
467 	/*
468 	 * break up these lines: long line (> BUFSIZE), last line of file, or
469 	 * short return from read(), as from tee(1) input
470 	 */
471 	if (j < 0 && (count == (BUFSIZE - nleftover))) {
472 		str[count++] = NL;
473 		str[count] = EOS;
474 		linetemp[0] = EOS;
475 		nleftover = 0;
476 		return (str + count);
477 	} else {
478 		nleftover = count - j - 1;
479 		strncpy(linetemp, str + j + 1, nleftover);
480 		return (str + j);
481 	}
482 }
483 
484 /*
485  * Process partial match. First check for mis-aligned Kanji, then match line
486  * against full compiled r.e. if statistics do not warrant handing off to
487  * standard egrep.
488  */
489 char *
490 submatch(file, pat, str, strend, k, altindex)
491 	char file[], pat[], str[];
492 	register char *strend, *k;
493 	int altindex;
494 {
495 	register char *s;
496 	char *t, c;
497 
498 	t = k;
499 	s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1);
500 #ifndef NOKANJI
501 	c = ((altflag) ? altpat[altindex][0] : pat[0]);
502 	if (c & NONASCII)
503 		if ((s = kanji(str, s, k)) == NULL)
504 			return (++k);	/* reject false kanji */
505 #endif
506 	do;
507 	while (*s != NL && --s >= str);
508 	k = s + 1;		/* now at line start */
509 
510 	if (boyonly)
511 		return (gotamatch(file, k));
512 
513 	incount = counted - (strend - k);
514 	if (boyfound++ == FIRSTFEW)
515 		execstrategy(file);
516 
517 	s = t;
518 	do
519 		rxcount++;
520 	while (*s++ != NL);
521 	*--s = EOS;
522 	/*
523 	 * "quick henry -- the flit" (after theodor geisel)
524 	 */
525 	if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) {
526 		*s = NL;
527 		if (gotamatch(file, k) == NULL)
528 			return (NULL);
529 	}
530 	*s = NL;
531 	return (s + 1);
532 }
533 
534 #ifndef NOKANJI
535 /*
536  * EUC code disambiguation -- scan backwards to first 7-bit code, while
537  * counting intervening 8-bit codes.  If odd, reject unaligned Kanji pattern.
538  * SS2/3 checks are for intermixed Japanase Katakana or Kanji2.
539  */
540 char *
541 kanji(str, s, k)
542 	register char *str, *s, *k;
543 {
544 	register int j = 0;
545 
546 	for (s--; s >= str; s--) {
547 		if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0)
548 			break;
549 		j++;
550 	}
551 #ifndef CHINESE
552 	if (*s == SS2)
553 		j -= 1;
554 #endif  CHINESE
555 	return ((j & 01) ? NULL : k);
556 }
557 #endif
558 
559 /*
560  * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c]
561  */
562 gosper(pattern)
563 	char *pattern;		/* ... HAKMEM lives ... */
564 {
565 	register int i, j;
566 	unsigned char c;
567 
568 	/* Make one-string case look like simple alternatives case */
569 	if (!altflag) {
570 		nalt = 1;
571 		altmin = altlen[0] = strlen(pattern);
572 		altpat[0] = pattern;
573 	}
574 	/* For chars that aren't in any string, skip by string length. */
575 	for (j = 0; j < 256; j++) {
576 		delta0[j] = altmin;
577 		cmap[j] = j;	/* Sneak in initialization of cmap */
578 	}
579 
580 	/* For chars in a string, skip distance from char to end of string. */
581 	/* (If char appears more than once, skip minimum distance.) */
582 	for (i = 0; i < nalt; i++)
583 		for (j = 0; j < altlen[i] - 1; j++) {
584 			c = altpat[i][j];
585 			delta0[c] = MIN(delta0[c], altlen[i] - j - 1);
586 			if (iflag && islower((int) c))
587 				delta0[toupper((int) c)] = delta0[c];
588 		}
589 
590 	/* For last char of each string, fall out of search loop. */
591 	for (i = 0; i < nalt; i++) {
592 		c = altpat[i][altlen[i] - 1];
593 		delta0[c] = LARGE;
594 		if (iflag && islower((int) c))
595 			delta0[toupper((int) c)] = LARGE;
596 	}
597 	if (iflag)
598 		for (j = 'A'; j <= 'Z'; j++)
599 			cmap[j] = tolower((int) j);
600 }
601 
602 /*
603  * Print, count, or stop on full match. Result is either the location for
604  * continued search, or NULL to stop.
605  */
606 char *
607 gotamatch(file, s)
608 	register char *file, *s;
609 {
610 	char *savematch();
611 	int squirrel = 0;	/* nonzero to squirrel away FIRSTFEW matches */
612 
613 	nmatch++;
614 	nsuccess = 1;
615 	if (!boyonly && boyfound <= FIRSTFEW && file != NULL)
616 		squirrel = 1;
617 
618 	if (sflag)
619 		return (NULL);	/* -s usurps all flags (unlike some versions) */
620 	if (cflag) {		/* -c overrides -l, we guess */
621 		do;
622 		while (*s++ != NL);
623 	} else if (lflag) {
624 		puts(file);
625 		return (NULL);
626 	} else {
627 		if (!hflag)
628 			if (!squirrel)
629 				printf("%s:", file);
630 			else
631 				(void)sprintf(preamble, "%s:", file);
632 		if (nflag) {
633 			if (prevmatch)
634 				prevnline = prevnline + nlcount(prevloc, s);
635 			else
636 				prevnline = nline + nlcount(str, s);
637 			prevmatch = 1;
638 
639 			if (!squirrel)
640 				printf("%ld:", prevnline);
641 			else
642 				(void)sprintf(preamble + strlen(preamble),
643 					"%ld:", prevnline);
644 		}
645 		if (!squirrel) {
646 			do
647 				putchar(*s);
648 			while (*s++ != NL);
649 		} else
650 			s = savematch(s);
651 
652 		if (nflag)
653 			prevloc = s - 1;
654 	}
655 	return ((firstflag && !cflag) ? NULL : s);
656 }
657 
658 char *
659 fold(line)
660 	char *line;
661 {
662 	static char fline[BUFSIZE];
663 	register char *s, *t = fline;
664 
665 	for (s = line; *s != EOS; s++)
666 		*t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s);
667 	*t = EOS;
668 	return (fline);
669 }
670 
671 strindex(s, t)			/* the easy way, as in K&P, p. 192 */
672 	char *s, *t;
673 {
674 	int i, n;
675 
676 	n = strlen(t);
677 	for (i = 0; s[i] != '\0'; i++)
678 		if (strncmp(s + i, t, n) == 0)
679 			return (i);
680 	return (-1);
681 }
682 
683 char *
684 grepxlat(pattern)		/* grep pattern meta conversion */
685 	char *pattern;
686 {
687 	register char *p, *s;
688 	static char newpat[BUFSIZE];
689 
690 	for (s = newpat, p = pattern; *p != EOS;) {
691 		if (*p == '\\') {	/* skip escapes ... */
692 			*s++ = *p++;
693 			if (*p)
694 				*s++ = *p++;
695 		} else if (*p == '[') {	/* ... and char classes */
696 			while (*p != EOS && *p != ']')
697 				*s++ = *p++;
698 		} else if (strchr("+?|()", *p) != NULL) {
699 			*s++ = '\\';	/* insert protection */
700 			*s++ = *p++;
701 		} else
702 			*s++ = *p++;
703 	}
704 	*s = EOS;
705 	grepflag = ((patind) ? 0 : 1);
706 	return (newpat);
707 }
708 
709 /*
710  * Test for simple alternation.  Result is NULL if it's not so simple, or is
711  * a pointer to the first string if it is. Warning:  sscanf size is a
712  * fixpoint, beyond which the speedup linearity starts to break down.  In the
713  * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing
714  * altpat[] to arbitrary size is not useful.
715  */
716 char *
717 alternate(regexpr)
718 	char *regexpr;
719 {
720 	register int i, j;
721 	register char *start, *stop;
722 	unsigned char c;
723 
724 	if (fgrepflag && strchr(regexpr, '|'))
725 			return (NULL);
726 
727 	/*
728 	 * break pattern up into altpat array; delimit on newline, bar,
729 	 * or EOS.  We know we won't overflow, we've already checked the
730 	 * number of patterns we're going to find against NALT.
731 	 * Also, set length of pattern and find minimum pattern length.
732 	 */
733 	nalt = 0;
734 	altmin = NMUSH;
735 	for (start = stop = regexpr;; ++stop)
736 		if (!*stop || *stop == '|' || *stop == NL) {
737 			altlen[nalt] = j = stop - start;
738 			if (j < altmin)
739 				altmin = j;
740 			if (!(altpat[nalt] = malloc((u_int)(j + 1))))
741 				oops("out of memory");
742 			bcopy(start, altpat[nalt], j);
743 			altpat[nalt][j] = EOS;
744 			++nalt;
745 			if (!*stop)
746 				break;
747 			if (nalt == NALT)
748 				return(NULL);
749 			if (*stop == NL)
750 				*stop = '|';
751 			start = stop + 1;
752 		}
753 	if (!fgrepflag) {
754 		if (strchr(regexpr, '|') == NULL || regexpr[0] == '|')
755 			return (NULL);
756 		if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL
757 		    || strindex(regexpr, "||") >= 0)
758 			return (NULL);
759 	}
760 
761 	if (nalt > 1) {		/* build superimposed "pre-match" sets per
762 				 * char */
763 		altflag++;
764 		for (j = 0; j < nalt; j++)
765 			for (i = 0; i < altmin; i++) {
766 				c = altpat[j][altlen[j] - altmin + i];
767 				altset[i + 1][c] = 1;	/* offset for sentinel */
768 			}
769 	}
770 	return (altpat[0]);
771 }
772 
773 /*
774  * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to
775  * determine whether to use dfa-based egrep:  We do FIRSTFEW matches with
776  * regexec().  If Boyer-Moore up to now matched more than PUNTPERCENT
777  * of the input, the r.e. is likely to be underspecified, so do old *grep,
778  * which is faster on complex patterns than regexp().  At FIRSTFEW,
779  * dump the saved matches collected by savematch(). They are saved
780  * so that a "PUNT" can "rewind" to ignore them.  Stdin is problematic,
781  * since it's hard to rewind.
782  */
783 
784 execstrategy(file)
785 	char *file;
786 {
787 	int pctmatch;
788 
789 	pctmatch = (100 * rxcount) / incount;
790 	if (pctmatch > PUNTPERCENT && file != NULL)
791 		kernighan(args);
792 	if (file != NULL)
793 		flushmatches();
794 }
795 
796 nlcount(bstart, bstop)		/* flail interval to totalize newlines. */
797 	char *bstart, *bstop;
798 {
799 	register char *s = bstart;
800 	register char *t = bstop;
801 	register int count = 0;
802 
803 	do {			/* loop unroll for older architectures */
804 		if (*t == NL)	/* ... ask ames!jaw for sample code */
805 			count++;
806 	} while (t-- > s);
807 
808 	return (count);
809 }
810 
811 char *
812 isolate(regexpr)		/* isolate longest metacharacter-free string */
813 	char *regexpr;
814 {
815 	char *dummyexpr;
816 
817 	/*
818 	 * We add (.)* because Henry's regcomp only figures regmust if it
819 	 * sees a leading * pattern.  Foo!
820 	 */
821 	dummyexpr = malloc((unsigned) strlen(regexpr) + 5);
822 	(void)sprintf(dummyexpr, "(.)*%s", regexpr);
823 	if ((rspencer = regcomp(dummyexpr)) == NULL)
824 		kernighan(args);
825 	return (rspencer->regmust);
826 }
827 
828 char *matches[FIRSTFEW];
829 static int mcount = 0;
830 
831 char *
832 savematch(s)			/* horde matches during statistics gathering */
833 	register char *s;
834 {
835 	char *p;
836 	char *start = s;
837 	int msize = 0;
838 	int psize = strlen(preamble);
839 
840 	while (*s++ != NL)
841 		msize++;
842 	*--s = EOS;
843 
844 	p = malloc((unsigned) msize + 1 + psize);
845 	strcpy(p, preamble);
846 	strcpy(p + psize, start);
847 	matches[mcount++] = p;
848 
849 	preamble[0] = 0;
850 	*s = NL;
851 	return (s);
852 }
853 
854 flushmatches()
855 {
856 	int n;
857 
858 	flushflag = 1;
859 	for (n = 0; n < mcount; n++)
860 		printf("%s\n", matches[n]);
861 	mcount = 0;
862 }
863 
864 oops(message)
865 	char *message;
866 {
867 	fprintf(stderr, "%s: %s\n", progname, message);
868 	exit(2);
869 }
870 
871 kernighan(args)			/* "let others do the hard part ..." */
872 	char *args[];
873 {
874 	/*
875 	 * We may have already run grep on some of the files; remove them
876 	 * from the arg list we pass on.  Note that we can't delete them
877 	 * totally because the number of file names affects the output
878 	 * (automatic -h).
879 	 */
880 	/* better would be fork/exec per punted file -- jaw */
881 
882 	while (firstfile && optind > firstfile)
883 		args[firstfile++] = "/dev/null";
884 	if (patind)
885 		args[patind] = pattern;
886 	(void) fflush(stdout);
887 
888 	if (grepflag)
889 		execvp(GREPSTD, args), oops("can't exec old 'grep'");
890 	else if (fgrepflag)
891 		execvp(FGREPSTD, args), oops("can't exec old 'fgrep'");
892 	else
893 		execvp(EGREPSTD, args), oops("can't exec old 'egrep'");
894 }
895