xref: /original-bsd/usr.bin/grep/egrep/egrep.c (revision 42f60e33)
1 /*-
2  * Copyright (c) 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * %sccs.include.redist.c%
6  */
7 
8 #ifndef lint
9 static char copyright[] =
10 "@(#) Copyright (c) 1991, 1993\n\
11 	The Regents of the University of California.  All rights reserved.\n";
12 #endif /* not lint */
13 
14 #ifndef lint
15 static char sccsid[] = "@(#)egrep.c	8.1 (Berkeley) 06/06/93";
16 #endif /* not lint */
17 
18 /*
19      Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0
20      table as in original paper (CACM, October, 1977).  No delta1 or delta2.
21      According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of
22      minimal practical value.  However, to improve for worst case input,
23      integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J.
24      Comput., Feb. 1986) deserves consideration.
25 
26      Method: 	extract longest metacharacter-free string from expression.
27 		this is done using a side-effect from henry spencer's regcomp().
28 		use boyer-moore to match such, then pass submatching lines
29 		to either regexp() or standard 'egrep', depending on certain
30 		criteria within execstrategy() below.  [this tradeoff is due
31 		to the general slowness of the regexp() nondeterministic
32 		machine on complex expressions, as well as the startup time
33 		of standard 'egrep' on short files.]  alternatively, one may
34 		change the vendor-supplied 'egrep' automaton to include
35 		boyer-moore directly.  see accompanying writeup for discussion
36 		of kanji expression treatment.
37 
38 		late addition:  apply trickbag for fast match of simple
39 		alternations (sublinear, in common low-cardinality cases).
40 		trap fgrep into this lair.
41 
42 		gnu additions:  -f, newline as |, \< and \> [in regexec()], more
43 				comments.  inspire better dfa exec() strategy.
44 				serious testing and help with special cases.
45 
46      Algorithm amalgam summary:
47 
48 		dfa e?grep 		(aho/thompson)
49 		ndfa regexp() 		(spencer/aho)
50 		bmg			(boyer/moore/gosper)
51 		"superimposed" bmg   	(jaw)
52 		fgrep			(aho/corrasick)
53 
54 		sorry, but the knuth/morris/pratt machine, horspool's
55 		"frequentist" code, and the rabin/karp matcher, however cute,
56 		just don't cut it for this production.
57 
58      James A. Woods				Copyright (c) 1986
59      NASA Ames Research Center
60 */
61 #include <sys/types.h>
62 #include <sys/stat.h>
63 #include <sys/file.h>
64 #include <regexp.h>		/* must be henry spencer's version */
65 #include <stdio.h>
66 #include <ctype.h>
67 #include "pathnames.h"
68 
69 #define	MIN(A, B)	((A) > (B) ? (B) : (A))
70 
71 #ifdef	SLOWSYS
72 #define read	xread
73 #endif
74 
75 #define BUFSIZE	8192		/* make higher for cray */
76 #define PATSIZE 6000
77 #define LARGE 	BUFSIZE + PATSIZE
78 
79 #define NALT	7		/* tied to scanf() size in alternate() */
80 #define NMUSH	6		/* loosely relates to expected alt length */
81 
82 #define	FIRSTFEW	33 	/* Always do FIRSTFEW matches with regexec() */
83 #define	PUNTPERCENT	10	/* After FIRSTFEW, if PUNTPERCENT of the input
84 				 * was processed by regexp(), exec std egrep. */
85 #define NL	'\n'
86 #define	EOS	'\0'
87 #define	NONASCII	0200	/* Bit mask for Kanji non-ascii chars */
88 #define META	"\n^$.[]()?+*|\\"	/* egrep meta-characters */
89 #define SS2	'\216'		/* EUC Katakana (or Chinese2) prefix */
90 #define SS3	'\217'		/* EUC Kanji2 (or Chinese3) prefix */
91 
92 extern char *optarg;
93 extern int optind;
94 char *progname;
95 
96 int cflag, iflag, eflag, fflag, lflag, nflag;	/* SVID flags */
97 int sflag, hflag;		/* v7, v8, bsd */
98 
99 int firstflag;			/* Stop at first match */
100 int grepflag;			/* Called as "grep" */
101 int fgrepflag;			/* Called as "fgrep" */
102 int altflag;			/* Simple alternation in pattern */
103 int boyonly;			/* No regexp needed -- all simple */
104 int flushflag;
105 int grepold, egrepold, fgrepold;
106 
107 int nalt;			/* Number of alternatives */
108 int nsuccess;			/* 1 for match, 2 for error */
109 int altmin;			/* Minimum length of all the alternate
110 				 * strings */
111 int firstfile;			/* argv index of first file argument */
112 int patind;			/* argv index of pattern */
113 long nmatch;			/* Number of matches in this file */
114 long incount, counted;		/* Amount of input consumed */
115 long rxcount;			/* Bytes of input processed by regexec() */
116 int boyfound;			/* accumulated partial matches (tripped by
117 				 * FIRSTFEW) */
118 int prevmatch;			/* next three lines aid fast -n */
119 long nline, prevnline;
120 char *prevloc;
121 
122 regexp *rspencer;
123 char *pattern;
124 char *patboy;			/* Pattern for simple Boyer-Moore */
125 char *patfile;			/* Filename containing pattern(s) */
126 
127 int delta0[256];		/* Boyer-Moore algorithm core */
128 char cmap[256];			/* Usually 0-255, but if -i, maps upper to
129 				 * lower case */
130 char str[BUFSIZE + 2];
131 int nleftover;
132 char linetemp[BUFSIZE];
133 char *altpat[NALT];		/* alternation component storage */
134 int altlen[NALT];
135 short altset[NMUSH + 1][256];
136 char preamble[200];		/* match prefix (filename, line no.) */
137 
138 int fd;
139 char *
140 strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc();
141 char *
142 grepxlat(), *fold(), *pfile(), *alternate(), *isolate();
143 char *gotamatch(), *kanji(), *linesave(), *submatch();
144 char **args;
145 
146 main(argc, argv)
147 	int argc;
148 	char *argv[];
149 {
150 	int c, oflag;
151 	int errflag = 0;
152 
153 	args = argv;
154 
155 	if ((progname = strrchr(argv[0], '/')) != 0)
156 		progname++;
157 	else
158 		progname = argv[0];
159 	if (strcmp(progname, "grep") == 0)
160 		grepflag++;
161 	else if (strcmp(progname, "fgrep") == 0)
162 		fgrepflag++;
163 
164 	oflag = 0;
165 	while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) {
166 		switch (c) {
167 
168 		case 'f':
169 			fflag++;
170 			patfile = optarg;
171 			continue;
172 		case 'b':
173 		case 'v':
174 			egrepold++;	/* boyer-moore of little help here */
175 			continue;
176 		case 'c':
177 			cflag++;
178 			continue;
179 		case 'e':
180 			eflag++;
181 			pattern = optarg;
182 			continue;
183 		case 'h':
184 			hflag++;
185 			continue;
186 		case 'o':
187 			oflag++;
188 			continue;
189 		case '1':	/* Stop at very first match */
190 			firstflag++;	/* spead freaks only */
191 			continue;
192 		case 'i':
193 			iflag++;
194 			continue;
195 		case 'l':
196 			lflag++;
197 			continue;
198 		case 'n':
199 			nflag++;
200 			continue;
201 		case 's':
202 			sflag++;
203 			continue;
204 		case 'w':
205 		case 'y':
206 			if (!grepflag)
207 				errflag++;
208 			grepold++;
209 			continue;
210 		case 'x':	/* needs more work, like -b above */
211 			if (!fgrepflag)
212 				errflag++;
213 			fgrepold++;
214 			continue;
215 		case '?':
216 			errflag++;
217 		}
218 	}
219 	if (errflag || ((argc <= optind) && !fflag && !eflag)) {
220 		if (grepflag)
221 oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]");
222 		else if (fgrepflag)
223 oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]");
224 		else		/* encourage SVID options, though we provide
225 				 * others */
226 oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]");
227 	}
228 	if (fflag)
229 		pattern = pfile(patfile);
230 	else if (!eflag) {
231 		patind = optind;
232 		pattern = argv[optind++];
233 	}
234 
235 	if (!oflag && (argc - optind) <= 1)	/* Filename invisible given < 2 files */
236 		hflag++;
237 	if (pattern[0] == EOS)
238 		kernighan(argv);	/* same as it ever was */
239 	/*
240 	 * 'grep/egrep' merger -- "old" grep is called to handle: tagged
241 	 * exprs \( \), word matches \< and \>, -w and -y options, char
242 	 * classes with '-' at end (egrep bug?), and patterns beginning with
243 	 * an asterisk (don't ask why). otherwise, characters meaningful to
244 	 * 'egrep' but not to 'grep' are escaped; the entire expr is then
245 	 * passed to 'egrep'.
246 	 */
247 	if (grepflag && !grepold) {
248 		if (strindex(pattern, "\\(") >= 0 ||
249 		    strindex(pattern, "\\<") >= 0 ||
250 		    strindex(pattern, "\\>") >= 0 ||
251 		    strindex(pattern, "-]") >= 0 ||
252 		    pattern[0] == '*')	/* grep bug */
253 			grepold++;
254 		else
255 			pattern = grepxlat(pattern);
256 	}
257 	if (grepold || egrepold || fgrepold)
258 		kernighan(argv);
259 
260 	if (iflag)
261 		strcpy(pattern, fold(pattern));
262 	/*
263 	 * If the pattern is a plain string, just run boyer-moore. If it
264 	 * consists of meta-free alternatives, run "superimposed" bmg.
265 	 * Otherwise, find best string, and compile pattern for regexec().
266 	 */
267 	if (strpbrk(pattern, META) == NULL) {	/* do boyer-moore only */
268 		boyonly++;
269 		patboy = pattern;
270 	} else {
271 		if ((patboy = alternate(pattern)) != NULL)
272 			boyonly++;
273 		else {
274 			if ((patboy = isolate(pattern)) == NULL)
275 				kernighan(argv);	/* expr too involved */
276 #ifndef NOKANJI
277 			for (c = 0; pattern[c] != EOS; c++)
278 				if (pattern[c] & NONASCII)	/* kanji + meta */
279 					kernighan(argv);
280 #endif
281 			if ((rspencer = regcomp(pattern)) == NULL)
282 				oops("regcomp failure");
283 		}
284 	}
285 	gosper(patboy);		/* "pre-conditioning is wonderful"
286 				 * -- v. strassen */
287 
288 	if ((firstfile = optind) >= argc) {
289 		/* Grep standard input */
290 		if (lflag)	/* We don't know its name! */
291 			exit(1);
292 		egsecute((char *) NULL);
293 	} else {
294 		while (optind < argc) {
295 			egsecute(argv[optind]);
296 			optind++;
297 			if (firstflag && (nsuccess == 1))
298 				break;
299 		}
300 	}
301 	exit((nsuccess == 2) ? 2 : (nsuccess == 0));
302 }
303 
304 char *
305 pfile(pfname)			/* absorb expression from file */
306 	char *pfname;
307 {
308 	int fd;
309 	struct stat patstat;
310 	static char *pat;
311 
312 	if ((fd = open(pfname, O_RDONLY, 0)) < 0)
313 		oops("can't read pattern file");
314 	if (fstat(fd, &patstat) != 0)
315 		oops("can't stat pattern file");
316 	if (patstat.st_size > PATSIZE) {
317 		if (fgrepflag) {	/* defer to unix version */
318 			fgrepold++;
319 			return "dummy";
320 		} else
321 			oops("pattern file too big");
322 	}
323 	if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL)
324 		oops("out of memory to read pattern file");
325 	if (patstat.st_size != read(fd, pat, (int)patstat.st_size))
326 		oops("error reading pattern file");
327 	(void) close(fd);
328 
329 	pat[patstat.st_size] = EOS;
330 	if (pat[patstat.st_size - 1] == NL)	/* NOP for egrep; helps grep */
331 		pat[patstat.st_size - 1] = EOS;
332 
333 	if (nlcount(pat, &pat[patstat.st_size]) > NALT) {
334 		if (fgrepflag)
335 			fgrepold++;	/* "what's it all about, alfie?" */
336 		else
337 			egrepold++;
338 	}
339 	return (pat);
340 }
341 
342 egsecute(file)
343 	char *file;
344 {
345 	extern int errno;
346 
347 	if (file == NULL)
348 		fd = 0;
349 	else if ((fd = open(file, O_RDONLY, 0)) <= 0) {
350 		fprintf(stderr,
351 		    "%s: %s: %s\n", progname, file, strerror(errno));
352 		nsuccess = 2;
353 		return;
354 	}
355 	chimaera(file, patboy);
356 
357 	if (!boyonly && !flushflag && file != NULL)
358 		flushmatches();
359 	if (file != NULL)
360 		close(fd);
361 }
362 
363 chimaera(file, pat)		/* "reach out and boyer-moore search someone" */
364 	char *file, *pat;	/* -- soon-to-be-popular bumper sticker */
365 {
366 	register char *k, *strend, *s;
367 	register int j, count;
368 	register int *deltazero = delta0;
369 	int patlen = altmin;
370 	char *t;
371 
372 	nleftover = boyfound = flushflag = 0;
373 	nline = 1L;
374 	prevmatch = 0;
375 	nmatch = counted = rxcount = 0L;
376 
377 	while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) {
378 
379 		counted += count;
380 		strend = linesave(str, count);
381 
382 		for (k = str + patlen - 1; k < strend;) {
383 			/*
384 			 * for a large class of patterns, upwards of 80% of
385 			 * match time is spent on the next line.  we beat
386 			 * existing microcode (vax 'matchc') this way.
387 			 */
388 			while ((k += deltazero[*(unsigned char *) k]) < strend);
389 			if (k < (str + LARGE))
390 				break;
391 			k -= LARGE;
392 
393 			if (altflag) {
394 				/*
395 				 * Parallel Boyer-Moore.  Check whether each
396 				 * of the previous <altmin> chars COULD be
397 				 * from one of the alternative strings.
398 				 */
399 				s = k - 1;
400 				j = altmin;
401 				while (altset[--j][(unsigned char)
402 					     cmap[*(unsigned char *) s--]]);
403 				/*
404 				 * quick test fails. in this life, compare
405 				 * 'em all.  but, a "reverse trie" would
406 				 * attenuate worst case (linear w/delta2?).
407 				 */
408 				if (--j < 0) {
409 					count = nalt - 1;
410 					do {
411 						s = k;
412 						j = altlen[count];
413 						t = altpat[count];
414 
415 						while
416 							(cmap[*(unsigned char *) s--]
417 							 == t[--j]);
418 						if (j < 0)
419 							break;
420 					}
421 					while (count--);
422 				}
423 			} else {
424 				/* One string -- check it */
425 				j = patlen - 1;
426 				s = k - 1;
427 				while (cmap[*(unsigned char *) s--] == pat[--j]);
428 			}
429 			/*
430 			 * delta-less shortcut for literati. short shrift for
431 			 * genetic engineers?
432 			 */
433 			if (j >= 0) {
434 				k++;	/* no match; restart next char */
435 				continue;
436 			}
437 			k = submatch(file, pat, str, strend, k, count);
438 			if (k == NULL)
439 				return;
440 		}
441 		if (nflag) {
442 			if (prevmatch)
443 				nline = prevnline + nlcount(prevloc, k);
444 			else
445 				nline = nline + nlcount(str, k);
446 			prevmatch = 0;
447 		}
448 		strncpy(str, linetemp, nleftover);
449 	}
450 	if (cflag) {
451 		/* Bug from old grep: -c overrides -h.  We fix the bug. */
452 		if (!hflag)
453 			printf("%s:", file);
454 		printf("%ld\n", nmatch);
455 	}
456 }
457 
458 char *
459 linesave(str, count)		/* accumulate partial line at end of buffer */
460 	char str[];
461 	register int count;
462 {
463 	register int j;
464 
465 	count += nleftover;
466 	if (count != BUFSIZE && fd != 0)
467 		str[count++] = NL;	/* insurance for broken last line */
468 	str[count] = EOS;
469 	for (j = count - 1; str[j] != NL && j >= 0;)
470 		j--;
471 	/*
472 	 * break up these lines: long line (> BUFSIZE), last line of file, or
473 	 * short return from read(), as from tee(1) input
474 	 */
475 	if (j < 0 && (count == (BUFSIZE - nleftover))) {
476 		str[count++] = NL;
477 		str[count] = EOS;
478 		linetemp[0] = EOS;
479 		nleftover = 0;
480 		return (str + count);
481 	} else {
482 		nleftover = count - j - 1;
483 		strncpy(linetemp, str + j + 1, nleftover);
484 		return (str + j);
485 	}
486 }
487 
488 /*
489  * Process partial match. First check for mis-aligned Kanji, then match line
490  * against full compiled r.e. if statistics do not warrant handing off to
491  * standard egrep.
492  */
493 char *
494 submatch(file, pat, str, strend, k, altindex)
495 	char file[], pat[], str[];
496 	register char *strend, *k;
497 	int altindex;
498 {
499 	register char *s;
500 	char *t, c;
501 
502 	t = k;
503 	s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1);
504 #ifndef NOKANJI
505 	c = ((altflag) ? altpat[altindex][0] : pat[0]);
506 	if (c & NONASCII)
507 		if ((s = kanji(str, s, k)) == NULL)
508 			return (++k);	/* reject false kanji */
509 #endif
510 	do;
511 	while (*s != NL && --s >= str);
512 	k = s + 1;		/* now at line start */
513 
514 	if (boyonly)
515 		return (gotamatch(file, k));
516 
517 	incount = counted - (strend - k);
518 	if (boyfound++ == FIRSTFEW)
519 		execstrategy(file);
520 
521 	s = t;
522 	do
523 		rxcount++;
524 	while (*s++ != NL);
525 	*--s = EOS;
526 	/*
527 	 * "quick henry -- the flit" (after theodor geisel)
528 	 */
529 	if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) {
530 		*s = NL;
531 		if (gotamatch(file, k) == NULL)
532 			return (NULL);
533 	}
534 	*s = NL;
535 	return (s + 1);
536 }
537 
538 #ifndef NOKANJI
539 /*
540  * EUC code disambiguation -- scan backwards to first 7-bit code, while
541  * counting intervening 8-bit codes.  If odd, reject unaligned Kanji pattern.
542  * SS2/3 checks are for intermixed Japanase Katakana or Kanji2.
543  */
544 char *
545 kanji(str, s, k)
546 	register char *str, *s, *k;
547 {
548 	register int j = 0;
549 
550 	for (s--; s >= str; s--) {
551 		if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0)
552 			break;
553 		j++;
554 	}
555 #ifndef CHINESE
556 	if (*s == SS2)
557 		j -= 1;
558 #endif  CHINESE
559 	return ((j & 01) ? NULL : k);
560 }
561 #endif
562 
563 /*
564  * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c]
565  */
566 gosper(pattern)
567 	char *pattern;		/* ... HAKMEM lives ... */
568 {
569 	register int i, j;
570 	unsigned char c;
571 
572 	/* Make one-string case look like simple alternatives case */
573 	if (!altflag) {
574 		nalt = 1;
575 		altmin = altlen[0] = strlen(pattern);
576 		altpat[0] = pattern;
577 	}
578 	/* For chars that aren't in any string, skip by string length. */
579 	for (j = 0; j < 256; j++) {
580 		delta0[j] = altmin;
581 		cmap[j] = j;	/* Sneak in initialization of cmap */
582 	}
583 
584 	/* For chars in a string, skip distance from char to end of string. */
585 	/* (If char appears more than once, skip minimum distance.) */
586 	for (i = 0; i < nalt; i++)
587 		for (j = 0; j < altlen[i] - 1; j++) {
588 			c = altpat[i][j];
589 			delta0[c] = MIN(delta0[c], altlen[i] - j - 1);
590 			if (iflag && islower((int) c))
591 				delta0[toupper((int) c)] = delta0[c];
592 		}
593 
594 	/* For last char of each string, fall out of search loop. */
595 	for (i = 0; i < nalt; i++) {
596 		c = altpat[i][altlen[i] - 1];
597 		delta0[c] = LARGE;
598 		if (iflag && islower((int) c))
599 			delta0[toupper((int) c)] = LARGE;
600 	}
601 	if (iflag)
602 		for (j = 'A'; j <= 'Z'; j++)
603 			cmap[j] = tolower((int) j);
604 }
605 
606 /*
607  * Print, count, or stop on full match. Result is either the location for
608  * continued search, or NULL to stop.
609  */
610 char *
611 gotamatch(file, s)
612 	register char *file, *s;
613 {
614 	char *savematch();
615 	int squirrel = 0;	/* nonzero to squirrel away FIRSTFEW matches */
616 
617 	nmatch++;
618 	nsuccess = 1;
619 	if (!boyonly && boyfound <= FIRSTFEW && file != NULL)
620 		squirrel = 1;
621 
622 	if (sflag)
623 		return (NULL);	/* -s usurps all flags (unlike some versions) */
624 	if (cflag) {		/* -c overrides -l, we guess */
625 		do;
626 		while (*s++ != NL);
627 	} else if (lflag) {
628 		puts(file);
629 		return (NULL);
630 	} else {
631 		if (!hflag)
632 			if (!squirrel)
633 				printf("%s:", file);
634 			else
635 				(void)sprintf(preamble, "%s:", file);
636 		if (nflag) {
637 			if (prevmatch)
638 				prevnline = prevnline + nlcount(prevloc, s);
639 			else
640 				prevnline = nline + nlcount(str, s);
641 			prevmatch = 1;
642 
643 			if (!squirrel)
644 				printf("%ld:", prevnline);
645 			else
646 				(void)sprintf(preamble + strlen(preamble),
647 					"%ld:", prevnline);
648 		}
649 		if (!squirrel) {
650 			do
651 				putchar(*s);
652 			while (*s++ != NL);
653 		} else
654 			s = savematch(s);
655 
656 		if (nflag)
657 			prevloc = s - 1;
658 	}
659 	return ((firstflag && !cflag) ? NULL : s);
660 }
661 
662 char *
663 fold(line)
664 	char *line;
665 {
666 	static char fline[BUFSIZE];
667 	register char *s, *t = fline;
668 
669 	for (s = line; *s != EOS; s++)
670 		*t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s);
671 	*t = EOS;
672 	return (fline);
673 }
674 
675 strindex(s, t)			/* the easy way, as in K&P, p. 192 */
676 	char *s, *t;
677 {
678 	int i, n;
679 
680 	n = strlen(t);
681 	for (i = 0; s[i] != '\0'; i++)
682 		if (strncmp(s + i, t, n) == 0)
683 			return (i);
684 	return (-1);
685 }
686 
687 char *
688 grepxlat(pattern)		/* grep pattern meta conversion */
689 	char *pattern;
690 {
691 	register char *p, *s;
692 	static char newpat[BUFSIZE];
693 
694 	for (s = newpat, p = pattern; *p != EOS;) {
695 		if (*p == '\\') {	/* skip escapes ... */
696 			*s++ = *p++;
697 			if (*p)
698 				*s++ = *p++;
699 		} else if (*p == '[') {	/* ... and char classes */
700 			while (*p != EOS && *p != ']')
701 				*s++ = *p++;
702 		} else if (strchr("+?|()", *p) != NULL) {
703 			*s++ = '\\';	/* insert protection */
704 			*s++ = *p++;
705 		} else
706 			*s++ = *p++;
707 	}
708 	*s = EOS;
709 	grepflag = ((patind) ? 0 : 1);
710 	return (newpat);
711 }
712 
713 /*
714  * Test for simple alternation.  Result is NULL if it's not so simple, or is
715  * a pointer to the first string if it is. Warning:  sscanf size is a
716  * fixpoint, beyond which the speedup linearity starts to break down.  In the
717  * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing
718  * altpat[] to arbitrary size is not useful.
719  */
720 char *
721 alternate(regexpr)
722 	char *regexpr;
723 {
724 	register int i, j;
725 	register char *start, *stop;
726 	unsigned char c;
727 
728 	if (fgrepflag && strchr(regexpr, '|'))
729 			return (NULL);
730 
731 	/*
732 	 * break pattern up into altpat array; delimit on newline, bar,
733 	 * or EOS.  We know we won't overflow, we've already checked the
734 	 * number of patterns we're going to find against NALT.
735 	 * Also, set length of pattern and find minimum pattern length.
736 	 */
737 	nalt = 0;
738 	altmin = NMUSH;
739 	for (start = stop = regexpr;; ++stop)
740 		if (!*stop || *stop == '|' || *stop == NL) {
741 			altlen[nalt] = j = stop - start;
742 			if (j < altmin)
743 				altmin = j;
744 			if (!(altpat[nalt] = malloc((u_int)(j + 1))))
745 				oops("out of memory");
746 			bcopy(start, altpat[nalt], j);
747 			altpat[nalt][j] = EOS;
748 			++nalt;
749 			if (!*stop)
750 				break;
751 			if (nalt == NALT)
752 				return(NULL);
753 			if (*stop == NL)
754 				*stop = '|';
755 			start = stop + 1;
756 		}
757 	if (!fgrepflag) {
758 		if (strchr(regexpr, '|') == NULL || regexpr[0] == '|')
759 			return (NULL);
760 		if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL
761 		    || strindex(regexpr, "||") >= 0)
762 			return (NULL);
763 	}
764 
765 	if (nalt > 1) {		/* build superimposed "pre-match" sets per
766 				 * char */
767 		altflag++;
768 		for (j = 0; j < nalt; j++)
769 			for (i = 0; i < altmin; i++) {
770 				c = altpat[j][altlen[j] - altmin + i];
771 				altset[i + 1][c] = 1;	/* offset for sentinel */
772 			}
773 	}
774 	return (altpat[0]);
775 }
776 
777 /*
778  * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to
779  * determine whether to use dfa-based egrep:  We do FIRSTFEW matches with
780  * regexec().  If Boyer-Moore up to now matched more than PUNTPERCENT
781  * of the input, the r.e. is likely to be underspecified, so do old *grep,
782  * which is faster on complex patterns than regexp().  At FIRSTFEW,
783  * dump the saved matches collected by savematch(). They are saved
784  * so that a "PUNT" can "rewind" to ignore them.  Stdin is problematic,
785  * since it's hard to rewind.
786  */
787 
788 execstrategy(file)
789 	char *file;
790 {
791 	int pctmatch;
792 
793 	pctmatch = (100 * rxcount) / incount;
794 	if (pctmatch > PUNTPERCENT && file != NULL)
795 		kernighan(args);
796 	if (file != NULL)
797 		flushmatches();
798 }
799 
800 nlcount(bstart, bstop)		/* flail interval to totalize newlines. */
801 	char *bstart, *bstop;
802 {
803 	register char *s = bstart;
804 	register char *t = bstop;
805 	register int count = 0;
806 
807 	do {			/* loop unroll for older architectures */
808 		if (*t == NL)	/* ... ask ames!jaw for sample code */
809 			count++;
810 	} while (t-- > s);
811 
812 	return (count);
813 }
814 
815 char *
816 isolate(regexpr)		/* isolate longest metacharacter-free string */
817 	char *regexpr;
818 {
819 	char *dummyexpr;
820 
821 	/*
822 	 * We add (.)* because Henry's regcomp only figures regmust if it
823 	 * sees a leading * pattern.  Foo!
824 	 */
825 	dummyexpr = malloc((unsigned) strlen(regexpr) + 5);
826 	(void)sprintf(dummyexpr, "(.)*%s", regexpr);
827 	if ((rspencer = regcomp(dummyexpr)) == NULL)
828 		kernighan(args);
829 	return (rspencer->regmust);
830 }
831 
832 char *matches[FIRSTFEW];
833 static int mcount = 0;
834 
835 char *
836 savematch(s)			/* horde matches during statistics gathering */
837 	register char *s;
838 {
839 	char *p;
840 	char *start = s;
841 	int msize = 0;
842 	int psize = strlen(preamble);
843 
844 	while (*s++ != NL)
845 		msize++;
846 	*--s = EOS;
847 
848 	p = malloc((unsigned) msize + 1 + psize);
849 	strcpy(p, preamble);
850 	strcpy(p + psize, start);
851 	matches[mcount++] = p;
852 
853 	preamble[0] = 0;
854 	*s = NL;
855 	return (s);
856 }
857 
858 flushmatches()
859 {
860 	int n;
861 
862 	flushflag = 1;
863 	for (n = 0; n < mcount; n++)
864 		printf("%s\n", matches[n]);
865 	mcount = 0;
866 }
867 
868 oops(message)
869 	char *message;
870 {
871 	fprintf(stderr, "%s: %s\n", progname, message);
872 	exit(2);
873 }
874 
875 kernighan(args)			/* "let others do the hard part ..." */
876 	char *args[];
877 {
878 	/*
879 	 * We may have already run grep on some of the files; remove them
880 	 * from the arg list we pass on.  Note that we can't delete them
881 	 * totally because the number of file names affects the output
882 	 * (automatic -h).
883 	 */
884 	/* better would be fork/exec per punted file -- jaw */
885 
886 	while (firstfile && optind > firstfile)
887 		args[firstfile++] = _PATH_DEVNULL;
888 	if (patind)
889 		args[patind] = pattern;
890 	(void) fflush(stdout);
891 
892 	if (grepflag)
893 		execvp(_PATH_GREPSTD, args), oops("can't exec old 'grep'");
894 	else if (fgrepflag)
895 		execvp(_PATH_FGREPSTD, args), oops("can't exec old 'fgrep'");
896 	else
897 		execvp(_PATH_EGREPSTD, args), oops("can't exec old 'egrep'");
898 }
899