xref: /openbsd/usr.bin/grep/grep.c (revision d485f761)
1 /*	$OpenBSD: grep.c,v 1.1 2001/09/21 23:12:00 deraadt Exp $	*/
2 
3 /*-
4  * Copyright (c) 2000 Carson Harding. All rights reserved.
5  * This code was written and contributed to OpenBSD by Carson Harding.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the author, or the names of contributors may be
16  *    used to endorse or promote products derived from this software without
17  *    specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #ifndef lint
33 static char rcsid[] = "$OpenBSD: grep.c,v 1.1 2001/09/21 23:12:00 deraadt Exp $";
34 #endif /* not lint */
35 
36 #include <sys/types.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <unistd.h>
40 #include <regex.h>
41 #include <string.h>
42 #include <ctype.h>
43 #include <sys/param.h>
44 #include <fts.h>
45 #include <err.h>
46 
47 extern	char *__progname;
48 
49 
50 void	usage(void);
51 void	err_regerror(int r, regex_t *rexp);
52 int	grep_files(int regexc, regex_t *regexv, char **files);
53 int	grep_tree(int regexc, regex_t *regexv, char **paths);
54 int	grep_file(int regexc, regex_t *rexp, char *fname);
55 void	arg_patt(char *s);
56 char	*chop_patt(char *s, size_t *len);
57 void	add_patt(char *s, size_t len);
58 void	load_patt(char *fname);
59 regex_t *regcomp_patt(int pattc, char *pattvp[], int cflags);
60 
61 
62 int	f_bytecount;		/* -b prepend byte count */
63 int	f_countonly;		/* -c return only count */
64 int	f_nofname;		/* -h do not prepend filenames on multiple */
65 int	f_fnameonly;		/* -l only print file name with match */
66 int	f_suppress;		/* -s suppress error messages; 1/2 -q */
67 int	f_lineno;		/* -n prepend with line numbers */
68 int	f_quiet;		/* -q no output, only status */
69 int	f_wmatch;		/* -w match words */
70 int	f_xmatch;		/* -x match line */
71 int	f_zerobyte;		/* -z NUL character after filename with -l */
72 int	f_match;		/* = REG_MATCH; else = REG_NOMATCH for -v */
73 int	f_multifile;		/* multiple files: prepend file names */
74 int	f_matchall;		/* empty pattern, matches all input */
75 int	f_error;		/* saw error; set exit status */
76 
77 				/* default traversal flags */
78 int	f_ftsflags = FTS_LOGICAL|FTS_NOCHDIR|FTS_NOSTAT;
79 
80 int	f_debug;		/* temporary debugging flag */
81 
82 #define START_PATT_SZ	 8	/* start with room for 8 patterns */
83 char	**pattv;		/* array of patterns from -e and -f */
84 int	pattc;			/* patterns in pattern array */
85 int	pattn;			/* patterns we have seen, including nulls */
86 
87 int
88 main(int argc, char **argv)
89 {
90 	int	c;
91 	int	ch;
92 	int	cflags;		/* flags to regcomp() */
93 	int	sawfile;	/* did we see a pattern file? */
94 	regex_t *regexv;	/* start of array of compiled patterns */
95 
96 	int (*grepf)(int regexc, regex_t *regexv, char **argv);
97 
98 	sawfile = 0;
99 	cflags	= REG_BASIC|REG_NEWLINE;
100 	grepf	= grep_files;
101 
102 	if (*__progname == 'e')
103 		cflags |= REG_EXTENDED;
104 	else if (*__progname == 'f')
105 		cflags |= REG_NOSPEC;
106 
107 	while ((ch = getopt(argc, argv, "DEFRHLPXabce:f:hilnqsvwxz")) != -1) {
108 		switch(ch) {
109 		case 'D':
110 			f_debug = 1;
111 			break;
112 		case 'E':
113 			cflags |= REG_EXTENDED;
114 			break;
115 		case 'F':
116 			cflags |= REG_NOSPEC;
117 			break;
118 		case 'H':
119 			f_ftsflags |= FTS_COMFOLLOW;
120 			break;
121 		case 'L':
122 			f_ftsflags |= FTS_LOGICAL;
123 			break;
124 		case 'P':
125 			f_ftsflags |= FTS_PHYSICAL;
126 			break;
127 		case 'R':
128 			grepf = grep_tree;
129 			/*
130 			 * If walking the tree we don't know how many files
131 			 * we'll actually find. So assume multiple, if
132 			 * you don't want names, there's always -h ....
133 			 */
134 			f_multifile = 1;
135 			break;
136 		case 'X':
137 			f_ftsflags |= FTS_XDEV;
138 			break;
139 		case 'a':
140 			/*
141 			 * Silently eat -a; we don't use the default
142 			 * behaviour it toggles off in gnugrep.
143 			 */
144 			break;
145 		case 'b':
146 			f_bytecount = 1;
147 			break;
148 		case 'c':
149 			f_countonly = 1;
150 			break;
151 		case 'e':
152 			arg_patt(optarg);
153 			break;
154 		case 'f':
155 			load_patt(optarg);
156 			sawfile = 1;
157 			break;
158 		case 'h':
159 			f_nofname = 1;
160 			break;
161 		case 'i':
162 			cflags |= REG_ICASE;
163 			break;
164 		case 'l':
165 			f_fnameonly = 1;
166 			break;
167 		case 'n':
168 			f_lineno = 1;
169 			break;
170 		case 'q':
171 			f_quiet = 1;
172 			break;
173 		case 's':
174 			f_suppress = 1;
175 			break;
176 		case 'v':
177 			f_match = REG_NOMATCH;
178 			break;
179 		case 'w':
180 			f_wmatch = 1;
181 			break;
182 		case 'x':
183 			f_xmatch = 1;
184 			break;
185 		case 'z':
186 			f_zerobyte = 1;
187 			break;
188 		default:
189 			usage();
190 			break;
191 		}
192 	}
193 
194 	if ((cflags & REG_EXTENDED) && (cflags & REG_NOSPEC))
195 		usage();
196 
197 	/*
198 	 * If we read one or more pattern files, and still
199 	 * didn't end up with any pattern, any pattern file
200 	 * we read was empty. This is different than failing
201 	 * to provide a pattern as an argument, and we fail
202 	 * on this case as if we had searched and found
203 	 * no matches. (At least this is what GNU grep and
204 	 * Solaris's grep do.)
205 	 */
206 	if (!pattn && !argv[optind]) {
207 		if (sawfile)
208 			exit(1);
209 		else usage();
210 	}
211 
212 	if (!pattn) {
213 		arg_patt(argv[optind]);
214 		optind++;
215 	}
216 
217 	/* why bother ... just do nothing sooner */
218 	if (f_matchall && f_match == REG_NOMATCH)
219 		exit(1);
220 
221 	regexv = regcomp_patt(pattc, pattv, cflags);
222 
223 	if (optind == argc) {
224 		c = grep_file(pattc, regexv, NULL);
225 	} else {
226 		if (argc - optind > 1 && !f_nofname)
227 			f_multifile = 1;
228 		c = (*grepf)(pattc, regexv, &argv[optind]);
229 	}
230 
231 	/* XX ugh */
232 	if (f_error) {
233 		if (c && f_quiet)
234 			exit(0);
235 		else
236 			exit(2);
237 	} else if (c)
238 		exit(0);
239 	else
240 		exit(1);
241 }
242 
243 void
244 usage(void)
245 {
246 	fprintf(stderr, "usage: %s [-E|-F] [-abchilnqsvwx] [-RXH[-L|-P]]"
247 	    " {patt | -e patt | -f patt_file} [files]\n",
248 	    __progname);
249 	exit(2);
250 }
251 
252 /*
253  * Patterns as arguments may have embedded newlines.
254  * When read from file, these are detected by fgetln();
255  * in arguments we have to find and cut out the segments.
256  */
257 void
258 arg_patt(char *s)
259 {
260 	size_t len;
261 	char *sp;
262 
263 	if (f_debug)
264 		fprintf(stderr, "arg_patt(\"%s\")\n", s);
265 
266 	len = strlen(s);
267 	if (!len) {		     /* got "" on the command-line */
268 		add_patt(s, len);
269 		return;
270 	}
271 	for (sp = chop_patt(s, &len); sp; sp = chop_patt(NULL, &len)) {
272 		if (f_debug) {
273 			fprintf(stderr, "adding pattern \"");
274 			fwrite(sp, len, 1, stderr);
275 			fprintf(stderr, "\", length %lu\n",(unsigned long)len);
276 			if (pattc > 20) {
277 				fprintf(stderr, "too many, exiting ...\n");
278 				exit(2);
279 			}
280 		}
281 		add_patt(sp, len);
282 	}
283 }
284 
/*
 * Split a buffer into newline-separated segments, strtok()-style:
 * pass the buffer and its length in *len on the first call, then NULL
 * for each following segment.  Returns a pointer to the start of the
 * segment and stores its length (newline excluded) in *len, or returns
 * NULL when nothing is left.  The buffer itself is not modified.
 *
 * State is kept in static variables, so only one buffer can be split
 * at a time.
 */
char *
chop_patt(char *s, size_t *len)
{
	static char	*save_s;	/* start of the next segment */
	static size_t	 save_n;	/* bytes left from the current segment on */
	char		*cp;

	if (s)
		save_n = *len;		/* new buffer: restart the count */
	else
		s = save_s;		/* continue where we left off */

	if (save_n == 0) {
		save_s = NULL;
		return NULL;
	}
	if (s == NULL)
		return NULL;

	cp = memchr(s, '\n', save_n);
	if (cp != NULL) {
		*len = (size_t)(cp - s);	/* returned segment */
		save_n -= *len + 1;		/* segment plus its newline */
		save_s = cp + 1;		/* resume past the newline */
	} else {
		*len = save_n;		/* last segment: all that is left */
		save_n = 0;
	}

	return s;
}
317 
318 /*
319  * Start with an array for 8 patterns, and double it
320  * each time we outgrow it. If pattern is empty (0 length),
321  * or if f_matchall is already set, set f_matchall and return.
322  * No use adding a pattern if all input is going to match
323  * anyhow.
324  */
325 void
326 add_patt(char *s, size_t len)
327 {
328 	char	*p;
329 	static	size_t	pattmax = START_PATT_SZ;
330 	static size_t sumlen;
331 
332 	pattn++;
333 	sumlen += len;
334 
335 	if (!len || f_matchall) {
336 		f_matchall = 1;
337 		return;
338 	}
339 
340 	if (!pattv) {
341 		pattv = malloc(START_PATT_SZ * sizeof(char *));
342 		if (!pattv)
343 			err(2, "malloc");
344 		pattc = 0;
345 	} else if (pattc >= pattmax) {
346 		pattmax *= 2;
347 		pattv = realloc(pattv, pattmax * sizeof(char *));
348 		if (!pattv)
349 			err(2, "realloc");
350 	}
351 	p = malloc(len+1);
352 	if (!p) err(2, "malloc");
353 	memmove(p, s, len);
354 	p[len] = '\0';
355 	pattv[pattc++] = p;
356 }
357 
/*
 * Load patterns from the file fname, one per line, handing each line
 * (without its trailing newline) to add_patt().  Exits with status 2
 * if the file cannot be opened or a read error occurs.
 */
void
load_patt(char *fname)
{
	char	*buf;
	size_t	len;
	FILE	*fr;

	fr = fopen(fname, "r");
	if (!fr)
		err(2, "%s", fname);	/* never use fname as the format */
	while ((buf = fgetln(fr, &len)) != NULL) {
		/* add_patt() copies len bytes, so just drop the newline */
		if (buf[len-1] == '\n')
			len--;
		add_patt(buf, len);
	}
	if (ferror(fr))
		err(2, "%s", fname);
	fclose(fr);
}
378 
379 /*
380  * Compile the collected pattern strings into an array
381  * of regex_t.
382  */
383 regex_t *
384 regcomp_patt(int lpattc, char *lpattv[], int cflags)
385 {
386 	int	i;
387 	int	r;
388 	regex_t *rxv;
389 
390 	if (f_matchall)
391 		return NULL;
392 
393 	rxv = malloc(sizeof(regex_t) * lpattc);
394 	if (!rxv)
395 		err(2, "malloc");
396 	for (i = 0; i < lpattc; i++) {
397 		if ((r = regcomp(&rxv[i], lpattv[i], cflags)) != 0)
398 			err_regerror(r, &rxv[i]);
399 	}
400 	return rxv;
401 }
402 
/*
 * Report the regcomp() error code r for rexp and exit with status 2.
 * regerror() is called twice: first to learn how large the message
 * is, then to fill a buffer of that size.
 */
void
err_regerror(int r, regex_t *rexp)
{
	size_t	 need;
	char	*msg;

	need = regerror(r, rexp, NULL, 0);

	if ((msg = malloc(need)) == NULL)
		err(2, "malloc");

	(void)regerror(r, rexp, msg, need);
	errx(2, "%s", msg);
}
419 
/*
 * Run grep_file() over every name in the NULL-terminated array files
 * and return the sum of the per-file counts.  Exists so that main()
 * can select between this and grep_tree() through a function pointer.
 */
int
grep_files(int regexc, regex_t *regexv, char **files)
{
	int	total = 0;
	int	i;

	for (i = 0; files[i] != NULL; i++)
		total += grep_file(regexc, regexv, files[i]);

	return total;
}
435 
436 /*
437  * Modified from James Howard and Dag-Erling Co?dan Sm?rgrav's grep:
438  * add FTS_D to FTS_DP (especially since D was the one being used)
439  * pass in regex_t array, and set fts flags above in main().
440  */
441 int
442 grep_tree(int regexc, regex_t *regexv, char **paths)
443 {
444 	int	c;
445 	FTS	*fts;
446 	FTSENT	*p;
447 
448 	c = 0;
449 
450 	if (!(fts = fts_open(paths, f_ftsflags, (int (*) ()) NULL)))
451 		err(2, "fts_open");
452 	while ((p = fts_read(fts)) != NULL) {
453 		switch (p->fts_info) {
454 		case FTS_D:
455 		case FTS_DP:
456 		case FTS_DNR:
457 			break;
458 		case FTS_ERR:
459 			errx(2, "%s: %s", p->fts_path, strerror(p->fts_errno));
460 			break;
461 		default:
462 			if (f_debug)
463 				printf("%s\n", p->fts_path);
464 			c += grep_file(regexc, regexv, p->fts_path);
465 			break;
466 		}
467 	}
468 
469 	return c;
470 }
471 
472 /*
473  * Open and grep the named file. If fname is NULL, read
474  * from stdin.
475  */
476 
477 #define isword(x) (isalnum(x) || (x) == '_')
478 
479 int
480 grep_file(int regexc, regex_t *regexv, char *fname)
481 {
482 	int	i;
483 	int	c;
484 	int	n;
485 	int	r;
486 	int	match;
487 	char	*buf;
488 	size_t	b;
489 	size_t	len;
490 	FILE	*fr;
491 	regmatch_t pmatch[1];
492 	regoff_t   so, eo;
493 
494 	b = 0;		/* byte count */
495 	c = 0;		/* match count */
496 	n = 0;		/* line count */
497 
498 	if (!fname) {
499 		fr = stdin;
500 		fname = "(standard input)";
501 	} else {
502 		fr = fopen(fname, "r");
503 		if (!fr) {
504 			if (!f_suppress)
505 				warn("%s", fname);
506 			f_error = 1;
507 			return 0;
508 		}
509 	}
510 
511 	while ((buf = fgetln(fr, &len)) != NULL) {
512 		n++;
513 		if (f_matchall)
514 			goto printmatch;
515 		match = 0;
516 		for (i = 0; i < regexc; i++) {
517 			pmatch[0].rm_so = 0;
518 			pmatch[0].rm_eo = len-1;
519 			r = regexec(&regexv[i], buf, 1, pmatch, REG_STARTEND);
520 			if (r == f_match) {
521 				/*
522 				 * XX gnu grep allows both -w and -x;
523 				 * XX but seems bizarre. sometimes -w seems
524 				 * XX to override, at other times, not.
525 				 * XX Need to figure that out.
526 				 * XX It seems logical to go with the most
527 				 * XX restrictive argument: -x, as -x is
528 				 * XX a boundary case of -w anyhow.
529 				 */
530 				if (f_xmatch) {
531 					if (pmatch[0].rm_so != 0 ||
532 					    pmatch[0].rm_eo != len-1)
533 						continue;
534 				} else if (f_wmatch) {
535 					so = pmatch[0].rm_so;
536 					eo = pmatch[0].rm_eo;
537 					if (!((so == 0 || !isword(buf[so-1])) &&
538 					    (eo == len || !isword(buf[eo]))))
539 						continue;
540 				}
541 				match = 1;
542 				break;
543 			}
544 			/* XX test for regexec() errors ?? */
545 		}
546 		if (match) {
547 printmatch:
548 			c++;
549 			if (f_fnameonly || f_quiet)
550 				break;
551 			if (f_countonly)
552 				continue;
553 			if (f_multifile && !f_nofname)
554 				printf("%s:", fname);
555 			if (f_lineno)
556 				printf("%d:", n);
557 			if (f_bytecount)
558 				printf("%lu:", (unsigned long)b);
559 			fwrite(buf, len, 1, stdout);
560 		}
561 		/* save position in stream before next line */
562 		b += len;
563 	}
564 
565 	if (!buf && ferror(fr)) {
566 		warn("%s", fname);
567 		f_error = 1;
568 		/*
569 		 * XX or do we spit out what result we did have?
570 		 */
571 	} else if (!f_quiet) {
572 		/*
573 		 * XX test -c and -l together: gnu grep
574 		 * XX allows (although ugly), do others?
575 		 */
576 		if (f_countonly) {
577 			if (f_multifile)
578 				printf("%s:", fname);
579 			printf("%d\n", c);
580 		}
581 		if (c && f_fnameonly) {
582 			fputs(fname, stdout);
583 			if (f_zerobyte)
584 				fputc('\0', stdout);
585 			else
586 				fputc('\n', stdout);
587 		}
588 	}
589 
590 	if (fr != stdin)
591 		fclose(fr);
592 
593 	return c;
594 }
595 
596