xref: /minix/minix/usr.bin/grep/util.c (revision 0a6a1f1d)
1 /*	$OpenBSD: util.c,v 1.48 2014/05/20 01:25:23 guenther Exp $	*/
2 
3 /*-
4  * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/types.h>
30 #include <sys/stat.h>
31 
32 #include <ctype.h>
33 #include <err.h>
34 #include <errno.h>
35 #include <fts.h>
36 #include <regex.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <unistd.h>
41 #include <zlib.h>
42 
43 #include "grep.h"
44 
45 /*
46  * Process a file line by line...
47  */
48 
49 static int	linesqueued;
50 static int	procline(str_t *l, int);
51 static int	grep_search(fastgrep_t *, unsigned char *, size_t,
52 	regmatch_t *pmatch);
53 #ifndef SMALL
54 static int	grep_cmp(const unsigned char *, const unsigned char *, size_t);
55 static void	grep_revstr(unsigned char *, int);
56 #endif
57 
58 int
59 grep_tree(char **argv)
60 {
61 	FTS	*fts;
62 	FTSENT	*p;
63 	int	c, fts_flags;
64 
65 	c = 0;
66 
67 	fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR;
68 
69 	if (!(fts = fts_open(argv, fts_flags, NULL)))
70 		err(2, NULL);
71 	while ((p = fts_read(fts)) != NULL) {
72 		switch (p->fts_info) {
73 		case FTS_DNR:
74 			break;
75 		case FTS_ERR:
76 			file_err = 1;
77 			if(!sflag) {
78 				errno = p->fts_errno;
79 				warn("%s", p->fts_path);
80 			}
81 			break;
82 		case FTS_DP:
83 			break;
84 		default:
85 			c += procfile(p->fts_path);
86 			break;
87 		}
88 	}
89 	if (errno)
90 		err(2, "fts_read");
91 
92 	return c;
93 }
94 
95 int
96 procfile(const char *fn)
97 {
98 	str_t ln;
99 	file_t *f;
100 	int c, t, z, nottext;
101 
102 	if (fn == NULL) {
103 		fn = "(standard input)";
104 		f = grep_fdopen(STDIN_FILENO, "r");
105 	} else {
106 		f = grep_open(fn, "r");
107 	}
108 	if (f == NULL) {
109 		file_err = 1;
110 		if (!sflag)
111 			warn("%s", fn);
112 		return 0;
113 	}
114 
115 	nottext = grep_bin_file(f);
116 	if (nottext && binbehave == BIN_FILE_SKIP) {
117 		grep_close(f);
118 		return 0;
119 	}
120 
121 	ln.file = fn;
122 	ln.line_no = 0;
123 	ln.len = 0;
124 	linesqueued = 0;
125 	tail = 0;
126 	ln.off = -1;
127 
128 	if (Bflag > 0)
129 		initqueue();
130 	for (c = 0;  c == 0 || !(lflag || qflag); ) {
131 		ln.off += ln.len + 1;
132 		if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
133 			break;
134 		if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
135 			--ln.len;
136 		ln.line_no++;
137 
138 		z = tail;
139 
140 		if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
141 			enqueue(&ln);
142 			linesqueued++;
143 		}
144 		c += t;
145 	}
146 	if (Bflag > 0)
147 		clearqueue();
148 	grep_close(f);
149 
150 	if (cflag) {
151 		if (!hflag)
152 			printf("%s:", ln.file);
153 		printf("%u\n", c);
154 	}
155 	if (lflag && c != 0)
156 		printf("%s\n", fn);
157 	if (Lflag && c == 0)
158 		printf("%s\n", fn);
159 	if (c && !cflag && !lflag && !Lflag &&
160 	    binbehave == BIN_FILE_BIN && nottext && !qflag)
161 		printf("Binary file %s matches\n", fn);
162 
163 	return c;
164 }
165 
166 
167 /*
168  * Process an individual line in a file. Return non-zero if it matches.
169  */
170 
/*
 * Non-zero when x is alphanumeric or '_'.  The argument is evaluated
 * twice, so it must not have side effects.
 */
#define isword(x) (isalnum((unsigned char)(x)) || (x) == '_')
172 
173 static int
174 procline(str_t *l, int nottext)
175 {
176 	regmatch_t	pmatch;
177 	int		c, i, r;
178 	regoff_t	offset;
179 
180 	/* size_t will be converted to regoff_t. ssize_t is guaranteed to fit
181 	 * into regoff_t */
182 	if (l->len > SSIZE_MAX) {
183 		errx(2, "Line is too big to process");
184 	}
185 
186 	c = 0;
187 	i = 0;
188 	if (matchall) {
189 		c = 1;
190 		goto print;
191 	}
192 
193 	for (i = 0; i < patterns; i++) {
194 		offset = 0;
195 redo:
196 		if (fg_pattern[i].pattern) {
197 			r = grep_search(&fg_pattern[i],
198 			    (unsigned char *)l->dat + offset, l->len - offset,
199 			    &pmatch);
200 			pmatch.rm_so += offset;
201 			pmatch.rm_eo += offset;
202 		} else {
203 			pmatch.rm_so = offset;
204 			pmatch.rm_eo = l->len;
205 			r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags);
206 		}
207 		if (r == 0 && xflag) {
208 			if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
209 				r = REG_NOMATCH;
210 		}
211 		if (r == 0) {
212 			c = 1;
213 			if (oflag && pmatch.rm_so != pmatch.rm_eo)
214 				goto print;
215 			break;
216 		}
217 	}
218 	if (oflag)
219 		return c;
220 print:
221 	if (vflag)
222 		c = !c;
223 
224 	if (c && binbehave == BIN_FILE_BIN && nottext)
225 		return c; /* Binary file */
226 
227 	if ((tail > 0 || c) && !cflag && !qflag) {
228 		if (c) {
229 			if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
230 			    (Aflag || Bflag))
231 				printf("--\n");
232 			first = 1;
233 			tail = Aflag;
234 			if (Bflag > 0)
235 				printqueue();
236 			linesqueued = 0;
237 			printline(l, ':', oflag ? &pmatch : NULL);
238 		} else {
239 			printline(l, '-', oflag ? &pmatch : NULL);
240 			tail--;
241 		}
242 	}
243 	if (oflag && !matchall) {
244 		offset = pmatch.rm_eo;
245 		goto redo;
246 	}
247 	return c;
248 }
249 
250 #ifndef SMALL
251 void
252 fgrepcomp(fastgrep_t *fg, const unsigned char *pat)
253 {
254 	int i;
255 
256 	/* Initialize. */
257 	fg->patternLen = strlen((const char *)pat);
258 	fg->bol = 0;
259 	fg->eol = 0;
260 	fg->wmatch = wflag;
261 	fg->reversedSearch = 0;
262 
263 	/*
264 	 * Make a copy and upper case it for later if in -i mode,
265 	 * else just copy the pointer.
266 	 */
267 	if (iflag) {
268 		fg->pattern = grep_malloc(fg->patternLen + 1);
269 		for (i = 0; i < fg->patternLen; i++)
270 			fg->pattern[i] = toupper(pat[i]);
271 		fg->pattern[fg->patternLen] = '\0';
272 	} else
273 		fg->pattern = __UNCONST(pat);	/* really const */
274 
275 	/* Preprocess pattern. */
276 	for (i = 0; i <= UCHAR_MAX; i++)
277 		fg->qsBc[i] = fg->patternLen;
278 	for (i = 1; i < fg->patternLen; i++) {
279 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
280 		/*
281 		 * If case is ignored, make the jump apply to both upper and
282 		 * lower cased characters.  As the pattern is stored in upper
283 		 * case, apply the same to the lower case equivalents.
284 		 */
285 		if (iflag)
286 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
287 	}
288 }
289 #endif
290 
291 /*
292  * Returns: -1 on failure, 0 on success
293  */
294 int
295 fastcomp(fastgrep_t *fg, const char *pat)
296 {
297 #ifdef SMALL
298 	return -1;
299 #else
300 	int i;
301 	int bol = 0;
302 	int eol = 0;
303 	int shiftPatternLen;
304 	int hasDot = 0;
305 	int firstHalfDot = -1;
306 	int firstLastHalfDot = -1;
307 	int lastHalfDot = 0;
308 
309 	/* Initialize. */
310 	fg->patternLen = strlen(pat);
311 	fg->bol = 0;
312 	fg->eol = 0;
313 	fg->wmatch = 0;
314 	fg->reversedSearch = 0;
315 
316 	/* Remove end-of-line character ('$'). */
317 	if (fg->patternLen > 0 && pat[fg->patternLen - 1] == '$') {
318 		eol++;
319 		fg->eol = 1;
320 		fg->patternLen--;
321 	}
322 
323 	/* Remove beginning-of-line character ('^'). */
324 	if (pat[0] == '^') {
325 		bol++;
326 		fg->bol = 1;
327 		fg->patternLen--;
328 	}
329 
330 	/* Remove enclosing [[:<:]] and [[:>:]] (word match). */
331 	if (wflag) {
332 		/* basic re's use \( \), extended re's ( ) */
333 		int extra = Eflag ? 1 : 2;
334 		fg->patternLen -= 14 + 2 * extra;
335 		fg->wmatch = 7 + extra;
336 	} else if (fg->patternLen >= 14 &&
337 	    strncmp(pat + fg->bol, "[[:<:]]", 7) == 0 &&
338 	    strncmp(pat + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
339 		fg->patternLen -= 14;
340 		fg->wmatch = 7;
341 	}
342 
343 	/*
344 	 * Copy pattern minus '^' and '$' characters as well as word
345 	 * match character classes at the beginning and ending of the
346 	 * string respectively.
347 	 */
348 	fg->pattern = grep_malloc(fg->patternLen + 1);
349 	memcpy(fg->pattern, pat + bol + fg->wmatch, fg->patternLen);
350 	fg->pattern[fg->patternLen] = '\0';
351 
352 	/* Look for ways to cheat...er...avoid the full regex engine. */
353 	for (i = 0; i < fg->patternLen; i++)
354 	{
355 		switch (fg->pattern[i]) {
356 		case '.':
357 			hasDot = i;
358 			if (i < fg->patternLen / 2) {
359 				if (firstHalfDot < 0)
360 					/* Closest dot to the beginning */
361 					firstHalfDot = i;
362 			} else {
363 				/* Closest dot to the end of the pattern. */
364 				lastHalfDot = i;
365 				if (firstLastHalfDot < 0)
366 					firstLastHalfDot = i;
367 			}
368 			break;
369 		case '(': case ')':
370 		case '{': case '}':
371 			/* Special in BRE if preceded by '\\' */
372 		case '?':
373 		case '+':
374 		case '|':
375 			/* Not special in BRE. */
376 			if (!Eflag)
377 				goto nonspecial;
378 		case '\\':
379 		case '*':
380 		case '[': case ']':
381 			/* Free memory and let others know this is empty. */
382 			free(fg->pattern);
383 			fg->pattern = NULL;
384 			return (-1);
385 		default:
386 nonspecial:
387 			if (iflag)
388 				fg->pattern[i] = toupper(fg->pattern[i]);
389 			break;
390 		}
391 	}
392 
393 	/*
394 	 * Determine if a reverse search would be faster based on the placement
395 	 * of the dots.
396 	 */
397 	if ((!(lflag || cflag)) && ((!(bol || eol)) &&
398 	    ((lastHalfDot) && ((firstHalfDot < 0) ||
399 	    ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
400 		fg->reversedSearch = 1;
401 		hasDot = fg->patternLen - (firstHalfDot < 0 ?
402 		    firstLastHalfDot : firstHalfDot) - 1;
403 		grep_revstr(fg->pattern, fg->patternLen);
404 	}
405 
406 	/*
407 	 * Normal Quick Search would require a shift based on the position the
408 	 * next character after the comparison is within the pattern.  With
409 	 * wildcards, the position of the last dot effects the maximum shift
410 	 * distance.
411 	 * The closer to the end the wild card is the slower the search.  A
412 	 * reverse version of this algorithm would be useful for wildcards near
413 	 * the end of the string.
414 	 *
415 	 * Examples:
416 	 * Pattern	Max shift
417 	 * -------	---------
418 	 * this		5
419 	 * .his		4
420 	 * t.is		3
421 	 * th.s		2
422 	 * thi.		1
423 	 */
424 
425 	/* Adjust the shift based on location of the last dot ('.'). */
426 	shiftPatternLen = fg->patternLen - hasDot;
427 
428 	/* Preprocess pattern. */
429 	for (i = 0; i <= UCHAR_MAX; i++)
430 		fg->qsBc[i] = shiftPatternLen;
431 	for (i = hasDot + 1; i < fg->patternLen; i++) {
432 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
433 		/*
434 		 * If case is ignored, make the jump apply to both upper and
435 		 * lower cased characters.  As the pattern is stored in upper
436 		 * case, apply the same to the lower case equivalents.
437 		 */
438 		if (iflag)
439 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
440 	}
441 
442 	/*
443 	 * Put pattern back to normal after pre-processing to allow for easy
444 	 * comparisons later.
445 	 */
446 	if (fg->reversedSearch)
447 		grep_revstr(fg->pattern, fg->patternLen);
448 
449 	return (0);
450 #endif
451 }
452 
453 /*
454  * Word boundaries using regular expressions are defined as the point
455  * of transition from a non-word char to a word char, or vice versa.
456  * This means that grep -w +a and grep -w a+ never match anything,
457  * because they lack a starting or ending transition, but grep -w a+b
458  * does match a line containing a+b.
459  */
/*
 * d: data, l: its length, s/e: start and end offsets of a candidate match.
 * Every argument is parenthesised so that expressions may be passed;
 * arguments are evaluated more than once and must be free of side effects.
 */
#define wmatch(d, l, s, e)	\
	(((s) == 0 || !isword((d)[(s) - 1])) && \
	 ((e) == (l) || !isword((d)[(e)])) && \
	 (e) > (s) && isword((d)[(s)]) && isword((d)[(e) - 1]))
463 
464 static int
465 grep_search(fastgrep_t *fg, unsigned char *data, size_t dataLen,
466 	regmatch_t *pmatch)
467 {
468 #ifdef SMALL
469 	return 0;
470 #else
471 	regoff_t j;
472 	int rtrnVal = REG_NOMATCH;
473 
474 	pmatch->rm_so = -1;
475 	pmatch->rm_eo = -1;
476 
477 	/* No point in going farther if we do not have enough data. */
478 	if (dataLen < (size_t)fg->patternLen)
479 		return (rtrnVal);
480 
481 	/* Only try once at the beginning or ending of the line. */
482 	if (fg->bol || fg->eol) {
483 		/* Simple text comparison. */
484 		/* Verify data is >= pattern length before searching on it. */
485 		if (dataLen >= (size_t)fg->patternLen) {
486 			/* Determine where in data to start search at. */
487 			if (fg->eol)
488 				j = dataLen - fg->patternLen;
489 			else
490 				j = 0;
491 			if (!((fg->bol && fg->eol) &&
492 			    (dataLen != (size_t)fg->patternLen)))
493 				if (grep_cmp(fg->pattern, data + j,
494 				    fg->patternLen) == -1) {
495 					pmatch->rm_so = j;
496 					pmatch->rm_eo = j + fg->patternLen;
497 					if (!fg->wmatch || wmatch(data, dataLen,
498 					    pmatch->rm_so, pmatch->rm_eo))
499 						rtrnVal = 0;
500 				}
501 		}
502 	} else if (fg->reversedSearch) {
503 		/* Quick Search algorithm. */
504 		j = dataLen;
505 		do {
506 			if (grep_cmp(fg->pattern, data + j - fg->patternLen,
507 			    fg->patternLen) == -1) {
508 				pmatch->rm_so = j - fg->patternLen;
509 				pmatch->rm_eo = j;
510 				if (!fg->wmatch || wmatch(data, dataLen,
511 				    pmatch->rm_so, pmatch->rm_eo)) {
512 					rtrnVal = 0;
513 					break;
514 				}
515 			}
516 			/* Shift if within bounds, otherwise, we are done. */
517 			if (j == fg->patternLen)
518 				break;
519 			j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]];
520 		} while (j >= fg->patternLen);
521 	} else {
522 		/* Quick Search algorithm. */
523 		j = 0;
524 		do {
525 			if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
526 				pmatch->rm_so = j;
527 				pmatch->rm_eo = j + fg->patternLen;
528 				if (fg->patternLen == 0 || !fg->wmatch ||
529 				    wmatch(data, dataLen, pmatch->rm_so,
530 				    pmatch->rm_eo)) {
531 					rtrnVal = 0;
532 					break;
533 				}
534 			}
535 
536 			/* Shift if within bounds, otherwise, we are done. */
537 			if (j + fg->patternLen == dataLen)
538 				break;
539 			else
540 				j += fg->qsBc[(unsigned char)data[j + fg->patternLen]];
541 		} while (j <= (dataLen - fg->patternLen));
542 	}
543 
544 	return (rtrnVal);
545 #endif
546 }
547 
548 
/*
 * malloc() wrapper that never returns NULL: if the allocation fails the
 * program exits with status 2 through err().
 */
void *
grep_malloc(size_t size)
{
	void	*mem = malloc(size);

	if (mem == NULL)
		err(2, "malloc");
	return mem;
}
558 
/*
 * calloc() wrapper that never returns NULL: if the allocation fails the
 * program exits with status 2 through err().
 */
void *
grep_calloc(size_t nmemb, size_t size)
{
	void	*mem = calloc(nmemb, size);

	if (mem == NULL)
		err(2, "calloc");
	return mem;
}
568 
/*
 * realloc() wrapper that never returns NULL: if the reallocation fails
 * the program exits with status 2 through err().
 */
void *
grep_realloc(void *ptr, size_t size)
{
	void	*resized = realloc(ptr, size);

	if (resized == NULL)
		err(2, "realloc");
	return resized;
}
576 
577 #ifndef SMALL
578 /*
579  * Returns:	i >= 0 on failure (position that it failed)
580  *		-1 on success
581  */
582 static int
583 grep_cmp(const unsigned char *pat, const unsigned char *data, size_t len)
584 {
585 	size_t i;
586 
587 	for (i = 0; i < len; i++) {
588 		if (((pat[i] == data[i]) || (!Fflag && pat[i] == '.'))
589 		    || (iflag && pat[i] == toupper(data[i])))
590 			continue;
591 		return (i);
592 	}
593 
594 	return (-1);
595 }
596 
/*
 * Reverse the first len bytes of str in place.  A len of 0 or 1 leaves
 * the buffer untouched.
 */
static void
grep_revstr(unsigned char *str, int len)
{
	int i;
	/* Same type as the data, so bytes above CHAR_MAX survive the swap. */
	unsigned char c;

	for (i = 0; i < len / 2; i++) {
		c = str[i];
		str[i] = str[len - i - 1];
		str[len - i - 1] = c;
	}
}
609 #endif
610 
611 void
612 printline(str_t *line, int sep, regmatch_t *pmatch)
613 {
614 	int n;
615 
616 	n = 0;
617 	if (!hflag) {
618 		fputs(line->file, stdout);
619 		++n;
620 	}
621 	if (nflag) {
622 		if (n)
623 			putchar(sep);
624 		printf("%d", line->line_no);
625 		++n;
626 	}
627 	if (bflag) {
628 		if (n)
629 			putchar(sep);
630 		printf("%lld", (long long)line->off);
631 		++n;
632 	}
633 	if (n)
634 		putchar(sep);
635 	if (pmatch)
636 		fwrite(line->dat + pmatch->rm_so,
637 		    pmatch->rm_eo - pmatch->rm_so, 1, stdout);
638 	else
639 		fwrite(line->dat, line->len, 1, stdout);
640 	putchar('\n');
641 }
642