xref: /openbsd/usr.bin/grep/util.c (revision 274d7c50)
1 /*	$OpenBSD: util.c,v 1.61 2019/10/07 17:47:32 tedu Exp $	*/
2 
3 /*-
4  * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/types.h>
30 #include <sys/stat.h>
31 
32 #include <ctype.h>
33 #include <err.h>
34 #include <errno.h>
35 #include <fts.h>
36 #include <regex.h>
37 #include <stdbool.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <unistd.h>
42 #include <zlib.h>
43 
44 #include "grep.h"
45 
46 /*
47  * Process a file line by line...
48  */
49 
50 static int	linesqueued;
51 static int	procline(str_t *l, int);
52 static int	grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch, int);
53 #ifndef SMALL
54 static bool	grep_cmp(const char *, const char *, size_t);
55 static void	grep_revstr(unsigned char *, int);
56 #endif
57 
58 int
59 grep_tree(char **argv)
60 {
61 	FTS	*fts;
62 	FTSENT	*p;
63 	int	c, fts_flags;
64 
65 	c = 0;
66 
67 	fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR;
68 
69 	if (!(fts = fts_open(argv, fts_flags, NULL)))
70 		err(2, NULL);
71 	while ((p = fts_read(fts)) != NULL) {
72 		switch (p->fts_info) {
73 		case FTS_DNR:
74 			break;
75 		case FTS_ERR:
76 			file_err = 1;
77 			if(!sflag)
78 				warnc(p->fts_errno, "%s", p->fts_path);
79 			break;
80 		case FTS_D:
81 		case FTS_DP:
82 			break;
83 		default:
84 			c += procfile(p->fts_path);
85 			break;
86 		}
87 	}
88 	if (errno)
89 		err(2, "fts_read");
90 	fts_close(fts);
91 	return c;
92 }
93 
94 int
95 procfile(char *fn)
96 {
97 	str_t ln;
98 	file_t *f;
99 	int c, t, z, nottext;
100 
101 	mcount = mlimit;
102 
103 	if (fn == NULL) {
104 		fn = "(standard input)";
105 		f = grep_fdopen(STDIN_FILENO);
106 	} else {
107 		f = grep_open(fn);
108 	}
109 	if (f == NULL) {
110 		if (errno == EISDIR)
111 			return 0;
112 		file_err = 1;
113 		if (!sflag)
114 			warn("%s", fn);
115 		return 0;
116 	}
117 
118 	nottext = grep_bin_file(f);
119 	if (nottext && binbehave == BIN_FILE_SKIP) {
120 		grep_close(f);
121 		return 0;
122 	}
123 
124 	ln.file = fn;
125 	if (labelname)
126 		ln.file = (char *)labelname;
127 	ln.line_no = 0;
128 	ln.len = 0;
129 	linesqueued = 0;
130 	tail = 0;
131 	ln.off = -1;
132 
133 	if (Bflag > 0)
134 		initqueue();
135 	for (c = 0;  c == 0 || !(lflag || qflag); ) {
136 		if (mflag && mlimit == 0)
137 			break;
138 		ln.off += ln.len + 1;
139 		if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
140 			break;
141 		if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
142 			--ln.len;
143 		ln.line_no++;
144 
145 		z = tail;
146 
147 		if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
148 			enqueue(&ln);
149 			linesqueued++;
150 		}
151 		c += t;
152 		if (mflag && mcount <= 0)
153 			break;
154 	}
155 	if (Bflag > 0)
156 		clearqueue();
157 	grep_close(f);
158 
159 	if (cflag) {
160 		if (!hflag)
161 			printf("%s:", ln.file);
162 		printf("%u\n", c);
163 	}
164 	if (lflag && c != 0)
165 		printf("%s\n", fn);
166 	if (Lflag && c == 0)
167 		printf("%s\n", fn);
168 	if (c && !cflag && !lflag && !Lflag &&
169 	    binbehave == BIN_FILE_BIN && nottext && !qflag)
170 		printf("Binary file %s matches\n", fn);
171 
172 	return c;
173 }
174 
175 
176 /*
177  * Process an individual line in a file. Return non-zero if it matches.
178  */
179 
180 #define isword(x) (isalnum((unsigned char)x) || (x) == '_')
181 
182 static int
183 procline(str_t *l, int nottext)
184 {
185 	regmatch_t	pmatch = { 0 };
186 	int		c, i, r;
187 	regoff_t	offset;
188 
189 	/* size_t will be converted to regoff_t. ssize_t is guaranteed to fit
190 	 * into regoff_t */
191 	if (l->len > SSIZE_MAX) {
192 		errx(2, "Line is too big to process");
193 	}
194 
195 	c = 0;
196 	i = 0;
197 	if (matchall) {
198 		c = 1;
199 		goto print;
200 	}
201 
202 	for (i = 0; i < patterns; i++) {
203 		offset = 0;
204 redo:
205 		if (fg_pattern[i].pattern) {
206 			int flags = 0;
207 			if (offset)
208 				flags |= REG_NOTBOL;
209 			r = grep_search(&fg_pattern[i], l->dat + offset,
210 			    l->len - offset, &pmatch, flags);
211 			pmatch.rm_so += offset;
212 			pmatch.rm_eo += offset;
213 		} else {
214 			int flags = eflags;
215 			if (offset)
216 				flags |= REG_NOTBOL;
217 			pmatch.rm_so = offset;
218 			pmatch.rm_eo = l->len;
219 			r = regexec(&r_pattern[i], l->dat, 1, &pmatch, flags);
220 		}
221 		if (r == 0 && xflag) {
222 			if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
223 				r = REG_NOMATCH;
224 		}
225 		if (r == 0) {
226 			c = 1;
227 			if (oflag && pmatch.rm_so != pmatch.rm_eo)
228 				goto print;
229 			break;
230 		}
231 	}
232 	if (oflag)
233 		return c;
234 print:
235 	if (vflag)
236 		c = !c;
237 
238 	/* Count the matches if we have a match limit */
239 	if (mflag)
240 		mcount -= c;
241 
242 	if (c && binbehave == BIN_FILE_BIN && nottext)
243 		return c; /* Binary file */
244 
245 	if ((tail > 0 || c) && !cflag && !qflag) {
246 		if (c) {
247 			if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
248 			    (Aflag || Bflag))
249 				printf("--\n");
250 			first = 1;
251 			tail = Aflag;
252 			if (Bflag > 0)
253 				printqueue();
254 			linesqueued = 0;
255 			printline(l, ':', oflag ? &pmatch : NULL);
256 		} else {
257 			printline(l, '-', oflag ? &pmatch : NULL);
258 			tail--;
259 		}
260 	}
261 	if (oflag && !matchall) {
262 		offset = pmatch.rm_eo;
263 		goto redo;
264 	}
265 	return c;
266 }
267 
268 #ifndef SMALL
269 void
270 fgrepcomp(fastgrep_t *fg, const unsigned char *pattern)
271 {
272 	int i;
273 
274 	/* Initialize. */
275 	fg->patternLen = strlen(pattern);
276 	fg->bol = 0;
277 	fg->eol = 0;
278 	fg->wmatch = wflag;
279 	fg->reversedSearch = 0;
280 
281 	/*
282 	 * Make a copy and upper case it for later if in -i mode,
283 	 * else just copy the pointer.
284 	 */
285 	if (iflag) {
286 		fg->pattern = grep_malloc(fg->patternLen + 1);
287 		for (i = 0; i < fg->patternLen; i++)
288 			fg->pattern[i] = toupper(pattern[i]);
289 		fg->pattern[fg->patternLen] = '\0';
290 	} else
291 		fg->pattern = (unsigned char *)pattern;	/* really const */
292 
293 	/* Preprocess pattern. */
294 	for (i = 0; i <= UCHAR_MAX; i++)
295 		fg->qsBc[i] = fg->patternLen;
296 	for (i = 1; i < fg->patternLen; i++) {
297 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
298 		/*
299 		 * If case is ignored, make the jump apply to both upper and
300 		 * lower cased characters.  As the pattern is stored in upper
301 		 * case, apply the same to the lower case equivalents.
302 		 */
303 		if (iflag)
304 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
305 	}
306 }
307 #endif
308 
309 /*
310  * Returns: -1 on failure, 0 on success
311  */
312 int
313 fastcomp(fastgrep_t *fg, const char *pattern)
314 {
315 #ifdef SMALL
316 	return -1;
317 #else
318 	int i;
319 	int bol = 0;
320 	int eol = 0;
321 	int shiftPatternLen;
322 	int hasDot = 0;
323 	int firstHalfDot = -1;
324 	int firstLastHalfDot = -1;
325 	int lastHalfDot = 0;
326 
327 	/* Initialize. */
328 	fg->patternLen = strlen(pattern);
329 	fg->bol = 0;
330 	fg->eol = 0;
331 	fg->wmatch = 0;
332 	fg->reversedSearch = 0;
333 
334 	/* Remove end-of-line character ('$'). */
335 	if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') {
336 		eol++;
337 		fg->eol = 1;
338 		fg->patternLen--;
339 	}
340 
341 	/* Remove beginning-of-line character ('^'). */
342 	if (pattern[0] == '^') {
343 		bol++;
344 		fg->bol = 1;
345 		fg->patternLen--;
346 	}
347 
348 	/* Remove enclosing [[:<:]] and [[:>:]] (word match). */
349 	if (wflag) {
350 		/* basic re's use \( \), extended re's ( ) */
351 		int extra = Eflag ? 1 : 2;
352 		fg->patternLen -= 14 + 2 * extra;
353 		fg->wmatch = 7 + extra;
354 	} else if (fg->patternLen >= 14 &&
355 	    strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 &&
356 	    strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
357 		fg->patternLen -= 14;
358 		fg->wmatch = 7;
359 	}
360 
361 	/*
362 	 * Copy pattern minus '^' and '$' characters as well as word
363 	 * match character classes at the beginning and ending of the
364 	 * string respectively.
365 	 */
366 	fg->pattern = grep_malloc(fg->patternLen + 1);
367 	memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen);
368 	fg->pattern[fg->patternLen] = '\0';
369 
370 	/* Look for ways to cheat...er...avoid the full regex engine. */
371 	for (i = 0; i < fg->patternLen; i++)
372 	{
373 		switch (fg->pattern[i]) {
374 		case '.':
375 			hasDot = i;
376 			if (i < fg->patternLen / 2) {
377 				if (firstHalfDot < 0)
378 					/* Closest dot to the beginning */
379 					firstHalfDot = i;
380 			} else {
381 				/* Closest dot to the end of the pattern. */
382 				lastHalfDot = i;
383 				if (firstLastHalfDot < 0)
384 					firstLastHalfDot = i;
385 			}
386 			break;
387 		case '(': case ')':
388 		case '{': case '}':
389 			/* Special in BRE if preceded by '\\' */
390 		case '?':
391 		case '+':
392 		case '|':
393 			/* Not special in BRE. */
394 			if (!Eflag)
395 				goto nonspecial;
396 		case '\\':
397 		case '*':
398 		case '[': case ']':
399 			/* Free memory and let others know this is empty. */
400 			free(fg->pattern);
401 			fg->pattern = NULL;
402 			return (-1);
403 		default:
404 nonspecial:
405 			if (iflag)
406 				fg->pattern[i] = toupper(fg->pattern[i]);
407 			break;
408 		}
409 	}
410 
411 	/*
412 	 * Determine if a reverse search would be faster based on the placement
413 	 * of the dots.
414 	 */
415 	if ((!(lflag || cflag || oflag)) && ((!(bol || eol)) &&
416 	    ((lastHalfDot) && ((firstHalfDot < 0) ||
417 	    ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
418 		fg->reversedSearch = 1;
419 		hasDot = fg->patternLen - (firstHalfDot < 0 ?
420 		    firstLastHalfDot : firstHalfDot) - 1;
421 		grep_revstr(fg->pattern, fg->patternLen);
422 	}
423 
424 	/*
425 	 * Normal Quick Search would require a shift based on the position the
426 	 * next character after the comparison is within the pattern.  With
427 	 * wildcards, the position of the last dot effects the maximum shift
428 	 * distance.
429 	 * The closer to the end the wild card is the slower the search.  A
430 	 * reverse version of this algorithm would be useful for wildcards near
431 	 * the end of the string.
432 	 *
433 	 * Examples:
434 	 * Pattern	Max shift
435 	 * -------	---------
436 	 * this		5
437 	 * .his		4
438 	 * t.is		3
439 	 * th.s		2
440 	 * thi.		1
441 	 */
442 
443 	/* Adjust the shift based on location of the last dot ('.'). */
444 	shiftPatternLen = fg->patternLen - hasDot;
445 
446 	/* Preprocess pattern. */
447 	for (i = 0; i <= UCHAR_MAX; i++)
448 		fg->qsBc[i] = shiftPatternLen;
449 	for (i = hasDot + 1; i < fg->patternLen; i++) {
450 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
451 		/*
452 		 * If case is ignored, make the jump apply to both upper and
453 		 * lower cased characters.  As the pattern is stored in upper
454 		 * case, apply the same to the lower case equivalents.
455 		 */
456 		if (iflag)
457 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
458 	}
459 
460 	/*
461 	 * Put pattern back to normal after pre-processing to allow for easy
462 	 * comparisons later.
463 	 */
464 	if (fg->reversedSearch)
465 		grep_revstr(fg->pattern, fg->patternLen);
466 
467 	return (0);
468 #endif
469 }
470 
471 /*
472  * Word boundaries using regular expressions are defined as the point
473  * of transition from a non-word char to a word char, or vice versa.
474  * This means that grep -w +a and grep -w a+ never match anything,
475  * because they lack a starting or ending transition, but grep -w a+b
476  * does match a line containing a+b.
477  */
478 #define wmatch(d, l, s, e)	\
479 	((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \
480 	  e > s && isword(d[s]) && isword(d[e-1]))
481 
482 static int
483 grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch,
484     int flags)
485 {
486 #ifdef SMALL
487 	return 0;
488 #else
489 	regoff_t j;
490 	int rtrnVal = REG_NOMATCH;
491 
492 	pmatch->rm_so = -1;
493 	pmatch->rm_eo = -1;
494 
495 	/* No point in going farther if we do not have enough data. */
496 	if (dataLen < fg->patternLen)
497 		return (rtrnVal);
498 
499 	/* Only try once at the beginning or ending of the line. */
500 	if (fg->bol || fg->eol) {
501 		if (fg->bol && (flags & REG_NOTBOL))
502 			return 0;
503 		/* Simple text comparison. */
504 		/* Verify data is >= pattern length before searching on it. */
505 		if (dataLen >= fg->patternLen) {
506 			/* Determine where in data to start search at. */
507 			if (fg->eol)
508 				j = dataLen - fg->patternLen;
509 			else
510 				j = 0;
511 			if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
512 				if (grep_cmp(fg->pattern, data + j,
513 				    fg->patternLen)) {
514 					pmatch->rm_so = j;
515 					pmatch->rm_eo = j + fg->patternLen;
516 					if (!fg->wmatch || wmatch(data, dataLen,
517 					    pmatch->rm_so, pmatch->rm_eo))
518 						rtrnVal = 0;
519 				}
520 		}
521 	} else if (fg->reversedSearch) {
522 		/* Quick Search algorithm. */
523 		j = dataLen;
524 		do {
525 			if (grep_cmp(fg->pattern, data + j - fg->patternLen,
526 			    fg->patternLen)) {
527 				pmatch->rm_so = j - fg->patternLen;
528 				pmatch->rm_eo = j;
529 				if (!fg->wmatch || wmatch(data, dataLen,
530 				    pmatch->rm_so, pmatch->rm_eo)) {
531 					rtrnVal = 0;
532 					break;
533 				}
534 			}
535 			/* Shift if within bounds, otherwise, we are done. */
536 			if (j == fg->patternLen)
537 				break;
538 			j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]];
539 		} while (j >= fg->patternLen);
540 	} else {
541 		/* Quick Search algorithm. */
542 		j = 0;
543 		do {
544 			if (grep_cmp(fg->pattern, data + j, fg->patternLen)) {
545 				pmatch->rm_so = j;
546 				pmatch->rm_eo = j + fg->patternLen;
547 				if (fg->patternLen == 0 || !fg->wmatch ||
548 				    wmatch(data, dataLen, pmatch->rm_so,
549 				    pmatch->rm_eo)) {
550 					rtrnVal = 0;
551 					break;
552 				}
553 			}
554 
555 			/* Shift if within bounds, otherwise, we are done. */
556 			if (j + fg->patternLen == dataLen)
557 				break;
558 			else
559 				j += fg->qsBc[(unsigned char)data[j + fg->patternLen]];
560 		} while (j <= (dataLen - fg->patternLen));
561 	}
562 
563 	return (rtrnVal);
564 #endif
565 }
566 
567 
568 void *
569 grep_malloc(size_t size)
570 {
571 	void	*ptr;
572 
573 	if ((ptr = malloc(size)) == NULL)
574 		err(2, "malloc");
575 	return ptr;
576 }
577 
578 void *
579 grep_calloc(size_t nmemb, size_t size)
580 {
581 	void	*ptr;
582 
583 	if ((ptr = calloc(nmemb, size)) == NULL)
584 		err(2, "calloc");
585 	return ptr;
586 }
587 
588 void *
589 grep_realloc(void *ptr, size_t size)
590 {
591 	if ((ptr = realloc(ptr, size)) == NULL)
592 		err(2, "realloc");
593 	return ptr;
594 }
595 
596 void *
597 grep_reallocarray(void *ptr, size_t nmemb, size_t size)
598 {
599 	if ((ptr = reallocarray(ptr, nmemb, size)) == NULL)
600 		err(2, "reallocarray");
601 	return ptr;
602 }
603 
604 #ifndef SMALL
605 /*
606  * Returns:	true on success, false on failure
607  */
608 static bool
609 grep_cmp(const char *pattern, const char *data, size_t len)
610 {
611 	size_t i;
612 
613 	for (i = 0; i < len; i++) {
614 		if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.'))
615 		    || (iflag && pattern[i] == toupper((unsigned char)data[i])))
616 			continue;
617 		return false;
618 	}
619 
620 	return true;
621 }
622 
623 static void
624 grep_revstr(unsigned char *str, int len)
625 {
626 	int i;
627 	char c;
628 
629 	for (i = 0; i < len / 2; i++) {
630 		c = str[i];
631 		str[i] = str[len - i - 1];
632 		str[len - i - 1] = c;
633 	}
634 }
635 #endif
636 
637 void
638 printline(str_t *line, int sep, regmatch_t *pmatch)
639 {
640 	int n;
641 
642 	n = 0;
643 	if (!hflag) {
644 		fputs(line->file, stdout);
645 		++n;
646 	}
647 	if (nflag) {
648 		if (n)
649 			putchar(sep);
650 		printf("%lld", line->line_no);
651 		++n;
652 	}
653 	if (bflag) {
654 		if (n)
655 			putchar(sep);
656 		printf("%lld", (long long)line->off +
657 		    (pmatch ? pmatch->rm_so : 0));
658 		++n;
659 	}
660 	if (n)
661 		putchar(sep);
662 	if (pmatch)
663 		fwrite(line->dat + pmatch->rm_so,
664 		    pmatch->rm_eo - pmatch->rm_so, 1, stdout);
665 	else
666 		fwrite(line->dat, line->len, 1, stdout);
667 	putchar('\n');
668 }
669