xref: /openbsd/usr.bin/grep/util.c (revision 4bdff4be)
1 /*	$OpenBSD: util.c,v 1.68 2023/11/15 00:50:43 millert Exp $	*/
2 
3 /*-
4  * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/types.h>
30 #include <sys/stat.h>
31 
32 #include <ctype.h>
33 #include <err.h>
34 #include <errno.h>
35 #include <fts.h>
36 #include <regex.h>
37 #include <stdbool.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <unistd.h>
42 #include <zlib.h>
43 
44 #include "grep.h"
45 
46 /*
47  * Process a file line by line...
48  */
49 
50 static int	linesqueued;
51 static int	procline(str_t *l, int);
52 static int	grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch, int);
53 #ifndef SMALL
54 static bool	grep_cmp(const char *, const char *, size_t);
55 static void	grep_revstr(unsigned char *, int);
56 #endif
57 
58 int
59 grep_tree(char **argv)
60 {
61 	FTS	*fts;
62 	FTSENT	*p;
63 	int	c, fts_flags;
64 	char	*dot_argv[] = { ".", NULL };
65 	char	*path;
66 
67 	/* default to . if no path given */
68 	if (argv[0] == NULL)
69 		argv = dot_argv;
70 
71 	c = 0;
72 
73 	fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR;
74 
75 	if (!(fts = fts_open(argv, fts_flags, NULL)))
76 		err(2, NULL);
77 	while ((p = fts_read(fts)) != NULL) {
78 		switch (p->fts_info) {
79 		case FTS_DNR:
80 			break;
81 		case FTS_ERR:
82 			file_err = 1;
83 			if(!sflag)
84 				warnc(p->fts_errno, "%s", p->fts_path);
85 			break;
86 		case FTS_D:
87 		case FTS_DP:
88 			break;
89 		default:
90 			path = p->fts_path;
91 			/* skip "./" if implied */
92 			if (argv == dot_argv && p->fts_pathlen >= 2)
93 				path += 2;
94 			c |= procfile(path);
95 			break;
96 		}
97 	}
98 	if (errno)
99 		err(2, "fts_read");
100 	fts_close(fts);
101 	return c;
102 }
103 
104 int
105 procfile(char *fn)
106 {
107 	str_t ln;
108 	file_t *f;
109 	int t, z, nottext, overflow = 0;
110 	unsigned long long c;
111 
112 	mcount = mlimit;
113 
114 	if (fn == NULL) {
115 		fn = "(standard input)";
116 		f = grep_fdopen(STDIN_FILENO);
117 	} else {
118 		f = grep_open(fn);
119 	}
120 	if (f == NULL) {
121 		if (errno == EISDIR)
122 			return 0;
123 		file_err = 1;
124 		if (!sflag)
125 			warn("%s", fn);
126 		return 0;
127 	}
128 
129 	nottext = grep_bin_file(f);
130 	if (nottext && binbehave == BIN_FILE_SKIP) {
131 		grep_close(f);
132 		return 0;
133 	}
134 
135 	ln.file = fn;
136 	if (labelname)
137 		ln.file = (char *)labelname;
138 	ln.line_no = 0;
139 	ln.len = 0;
140 	linesqueued = 0;
141 	tail = 0;
142 	ln.off = -1;
143 
144 	if (Bflag > 0)
145 		initqueue();
146 	for (c = 0;  c == 0 || !(lflag || qflag); ) {
147 		if (mflag && mlimit == 0)
148 			break;
149 		ln.off += ln.len + 1;
150 		if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
151 			break;
152 		if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
153 			--ln.len;
154 		ln.line_no++;
155 
156 		z = tail;
157 
158 		if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
159 			enqueue(&ln);
160 			linesqueued++;
161 		}
162 		if (ULLONG_MAX - c < (unsigned long long)t)
163 			overflow = 1;
164 		else
165 			c += t;
166 		if (mflag && mcount <= 0 && tail <= 0)
167 			break;
168 	}
169 	if (Bflag > 0)
170 		clearqueue();
171 	grep_close(f);
172 
173 	if (cflag) {
174 		if (!hflag)
175 			printf("%s%c", ln.file, nullflag ? '\0' : ':');
176 		printf("%llu%s\n", c, overflow ? "+" : "");
177 	}
178 	if (lflag && c != 0)
179 		printf("%s%c", fn, nullflag ? '\0' : '\n');
180 	if (Lflag && c == 0)
181 		printf("%s%c", fn, nullflag ? '\0' : '\n');
182 	if (c && !cflag && !lflag && !Lflag &&
183 	    binbehave == BIN_FILE_BIN && nottext && !qflag)
184 		printf("Binary file %s matches\n", fn);
185 
186 	return overflow || c != 0;
187 }
188 
189 
190 /*
191  * Process an individual line in a file. Return non-zero if it matches.
192  */
193 
194 #define isword(x) (isalnum((unsigned char)x) || (x) == '_')
195 
196 static int
197 procline(str_t *l, int nottext)
198 {
199 	regmatch_t	pmatch = { 0 };
200 	int		c, i, r, counted;
201 	regoff_t	offset;
202 
203 	/* size_t will be converted to regoff_t. ssize_t is guaranteed to fit
204 	 * into regoff_t */
205 	if (l->len > SSIZE_MAX) {
206 		errx(2, "Line is too big to process");
207 	}
208 
209 	c = 0;
210 	i = 0;
211 	counted = 0;
212 	if (matchall) {
213 		c = 1;
214 		goto print;
215 	}
216 	if (mflag && mcount <= 0)
217 		goto print;
218 
219 	for (i = 0; i < patterns; i++) {
220 		offset = 0;
221 redo:
222 		if (fg_pattern[i].pattern) {
223 			int flags = 0;
224 			if (offset)
225 				flags |= REG_NOTBOL;
226 			r = grep_search(&fg_pattern[i], l->dat + offset,
227 			    l->len - offset, &pmatch, flags);
228 			pmatch.rm_so += offset;
229 			pmatch.rm_eo += offset;
230 		} else {
231 			int flags = eflags;
232 			if (offset)
233 				flags |= REG_NOTBOL;
234 			pmatch.rm_so = offset;
235 			pmatch.rm_eo = l->len;
236 			r = regexec(&r_pattern[i], l->dat, 1, &pmatch, flags);
237 		}
238 		if (r == 0 && xflag) {
239 			if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
240 				r = REG_NOMATCH;
241 		}
242 		if (r == 0) {
243 			c = 1;
244 			if (oflag && pmatch.rm_so != pmatch.rm_eo)
245 				goto print;
246 			break;
247 		}
248 	}
249 	if (oflag)
250 		return c;
251 print:
252 	if (vflag)
253 		c = !c;
254 
255 	/* Count the matches if there is a match limit (but only once). */
256 	if (mflag && !counted) {
257 		mcount -= c;
258 		counted = 1;
259 	}
260 
261 	if (c && binbehave == BIN_FILE_BIN && nottext)
262 		return c; /* Binary file */
263 
264 	if ((tail > 0 || c) && !cflag && !qflag) {
265 		if (c) {
266 			if (first > 0 && tail == 0 && (Aflag || (Bflag &&
267 			    Bflag < linesqueued)))
268 				printf("--\n");
269 			first = 1;
270 			tail = Aflag;
271 			if (Bflag > 0)
272 				printqueue();
273 			linesqueued = 0;
274 			printline(l, ':', oflag ? &pmatch : NULL);
275 		} else {
276 			printline(l, '-', oflag ? &pmatch : NULL);
277 			tail--;
278 		}
279 	}
280 	if (oflag && !matchall) {
281 		offset = pmatch.rm_eo;
282 		goto redo;
283 	}
284 	return c;
285 }
286 
287 #ifndef SMALL
288 void
289 fgrepcomp(fastgrep_t *fg, const unsigned char *pattern)
290 {
291 	int i;
292 
293 	/* Initialize. */
294 	fg->patternLen = strlen(pattern);
295 	fg->bol = 0;
296 	fg->eol = 0;
297 	fg->wmatch = wflag;
298 	fg->reversedSearch = 0;
299 
300 	/*
301 	 * Make a copy and upper case it for later if in -i mode,
302 	 * else just copy the pointer.
303 	 */
304 	if (iflag) {
305 		fg->pattern = grep_malloc(fg->patternLen + 1);
306 		for (i = 0; i < fg->patternLen; i++)
307 			fg->pattern[i] = toupper(pattern[i]);
308 		fg->pattern[fg->patternLen] = '\0';
309 	} else
310 		fg->pattern = (unsigned char *)pattern;	/* really const */
311 
312 	/* Preprocess pattern. */
313 	for (i = 0; i <= UCHAR_MAX; i++)
314 		fg->qsBc[i] = fg->patternLen;
315 	for (i = 1; i < fg->patternLen; i++) {
316 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
317 		/*
318 		 * If case is ignored, make the jump apply to both upper and
319 		 * lower cased characters.  As the pattern is stored in upper
320 		 * case, apply the same to the lower case equivalents.
321 		 */
322 		if (iflag)
323 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
324 	}
325 }
326 #endif
327 
328 /*
329  * Returns: -1 on failure, 0 on success
330  */
331 int
332 fastcomp(fastgrep_t *fg, const char *pattern)
333 {
334 #ifdef SMALL
335 	return -1;
336 #else
337 	int i;
338 	int bol = 0;
339 	int eol = 0;
340 	int shiftPatternLen;
341 	int hasDot = 0;
342 	int firstHalfDot = -1;
343 	int firstLastHalfDot = -1;
344 	int lastHalfDot = 0;
345 
346 	/* Initialize. */
347 	fg->patternLen = strlen(pattern);
348 	fg->bol = 0;
349 	fg->eol = 0;
350 	fg->wmatch = 0;
351 	fg->reversedSearch = 0;
352 
353 	/* Remove end-of-line character ('$'). */
354 	if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') {
355 		eol++;
356 		fg->eol = 1;
357 		fg->patternLen--;
358 	}
359 
360 	/* Remove beginning-of-line character ('^'). */
361 	if (pattern[0] == '^') {
362 		bol++;
363 		fg->bol = 1;
364 		fg->patternLen--;
365 	}
366 
367 	/* Remove enclosing [[:<:]] and [[:>:]] (word match). */
368 	if (wflag) {
369 		/* basic re's use \( \), extended re's ( ) */
370 		int extra = Eflag ? 1 : 2;
371 		fg->patternLen -= 14 + 2 * extra;
372 		fg->wmatch = 7 + extra;
373 	} else if (fg->patternLen >= 14 &&
374 	    strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 &&
375 	    strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
376 		fg->patternLen -= 14;
377 		fg->wmatch = 7;
378 	}
379 
380 	/*
381 	 * Copy pattern minus '^' and '$' characters as well as word
382 	 * match character classes at the beginning and ending of the
383 	 * string respectively.
384 	 */
385 	fg->pattern = grep_malloc(fg->patternLen + 1);
386 	memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen);
387 	fg->pattern[fg->patternLen] = '\0';
388 
389 	/* Look for ways to cheat...er...avoid the full regex engine. */
390 	for (i = 0; i < fg->patternLen; i++)
391 	{
392 		switch (fg->pattern[i]) {
393 		case '.':
394 			hasDot = i;
395 			if (i < fg->patternLen / 2) {
396 				if (firstHalfDot < 0)
397 					/* Closest dot to the beginning */
398 					firstHalfDot = i;
399 			} else {
400 				/* Closest dot to the end of the pattern. */
401 				lastHalfDot = i;
402 				if (firstLastHalfDot < 0)
403 					firstLastHalfDot = i;
404 			}
405 			break;
406 		case '(': case ')':
407 		case '{': case '}':
408 			/* Special in BRE if preceded by '\\' */
409 		case '?':
410 		case '+':
411 		case '|':
412 			/* Not special in BRE. */
413 			if (!Eflag)
414 				goto nonspecial;
415 		case '\\':
416 		case '*':
417 		case '[': case ']':
418 			/* Free memory and let others know this is empty. */
419 			free(fg->pattern);
420 			fg->pattern = NULL;
421 			return (-1);
422 		default:
423 nonspecial:
424 			if (iflag)
425 				fg->pattern[i] = toupper(fg->pattern[i]);
426 			break;
427 		}
428 	}
429 
430 	/*
431 	 * Determine if a reverse search would be faster based on the placement
432 	 * of the dots.
433 	 */
434 	if ((!(lflag || cflag || oflag)) && ((!(bol || eol)) &&
435 	    ((lastHalfDot) && ((firstHalfDot < 0) ||
436 	    ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
437 		fg->reversedSearch = 1;
438 		hasDot = fg->patternLen - (firstHalfDot < 0 ?
439 		    firstLastHalfDot : firstHalfDot) - 1;
440 		grep_revstr(fg->pattern, fg->patternLen);
441 	}
442 
443 	/*
444 	 * Normal Quick Search would require a shift based on the position the
445 	 * next character after the comparison is within the pattern.  With
446 	 * wildcards, the position of the last dot effects the maximum shift
447 	 * distance.
448 	 * The closer to the end the wild card is the slower the search.  A
449 	 * reverse version of this algorithm would be useful for wildcards near
450 	 * the end of the string.
451 	 *
452 	 * Examples:
453 	 * Pattern	Max shift
454 	 * -------	---------
455 	 * this		5
456 	 * .his		4
457 	 * t.is		3
458 	 * th.s		2
459 	 * thi.		1
460 	 */
461 
462 	/* Adjust the shift based on location of the last dot ('.'). */
463 	shiftPatternLen = fg->patternLen - hasDot;
464 
465 	/* Preprocess pattern. */
466 	for (i = 0; i <= UCHAR_MAX; i++)
467 		fg->qsBc[i] = shiftPatternLen;
468 	for (i = hasDot + 1; i < fg->patternLen; i++) {
469 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
470 		/*
471 		 * If case is ignored, make the jump apply to both upper and
472 		 * lower cased characters.  As the pattern is stored in upper
473 		 * case, apply the same to the lower case equivalents.
474 		 */
475 		if (iflag)
476 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
477 	}
478 
479 	/*
480 	 * Put pattern back to normal after pre-processing to allow for easy
481 	 * comparisons later.
482 	 */
483 	if (fg->reversedSearch)
484 		grep_revstr(fg->pattern, fg->patternLen);
485 
486 	return (0);
487 #endif
488 }
489 
490 /*
491  * Word boundaries using regular expressions are defined as the point
492  * of transition from a non-word char to a word char, or vice versa.
493  * This means that grep -w +a and grep -w a+ never match anything,
494  * because they lack a starting or ending transition, but grep -w a+b
495  * does match a line containing a+b.
496  */
497 #define wmatch(d, l, s, e)	\
498 	((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \
499 	  e > s && isword(d[s]) && isword(d[e-1]))
500 
501 static int
502 grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch,
503     int flags)
504 {
505 #ifdef SMALL
506 	return 0;
507 #else
508 	regoff_t j;
509 	int rtrnVal = REG_NOMATCH;
510 
511 	pmatch->rm_so = -1;
512 	pmatch->rm_eo = -1;
513 
514 	/* No point in going farther if we do not have enough data. */
515 	if (dataLen < fg->patternLen)
516 		return (rtrnVal);
517 
518 	/* Only try once at the beginning or ending of the line. */
519 	if (fg->bol || fg->eol) {
520 		if (fg->bol && (flags & REG_NOTBOL))
521 			return 0;
522 		/* Simple text comparison. */
523 		/* Verify data is >= pattern length before searching on it. */
524 		if (dataLen >= fg->patternLen) {
525 			/* Determine where in data to start search at. */
526 			if (fg->eol)
527 				j = dataLen - fg->patternLen;
528 			else
529 				j = 0;
530 			if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
531 				if (grep_cmp(fg->pattern, data + j,
532 				    fg->patternLen)) {
533 					pmatch->rm_so = j;
534 					pmatch->rm_eo = j + fg->patternLen;
535 					if (!fg->wmatch || wmatch(data, dataLen,
536 					    pmatch->rm_so, pmatch->rm_eo))
537 						rtrnVal = 0;
538 				}
539 		}
540 	} else if (fg->reversedSearch) {
541 		/* Quick Search algorithm. */
542 		j = dataLen;
543 		do {
544 			if (grep_cmp(fg->pattern, data + j - fg->patternLen,
545 			    fg->patternLen)) {
546 				pmatch->rm_so = j - fg->patternLen;
547 				pmatch->rm_eo = j;
548 				if (!fg->wmatch || wmatch(data, dataLen,
549 				    pmatch->rm_so, pmatch->rm_eo)) {
550 					rtrnVal = 0;
551 					break;
552 				}
553 			}
554 			/* Shift if within bounds, otherwise, we are done. */
555 			if (j == fg->patternLen)
556 				break;
557 			j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]];
558 		} while (j >= fg->patternLen);
559 	} else {
560 		/* Quick Search algorithm. */
561 		j = 0;
562 		do {
563 			if (grep_cmp(fg->pattern, data + j, fg->patternLen)) {
564 				pmatch->rm_so = j;
565 				pmatch->rm_eo = j + fg->patternLen;
566 				if (fg->patternLen == 0 || !fg->wmatch ||
567 				    wmatch(data, dataLen, pmatch->rm_so,
568 				    pmatch->rm_eo)) {
569 					rtrnVal = 0;
570 					break;
571 				}
572 			}
573 
574 			/* Shift if within bounds, otherwise, we are done. */
575 			if (j + fg->patternLen == dataLen)
576 				break;
577 			else
578 				j += fg->qsBc[(unsigned char)data[j + fg->patternLen]];
579 		} while (j <= (dataLen - fg->patternLen));
580 	}
581 
582 	return (rtrnVal);
583 #endif
584 }
585 
586 
587 void *
588 grep_malloc(size_t size)
589 {
590 	void	*ptr;
591 
592 	if ((ptr = malloc(size)) == NULL)
593 		err(2, "malloc");
594 	return ptr;
595 }
596 
597 void *
598 grep_calloc(size_t nmemb, size_t size)
599 {
600 	void	*ptr;
601 
602 	if ((ptr = calloc(nmemb, size)) == NULL)
603 		err(2, "calloc");
604 	return ptr;
605 }
606 
607 void *
608 grep_realloc(void *ptr, size_t size)
609 {
610 	if ((ptr = realloc(ptr, size)) == NULL)
611 		err(2, "realloc");
612 	return ptr;
613 }
614 
615 void *
616 grep_reallocarray(void *ptr, size_t nmemb, size_t size)
617 {
618 	if ((ptr = reallocarray(ptr, nmemb, size)) == NULL)
619 		err(2, "reallocarray");
620 	return ptr;
621 }
622 
623 #ifndef SMALL
624 /*
625  * Returns:	true on success, false on failure
626  */
627 static bool
628 grep_cmp(const char *pattern, const char *data, size_t len)
629 {
630 	size_t i;
631 
632 	for (i = 0; i < len; i++) {
633 		if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.'))
634 		    || (iflag && pattern[i] == toupper((unsigned char)data[i])))
635 			continue;
636 		return false;
637 	}
638 
639 	return true;
640 }
641 
642 static void
643 grep_revstr(unsigned char *str, int len)
644 {
645 	int i;
646 	char c;
647 
648 	for (i = 0; i < len / 2; i++) {
649 		c = str[i];
650 		str[i] = str[len - i - 1];
651 		str[len - i - 1] = c;
652 	}
653 }
654 #endif
655 
656 void
657 printline(str_t *line, int sep, regmatch_t *pmatch)
658 {
659 	int n;
660 
661 	n = 0;
662 	if (!hflag) {
663 		fputs(line->file, stdout);
664 		if (nullflag)
665 			putchar(0);
666 		else
667 			++n;
668 	}
669 	if (nflag) {
670 		if (n)
671 			putchar(sep);
672 		printf("%lld", line->line_no);
673 		++n;
674 	}
675 	if (bflag) {
676 		if (n)
677 			putchar(sep);
678 		printf("%lld", (long long)line->off +
679 		    (pmatch ? pmatch->rm_so : 0));
680 		++n;
681 	}
682 	if (n)
683 		putchar(sep);
684 	if (pmatch)
685 		fwrite(line->dat + pmatch->rm_so,
686 		    pmatch->rm_eo - pmatch->rm_so, 1, stdout);
687 	else
688 		fwrite(line->dat, line->len, 1, stdout);
689 	putchar('\n');
690 }
691