1 #include <ctype.h>
2 #include <stdarg.h>
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <string.h>
6 #include <sys/stat.h>
7
8 #include "config.h"
9 #include "util.h"
10
11 #ifdef _WIN32
12 #include <windows.h>
13 #define flockfile(x)
14 #define funlockfile(x)
15 #define getc_unlocked(x) getc(x)
16 #endif
17
18 #define CHECK_AND_RETURN(ptr) \
19 if (ptr == NULL) { \
20 die("Memory allocation failed."); \
21 } \
22 return ptr;
23
24 FILE *out_fd = NULL;
25 ag_stats stats;
ag_malloc(size_t size)26 void *ag_malloc(size_t size) {
27 void *ptr = malloc(size);
28 CHECK_AND_RETURN(ptr)
29 }
30
ag_realloc(void * ptr,size_t size)31 void *ag_realloc(void *ptr, size_t size) {
32 void *new_ptr = realloc(ptr, size);
33 CHECK_AND_RETURN(new_ptr)
34 }
35
ag_calloc(size_t count,size_t size)36 void *ag_calloc(size_t count, size_t size) {
37 void *ptr = calloc(count, size);
38 CHECK_AND_RETURN(ptr)
39 }
40
ag_strdup(const char * s)41 char *ag_strdup(const char *s) {
42 char *str = strdup(s);
43 CHECK_AND_RETURN(str)
44 }
45
ag_strndup(const char * s,size_t size)46 char *ag_strndup(const char *s, size_t size) {
47 char *str = NULL;
48 #ifdef HAVE_STRNDUP
49 str = strndup(s, size);
50 CHECK_AND_RETURN(str)
51 #else
52 str = (char *)ag_malloc(size + 1);
53 strlcpy(str, s, size + 1);
54 return str;
55 #endif
56 }
57
free_strings(char ** strs,const size_t strs_len)58 void free_strings(char **strs, const size_t strs_len) {
59 if (strs == NULL) {
60 return;
61 }
62 size_t i;
63 for (i = 0; i < strs_len; i++) {
64 free(strs[i]);
65 }
66 free(strs);
67 }
68
generate_alpha_skip(const char * find,size_t f_len,size_t skip_lookup[],const int case_sensitive)69 void generate_alpha_skip(const char *find, size_t f_len, size_t skip_lookup[], const int case_sensitive) {
70 size_t i;
71
72 for (i = 0; i < 256; i++) {
73 skip_lookup[i] = f_len;
74 }
75
76 f_len--;
77
78 for (i = 0; i < f_len; i++) {
79 if (case_sensitive) {
80 skip_lookup[(unsigned char)find[i]] = f_len - i;
81 } else {
82 skip_lookup[(unsigned char)tolower(find[i])] = f_len - i;
83 skip_lookup[(unsigned char)toupper(find[i])] = f_len - i;
84 }
85 }
86 }
87
is_prefix(const char * s,const size_t s_len,const size_t pos,const int case_sensitive)88 int is_prefix(const char *s, const size_t s_len, const size_t pos, const int case_sensitive) {
89 size_t i;
90
91 for (i = 0; pos + i < s_len; i++) {
92 if (case_sensitive) {
93 if (s[i] != s[i + pos]) {
94 return 0;
95 }
96 } else {
97 if (tolower(s[i]) != tolower(s[i + pos])) {
98 return 0;
99 }
100 }
101 }
102
103 return 1;
104 }
105
suffix_len(const char * s,const size_t s_len,const size_t pos,const int case_sensitive)106 size_t suffix_len(const char *s, const size_t s_len, const size_t pos, const int case_sensitive) {
107 size_t i;
108
109 for (i = 0; i < pos; i++) {
110 if (case_sensitive) {
111 if (s[pos - i] != s[s_len - i - 1]) {
112 break;
113 }
114 } else {
115 if (tolower(s[pos - i]) != tolower(s[s_len - i - 1])) {
116 break;
117 }
118 }
119 }
120
121 return i;
122 }
123
generate_find_skip(const char * find,const size_t f_len,size_t ** skip_lookup,const int case_sensitive)124 void generate_find_skip(const char *find, const size_t f_len, size_t **skip_lookup, const int case_sensitive) {
125 size_t i;
126 size_t s_len;
127 size_t *sl = ag_malloc(f_len * sizeof(size_t));
128 *skip_lookup = sl;
129 size_t last_prefix = f_len;
130
131 for (i = last_prefix; i > 0; i--) {
132 if (is_prefix(find, f_len, i, case_sensitive)) {
133 last_prefix = i;
134 }
135 sl[i - 1] = last_prefix + (f_len - i);
136 }
137
138 for (i = 0; i < f_len; i++) {
139 s_len = suffix_len(find, f_len, i, case_sensitive);
140 if (find[i - s_len] != find[f_len - 1 - s_len]) {
141 sl[f_len - 1 - s_len] = f_len - 1 - i + s_len;
142 }
143 }
144 }
145
ag_max(size_t a,size_t b)146 size_t ag_max(size_t a, size_t b) {
147 if (b > a) {
148 return b;
149 }
150 return a;
151 }
152
ag_min(size_t a,size_t b)153 size_t ag_min(size_t a, size_t b) {
154 if (b < a) {
155 return b;
156 }
157 return a;
158 }
159
generate_hash(const char * find,const size_t f_len,uint8_t * h_table,const int case_sensitive)160 void generate_hash(const char *find, const size_t f_len, uint8_t *h_table, const int case_sensitive) {
161 int i;
162 for (i = f_len - sizeof(uint16_t); i >= 0; i--) {
163 // Add all 2^sizeof(uint16_t) combinations of capital letters to the hash table
164 int caps_set;
165 for (caps_set = 0; caps_set < (1 << sizeof(uint16_t)); caps_set++) {
166 word_t word;
167 memcpy(&word.as_chars, find + i, sizeof(uint16_t));
168 int cap_index;
169 // Capitalize the letters whose corresponding bits in caps_set are 1
170 for (cap_index = 0; caps_set >> cap_index; cap_index++) {
171 if ((caps_set >> cap_index) & 1)
172 word.as_chars[cap_index] -= 'a' - 'A';
173 }
174 size_t h;
175 // Find next free cell
176 for (h = word.as_word % H_SIZE; h_table[h]; h = (h + 1) % H_SIZE)
177 ;
178 h_table[h] = i + 1;
179 // Don't add capital letters if case sensitive
180 if (case_sensitive)
181 break;
182 }
183 }
184 }
185
186 /* Boyer-Moore strstr */
boyer_moore_strnstr(const char * s,const char * find,const size_t s_len,const size_t f_len,const size_t alpha_skip_lookup[],const size_t * find_skip_lookup,const int case_insensitive)187 const char *boyer_moore_strnstr(const char *s, const char *find, const size_t s_len, const size_t f_len,
188 const size_t alpha_skip_lookup[], const size_t *find_skip_lookup, const int case_insensitive) {
189 ssize_t i;
190 size_t pos = f_len - 1;
191
192 while (pos < s_len) {
193 for (i = f_len - 1; i >= 0 && (case_insensitive ? tolower(s[pos]) : s[pos]) == find[i]; pos--, i--) {
194 }
195 if (i < 0) {
196 return s + pos + 1;
197 }
198 pos += ag_max(alpha_skip_lookup[(unsigned char)s[pos]], find_skip_lookup[i]);
199 }
200
201 return NULL;
202 }
203
204 // Clang's -fsanitize=alignment (included in -fsanitize=undefined) will flag
205 // the intentional unaligned access here, so suppress it for this function
hash_strnstr(const char * s,const char * find,const size_t s_len,const size_t f_len,uint8_t * h_table,const int case_sensitive)206 NO_SANITIZE_ALIGNMENT const char *hash_strnstr(const char *s, const char *find, const size_t s_len, const size_t f_len, uint8_t *h_table, const int case_sensitive) {
207 if (s_len < f_len)
208 return NULL;
209
210 // Step through s
211 const size_t step = f_len - sizeof(uint16_t) + 1;
212 size_t s_i = f_len - sizeof(uint16_t);
213 for (; s_i <= s_len - f_len; s_i += step) {
214 size_t h;
215 for (h = *(const uint16_t *)(s + s_i) % H_SIZE; h_table[h]; h = (h + 1) % H_SIZE) {
216 const char *R = s + s_i - (h_table[h] - 1);
217 size_t i;
218 // Check putative match
219 for (i = 0; i < f_len; i++) {
220 if ((case_sensitive ? R[i] : tolower(R[i])) != find[i])
221 goto next_hash_cell;
222 }
223 return R; // Found
224 next_hash_cell:;
225 }
226 }
227 // Check tail
228 for (s_i = s_i - step + 1; s_i <= s_len - f_len; s_i++) {
229 size_t i;
230 const char *R = s + s_i;
231 for (i = 0; i < f_len; i++) {
232 char s_c = case_sensitive ? R[i] : tolower(R[i]);
233 if (s_c != find[i])
234 goto next_start;
235 }
236 return R;
237 next_start:;
238 }
239 return NULL;
240 }
241
invert_matches(const char * buf,const size_t buf_len,match_t matches[],size_t matches_len)242 size_t invert_matches(const char *buf, const size_t buf_len, match_t matches[], size_t matches_len) {
243 size_t i;
244 size_t match_read_index = 0;
245 size_t inverted_match_count = 0;
246 size_t inverted_match_start = 0;
247 size_t last_line_end = 0;
248 int in_inverted_match = TRUE;
249 match_t next_match;
250
251 log_debug("Inverting %u matches.", matches_len);
252
253 if (matches_len > 0) {
254 next_match = matches[0];
255 } else {
256 next_match.start = buf_len + 1;
257 }
258
259 /* No matches, so the whole buffer is now a match. */
260 if (matches_len == 0) {
261 matches[0].start = 0;
262 matches[0].end = buf_len - 1;
263 return 1;
264 }
265
266 for (i = 0; i < buf_len; i++) {
267 if (i == next_match.start) {
268 i = next_match.end - 1;
269
270 match_read_index++;
271
272 if (match_read_index < matches_len) {
273 next_match = matches[match_read_index];
274 }
275
276 if (in_inverted_match && last_line_end > inverted_match_start) {
277 matches[inverted_match_count].start = inverted_match_start;
278 matches[inverted_match_count].end = last_line_end - 1;
279
280 inverted_match_count++;
281 }
282
283 in_inverted_match = FALSE;
284 } else if (i == buf_len - 1 && in_inverted_match) {
285 matches[inverted_match_count].start = inverted_match_start;
286 matches[inverted_match_count].end = i;
287
288 inverted_match_count++;
289 } else if (buf[i] == '\n') {
290 last_line_end = i + 1;
291
292 if (!in_inverted_match) {
293 inverted_match_start = last_line_end;
294 }
295
296 in_inverted_match = TRUE;
297 }
298 }
299
300 for (i = 0; i < matches_len; i++) {
301 log_debug("Inverted match %i start %i end %i.", i, matches[i].start, matches[i].end);
302 }
303
304 return inverted_match_count;
305 }
306
realloc_matches(match_t ** matches,size_t * matches_size,size_t matches_len)307 void realloc_matches(match_t **matches, size_t *matches_size, size_t matches_len) {
308 if (matches_len < *matches_size) {
309 return;
310 }
311 /* TODO: benchmark initial size of matches. 100 may be too small/big */
312 *matches_size = *matches ? *matches_size * 2 : 100;
313 *matches = ag_realloc(*matches, *matches_size * sizeof(match_t));
314 }
315
compile_study(pcre ** re,pcre_extra ** re_extra,char * q,const int pcre_opts,const int study_opts)316 void compile_study(pcre **re, pcre_extra **re_extra, char *q, const int pcre_opts, const int study_opts) {
317 const char *pcre_err = NULL;
318 int pcre_err_offset = 0;
319
320 *re = pcre_compile(q, pcre_opts, &pcre_err, &pcre_err_offset, NULL);
321 if (*re == NULL) {
322 die("Bad regex! pcre_compile() failed at position %i: %s\nIf you meant to search for a literal string, run ag with -Q",
323 pcre_err_offset,
324 pcre_err);
325 }
326 *re_extra = pcre_study(*re, study_opts, &pcre_err);
327 if (*re_extra == NULL) {
328 log_debug("pcre_study returned nothing useful. Error: %s", pcre_err);
329 }
330 }
331
332 /* This function is very hot. It's called on every file. */
is_binary(const void * buf,const size_t buf_len)333 int is_binary(const void *buf, const size_t buf_len) {
334 size_t suspicious_bytes = 0;
335 size_t total_bytes = buf_len > 512 ? 512 : buf_len;
336 const unsigned char *buf_c = buf;
337 size_t i;
338
339 if (buf_len == 0) {
340 /* Is an empty file binary? Is it text? */
341 return 0;
342 }
343
344 if (buf_len >= 3 && buf_c[0] == 0xEF && buf_c[1] == 0xBB && buf_c[2] == 0xBF) {
345 /* UTF-8 BOM. This isn't binary. */
346 return 0;
347 }
348
349 if (buf_len >= 5 && strncmp(buf, "%PDF-", 5) == 0) {
350 /* PDF. This is binary. */
351 return 1;
352 }
353
354 for (i = 0; i < total_bytes; i++) {
355 if (buf_c[i] == '\0') {
356 /* NULL char. It's binary */
357 return 1;
358 } else if ((buf_c[i] < 7 || buf_c[i] > 14) && (buf_c[i] < 32 || buf_c[i] > 127)) {
359 /* UTF-8 detection */
360 if (buf_c[i] > 193 && buf_c[i] < 224 && i + 1 < total_bytes) {
361 i++;
362 if (buf_c[i] > 127 && buf_c[i] < 192) {
363 continue;
364 }
365 } else if (buf_c[i] > 223 && buf_c[i] < 240 && i + 2 < total_bytes) {
366 i++;
367 if (buf_c[i] > 127 && buf_c[i] < 192 && buf_c[i + 1] > 127 && buf_c[i + 1] < 192) {
368 i++;
369 continue;
370 }
371 }
372 suspicious_bytes++;
373 /* Disk IO is so slow that it's worthwhile to do this calculation after every suspicious byte. */
374 /* This is true even on a 1.6Ghz Atom with an Intel 320 SSD. */
375 /* Read at least 32 bytes before making a decision */
376 if (i >= 32 && (suspicious_bytes * 100) / total_bytes > 10) {
377 return 1;
378 }
379 }
380 }
381 if ((suspicious_bytes * 100) / total_bytes > 10) {
382 return 1;
383 }
384
385 return 0;
386 }
387
is_regex(const char * query)388 int is_regex(const char *query) {
389 char regex_chars[] = {
390 '$',
391 '(',
392 ')',
393 '*',
394 '+',
395 '.',
396 '?',
397 '[',
398 '\\',
399 '^',
400 '{',
401 '|',
402 '\0'
403 };
404
405 return (strpbrk(query, regex_chars) != NULL);
406 }
407
is_fnmatch(const char * filename)408 int is_fnmatch(const char *filename) {
409 char fnmatch_chars[] = {
410 '!',
411 '*',
412 '?',
413 '[',
414 ']',
415 '\0'
416 };
417
418 return (strpbrk(filename, fnmatch_chars) != NULL);
419 }
420
binary_search(const char * needle,char ** haystack,int start,int end)421 int binary_search(const char *needle, char **haystack, int start, int end) {
422 int mid;
423 int rc;
424
425 if (start == end) {
426 return -1;
427 }
428
429 mid = start + ((end - start) / 2);
430
431 rc = strcmp(needle, haystack[mid]);
432 if (rc < 0) {
433 return binary_search(needle, haystack, start, mid);
434 } else if (rc > 0) {
435 return binary_search(needle, haystack, mid + 1, end);
436 }
437
438 return mid;
439 }
440
441 static int wordchar_table[256];
442
init_wordchar_table(void)443 void init_wordchar_table(void) {
444 int i;
445 for (i = 0; i < 256; ++i) {
446 char ch = (char)i;
447 wordchar_table[i] =
448 ('a' <= ch && ch <= 'z') ||
449 ('A' <= ch && ch <= 'Z') ||
450 ('0' <= ch && ch <= '9') ||
451 ch == '_';
452 }
453 }
454
is_wordchar(char ch)455 int is_wordchar(char ch) {
456 return wordchar_table[(unsigned char)ch];
457 }
458
is_lowercase(const char * s)459 int is_lowercase(const char *s) {
460 int i;
461 for (i = 0; s[i] != '\0'; i++) {
462 if (!isascii(s[i]) || isupper(s[i])) {
463 return FALSE;
464 }
465 }
466 return TRUE;
467 }
468
is_directory(const char * path,const struct dirent * d)469 int is_directory(const char *path, const struct dirent *d) {
470 #ifdef HAVE_DIRENT_DTYPE
471 /* Some filesystems, e.g. ReiserFS, always return a type DT_UNKNOWN from readdir or scandir. */
472 /* Call stat if we don't find DT_DIR to get the information we need. */
473 /* Also works for symbolic links to directories. */
474 if (d->d_type != DT_UNKNOWN && d->d_type != DT_LNK) {
475 return d->d_type == DT_DIR;
476 }
477 #endif
478 char *full_path;
479 struct stat s;
480 ag_asprintf(&full_path, "%s/%s", path, d->d_name);
481 if (stat(full_path, &s) != 0) {
482 free(full_path);
483 return FALSE;
484 }
485 #ifdef _WIN32
486 int is_dir = GetFileAttributesA(full_path) & FILE_ATTRIBUTE_DIRECTORY;
487 #else
488 int is_dir = S_ISDIR(s.st_mode);
489 #endif
490 free(full_path);
491 return is_dir;
492 }
493
is_symlink(const char * path,const struct dirent * d)494 int is_symlink(const char *path, const struct dirent *d) {
495 #ifdef _WIN32
496 char full_path[MAX_PATH + 1] = { 0 };
497 sprintf(full_path, "%s\\%s", path, d->d_name);
498 return (GetFileAttributesA(full_path) & FILE_ATTRIBUTE_REPARSE_POINT);
499 #else
500 #ifdef HAVE_DIRENT_DTYPE
501 /* Some filesystems, e.g. ReiserFS, always return a type DT_UNKNOWN from readdir or scandir. */
502 /* Call lstat if we find DT_UNKNOWN to get the information we need. */
503 if (d->d_type != DT_UNKNOWN) {
504 return (d->d_type == DT_LNK);
505 }
506 #endif
507 char *full_path;
508 struct stat s;
509 ag_asprintf(&full_path, "%s/%s", path, d->d_name);
510 if (lstat(full_path, &s) != 0) {
511 free(full_path);
512 return FALSE;
513 }
514 free(full_path);
515 return S_ISLNK(s.st_mode);
516 #endif
517 }
518
is_named_pipe(const char * path,const struct dirent * d)519 int is_named_pipe(const char *path, const struct dirent *d) {
520 #ifdef HAVE_DIRENT_DTYPE
521 if (d->d_type != DT_UNKNOWN) {
522 return d->d_type == DT_FIFO || d->d_type == DT_SOCK;
523 }
524 #endif
525 char *full_path;
526 struct stat s;
527 ag_asprintf(&full_path, "%s/%s", path, d->d_name);
528 if (stat(full_path, &s) != 0) {
529 free(full_path);
530 return FALSE;
531 }
532 free(full_path);
533 return S_ISFIFO(s.st_mode)
534 #ifdef S_ISSOCK
535 || S_ISSOCK(s.st_mode)
536 #endif
537 ;
538 }
539
ag_asprintf(char ** ret,const char * fmt,...)540 void ag_asprintf(char **ret, const char *fmt, ...) {
541 va_list args;
542 va_start(args, fmt);
543 if (vasprintf(ret, fmt, args) == -1) {
544 die("vasprintf returned -1");
545 }
546 va_end(args);
547 }
548
die(const char * fmt,...)549 void die(const char *fmt, ...) {
550 va_list args;
551 va_start(args, fmt);
552 vplog(LOG_LEVEL_ERR, fmt, args);
553 va_end(args);
554 exit(2);
555 }
556
557 #ifndef HAVE_FGETLN
fgetln(FILE * fp,size_t * lenp)558 char *fgetln(FILE *fp, size_t *lenp) {
559 char *buf = NULL;
560 int c, used = 0, len = 0;
561
562 flockfile(fp);
563 while ((c = getc_unlocked(fp)) != EOF) {
564 if (!buf || len >= used) {
565 size_t nsize;
566 char *newbuf;
567 nsize = used + BUFSIZ;
568 if (!(newbuf = realloc(buf, nsize))) {
569 funlockfile(fp);
570 if (buf)
571 free(buf);
572 return NULL;
573 }
574 buf = newbuf;
575 used = nsize;
576 }
577 buf[len++] = c;
578 if (c == '\n') {
579 break;
580 }
581 }
582 funlockfile(fp);
583 *lenp = len;
584 return buf;
585 }
586 #endif
587
588 #ifndef HAVE_GETLINE
589 /*
590 * Do it yourself getline() implementation
591 */
getline(char ** lineptr,size_t * n,FILE * stream)592 ssize_t getline(char **lineptr, size_t *n, FILE *stream) {
593 size_t len = 0;
594 char *srcln = NULL;
595 char *newlnptr = NULL;
596
597 /* get line, bail on error */
598 if (!(srcln = fgetln(stream, &len))) {
599 return -1;
600 }
601
602 if (len >= *n) {
603 /* line is too big for buffer, must realloc */
604 /* double the buffer, bail on error */
605 if (!(newlnptr = realloc(*lineptr, len * 2))) {
606 return -1;
607 }
608 *lineptr = newlnptr;
609 *n = len * 2;
610 }
611
612 memcpy(*lineptr, srcln, len);
613
614 #ifndef HAVE_FGETLN
615 /* Our own implementation of fgetln() returns a malloc()d buffer that we
616 * must free
617 */
618 free(srcln);
619 #endif
620
621 (*lineptr)[len] = '\0';
622 return len;
623 }
624 #endif
625
buf_getline(const char ** line,const char * buf,const size_t buf_len,const size_t buf_offset)626 ssize_t buf_getline(const char **line, const char *buf, const size_t buf_len, const size_t buf_offset) {
627 const char *cur = buf + buf_offset;
628 ssize_t i;
629 for (i = 0; (buf_offset + i < buf_len) && cur[i] != '\n'; i++) {
630 }
631 *line = cur;
632 return i;
633 }
634
635 #ifndef HAVE_REALPATH
636 /*
637 * realpath() for Windows. Turns slashes into backslashes and calls _fullpath
638 */
realpath(const char * path,char * resolved_path)639 char *realpath(const char *path, char *resolved_path) {
640 char *p;
641 char tmp[_MAX_PATH + 1];
642 strlcpy(tmp, path, sizeof(tmp));
643 p = tmp;
644 while (*p) {
645 if (*p == '/') {
646 *p = '\\';
647 }
648 p++;
649 }
650 return _fullpath(resolved_path, tmp, _MAX_PATH);
651 }
652 #endif
653
654 #ifndef HAVE_STRLCPY
strlcpy(char * dst,const char * src,size_t size)655 size_t strlcpy(char *dst, const char *src, size_t size) {
656 char *d = dst;
657 const char *s = src;
658 size_t n = size;
659
660 /* Copy as many bytes as will fit */
661 if (n != 0) {
662 while (--n != 0) {
663 if ((*d++ = *s++) == '\0') {
664 break;
665 }
666 }
667 }
668
669 /* Not enough room in dst, add NUL and traverse rest of src */
670 if (n == 0) {
671 if (size != 0) {
672 *d = '\0'; /* NUL-terminate dst */
673 }
674
675 while (*s++) {
676 }
677 }
678
679 return (s - src - 1); /* count does not include NUL */
680 }
681 #endif
682
683 #ifndef HAVE_VASPRINTF
vasprintf(char ** ret,const char * fmt,va_list args)684 int vasprintf(char **ret, const char *fmt, va_list args) {
685 int rv;
686 *ret = NULL;
687 va_list args2;
688 /* vsnprintf can destroy args, so we need to copy it for the second call */
689 #ifdef __va_copy
690 /* non-standard macro, but usually exists */
691 __va_copy(args2, args);
692 #elif va_copy
693 /* C99 macro. We compile with -std=c89 but you never know */
694 va_copy(args2, args);
695 #else
696 /* Ancient compiler. This usually works but there are no guarantees. */
697 memcpy(args2, args, sizeof(va_list));
698 #endif
699 rv = vsnprintf(NULL, 0, fmt, args);
700 va_end(args);
701 if (rv < 0) {
702 return rv;
703 }
704 *ret = malloc(++rv); /* vsnprintf doesn't count \0 */
705 if (*ret == NULL) {
706 return -1;
707 }
708 rv = vsnprintf(*ret, rv, fmt, args2);
709 va_end(args2);
710 if (rv < 0) {
711 free(*ret);
712 }
713 return rv;
714 }
715 #endif
716