1 /******************************************************************************\
2 * Copyright (c) 2019, Robert van Engelen, Genivia Inc. All rights reserved. *
3 * *
4 * Redistribution and use in source and binary forms, with or without *
5 * modification, are permitted provided that the following conditions are met: *
6 * *
7 * (1) Redistributions of source code must retain the above copyright notice, *
8 * this list of conditions and the following disclaimer. *
9 * *
10 * (2) Redistributions in binary form must reproduce the above copyright *
11 * notice, this list of conditions and the following disclaimer in the *
12 * documentation and/or other materials provided with the distribution. *
13 * *
14 * (3) The name of the author may not be used to endorse or promote products *
15 * derived from this software without specific prior written permission. *
16 * *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED *
18 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF *
19 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO *
20 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, *
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; *
23 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, *
24 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR *
25 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF *
26 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
27 \******************************************************************************/
28
29 /**
30 @file ugrep.cpp
31 @brief a file search utility like grep
32 @author Robert van Engelen - engelen@genivia.com
33 @copyright (c) 2019-2019, Robert van Engelen, Genivia Inc. All rights reserved.
34 @copyright (c) BSD-3 License - see LICENSE.txt
35
36 Find patterns in files encoded in UTF-8/16/32, ASCII, ISO-8859-1, EBCDIC, code
37 pages 437, 850, 1250 to 1258, and other file formats.
38
39 For the latest fully-featured high-performance version of ugrep, please visit:
40
41 https://github.com/Genivia/ugrep
42
43 This simple version features:
44
45 - Searches the specified files only, no directory recursion.
46 - Patterns are ERE POSIX syntax compliant, extended with RE/flex pattern syntax.
47 - Unicode support for \p{} character categories, bracket list classes, etc.
48 - File encoding support for UTF-8/16/32, EBCDIC, and many other code pages.
49 - ugrep command-line options are the same as grep, simulates grep behavior.
50
51 Examples:
52
53 # display the lines in places.txt that contain capitalized Unicode words
54 ugrep '\p{Upper}\p{Lower}*' places.txt
55
56 # display the lines in places.txt with capitalized Unicode words color-highlighted
57 ugrep --color=auto '\p{Upper}\p{Lower}*' places.txt
58
59 # list all capitalized Unicode words in places.txt
60 ugrep -o '\p{Upper}\p{Lower}*' places.txt
61
62 # list all laughing face emojis (Unicode code points U+1F600 to U+1F60F) in birthday.txt
ugrep -o '[😀-😏]' birthday.txt
64
65 # list all laughing face emojis (Unicode code points U+1F600 to U+1F60F) in birthday.txt
66 ugrep -o '[\x{1F600}-\x{1F60F}]' birthday.txt
67
68 # display lines containing the names Gödel (or Goedel), Escher, or Bach in GEB.txt and wiki.txt
69 ugrep 'G(ö|oe)del|Escher|Bach' GEB.txt wiki.txt
70
71 # display lines that do not contain the names Gödel (or Goedel), Escher, or Bach in GEB.txt and wiki.txt
72 ugrep -v 'G(ö|oe)del|Escher|Bach' GEB.txt wiki.txt
73
74 # count the number of lines containing the names Gödel (or Goedel), Escher, or Bach in GEB.txt and wiki.txt
75 ugrep -c 'G(ö|oe)del|Escher|Bach' GEB.txt wiki.txt
76
77 # count the number of occurrences of the names Gödel (or Goedel), Escher, or Bach in GEB.txt and wiki.txt
78 ugrep -c -u 'G(ö|oe)del|Escher|Bach' GEB.txt wiki.txt
79
80 # check if some.txt file contains any non-ASCII (i.e. Unicode) characters
81 ugrep -q '[^[:ascii:]]' some.txt && echo "some.txt contains Unicode"
82
83 # display word-anchored 'lorem' in UTF-16 formatted file utf16lorem.txt that contains a UTF-16 BOM
84 ugrep -w -i 'lorem' utf16lorem.txt
85
86 # display word-anchored 'lorem' in UTF-16 formatted file utf16lorem.txt that does not contain a UTF-16 BOM
87 ugrep --file-format=UTF-16 -w -i 'lorem' utf16lorem.txt
88
89 # list the lines to fix in a C/C++ source file by looking for the word TODO while skipping any TODO in quoted strings by using a negative pattern `(?^X)' to ignore quoted strings:
90 ugrep -n -o -e 'TODO' -e '(?^"(\\.|\\\r?\n|[^\\\n"])*")' file.cpp
91
92 # check if 'main' is defined in a C/C++ source file, skipping the word 'main' in comments and strings:
93 ugrep -q -e '\<main\>' -e '(?^"(\\.|\\\r?\n|[^\\\n"])*"|//.*|/[*](.|\n)*?[*]/)' file.cpp
94
95 Compile:
96
97 c++ -std=c++11 -o ugrep ugrep.cpp -lreflex
98
99 */
100
101 #include <reflex/matcher.h>
102
103 // check if we are on a windows OS
104 #if defined(__WIN32__) || defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || defined(__MINGW32__) || defined(__MINGW64__) || defined(__BORLANDC__)
105 # define OS_WIN
106 #endif
107
108 // windows has no isatty()
109 #ifdef OS_WIN
110 #define isatty(fildes) ((fildes) == 1)
111 #else
112 #include <unistd.h>
113 #endif
114
115 // ugrep version
116 #define VERSION "1.0.0"
117
118 // ugrep platform -- see configure.ac
119 #if !defined(PLATFORM)
120 # if defined(OS_WIN)
121 # define PLATFORM "WIN"
122 # else
123 # define PLATFORM ""
124 # endif
125 #endif
126
127 // ugrep exit codes
128 #define EXIT_OK 0 // One or more lines were selected
129 #define EXIT_FAIL 1 // No lines were selected
130 #define EXIT_ERROR 2 // An error occurred
131
132 // GREP_COLOR environment variable
133 const char *grep_color = NULL;
134
135 // ugrep command-line options
136 bool flag_filename = false;
137 bool flag_no_filename = false;
138 bool flag_no_messages = false;
139 bool flag_byte_offset = false;
140 bool flag_count = false;
141 bool flag_fixed_strings = false;
142 bool flag_free_space = false;
143 bool flag_ignore_case = false;
144 bool flag_invert_match = false;
145 bool flag_column_number = false;
146 bool flag_line_number = false;
147 bool flag_line_buffered = false;
148 bool flag_only_matching = false;
149 bool flag_quiet = false;
150 bool flag_ungroup = false;
151 bool flag_word_regexp = false;
152 bool flag_line_regexp = false;
153 const char *flag_color = NULL;
154 const char *flag_file_format = NULL;
155 int flag_tabs = 8;
156
157 // function protos
158 bool ugrep(reflex::Pattern& pattern, FILE *file, reflex::Input::file_encoding_type encoding, const char *infile);
159 void help(const char *message = NULL, const char *arg = NULL);
160 void version();
161
162 // table of file formats for ugrep option --file-format
163 const struct { const char *format; reflex::Input::file_encoding_type encoding; } format_table[] = {
164 { "binary", reflex::Input::file_encoding::plain },
165 { "ASCII", reflex::Input::file_encoding::utf8 },
166 { "UTF-8", reflex::Input::file_encoding::utf8 },
167 { "UTF-16", reflex::Input::file_encoding::utf16be },
168 { "UTF-16BE", reflex::Input::file_encoding::utf16be },
169 { "UTF-16LE", reflex::Input::file_encoding::utf16le },
170 { "UTF-32", reflex::Input::file_encoding::utf32be },
171 { "UTF-32BE", reflex::Input::file_encoding::utf32be },
172 { "UTF-32LE", reflex::Input::file_encoding::utf32le },
173 { "ISO-8859-1", reflex::Input::file_encoding::latin },
174 { "ISO-8869-2", reflex::Input::file_encoding::iso8859_2 },
175 { "ISO-8869-3", reflex::Input::file_encoding::iso8859_3 },
176 { "ISO-8869-4", reflex::Input::file_encoding::iso8859_4 },
177 { "ISO-8869-5", reflex::Input::file_encoding::iso8859_5 },
178 { "ISO-8869-6", reflex::Input::file_encoding::iso8859_6 },
179 { "ISO-8869-7", reflex::Input::file_encoding::iso8859_7 },
180 { "ISO-8869-8", reflex::Input::file_encoding::iso8859_8 },
181 { "ISO-8869-9", reflex::Input::file_encoding::iso8859_9 },
182 { "ISO-8869-10", reflex::Input::file_encoding::iso8859_10 },
183 { "ISO-8869-11", reflex::Input::file_encoding::iso8859_11 },
184 { "ISO-8869-13", reflex::Input::file_encoding::iso8859_13 },
185 { "ISO-8869-14", reflex::Input::file_encoding::iso8859_14 },
186 { "ISO-8869-15", reflex::Input::file_encoding::iso8859_15 },
187 { "ISO-8869-16", reflex::Input::file_encoding::iso8859_16 },
188 { "MAC", reflex::Input::file_encoding::macroman },
189 { "MACROMAN", reflex::Input::file_encoding::macroman },
190 { "EBCDIC", reflex::Input::file_encoding::ebcdic },
191 { "CP437", reflex::Input::file_encoding::cp437 },
192 { "CP850", reflex::Input::file_encoding::cp850 },
193 { "CP858", reflex::Input::file_encoding::cp858 },
194 { "CP1250", reflex::Input::file_encoding::cp1250 },
195 { "CP1251", reflex::Input::file_encoding::cp1251 },
196 { "CP1252", reflex::Input::file_encoding::cp1252 },
197 { "CP1253", reflex::Input::file_encoding::cp1253 },
198 { "CP1254", reflex::Input::file_encoding::cp1254 },
199 { "CP1255", reflex::Input::file_encoding::cp1255 },
200 { "CP1256", reflex::Input::file_encoding::cp1256 },
201 { "CP1257", reflex::Input::file_encoding::cp1257 },
202 { "CP1258", reflex::Input::file_encoding::cp1258 },
203 { "KOI8-R", reflex::Input::file_encoding::koi8_r },
204 { "KOI8-U", reflex::Input::file_encoding::koi8_u },
205 { "KOI8-RU", reflex::Input::file_encoding::koi8_ru },
206 { NULL, 0 }
207 };
208
209 // ugrep main()
main(int argc,char ** argv)210 int main(int argc, char **argv)
211 {
212 std::string regex;
213 std::vector<const char*> infiles;
214
215 bool color_term = false;
216
217 #ifndef OS_WIN
218 // check whether we have a color terminal
219 const char *term = getenv("TERM");
220 color_term = term && (strstr(term, "ansi") || strstr(term, "xterm") || strstr(term, "color"));
221 grep_color = getenv("GREP_COLOR");
222 #endif
223
224 // parse ugrep command-line options and arguments
225 for (int i = 1; i < argc; ++i)
226 {
227 const char *arg = argv[i];
228
229 if (*arg == '-'
230 #ifdef OS_WIN
231 || *arg == '/'
232 #endif
233 )
234 {
235 bool is_grouped = true;
236
237 // parse a ugrep command-line option
238 while (is_grouped && *++arg)
239 {
240 switch (*arg)
241 {
242 case '-':
243 ++arg;
244 if (strcmp(arg, "byte-offset") == 0)
245 flag_byte_offset = true;
246 else if (strcmp(arg, "color") == 0 || strcmp(arg, "colour") == 0)
247 flag_color = "auto";
248 else if (strncmp(arg, "color=", 6) == 0)
249 flag_color = arg + 6;
250 else if (strncmp(arg, "colour=", 7) == 0)
251 flag_color = arg + 7;
252 else if (strcmp(arg, "column-number") == 0)
253 flag_column_number = true;
254 else if (strcmp(arg, "count") == 0)
255 flag_count = true;
256 else if (strcmp(arg, "extended-regexp") == 0)
257 ;
258 else if (strncmp(arg, "file-format=", 12) == 0)
259 flag_file_format = arg + 12;
260 else if (strcmp(arg, "fixed-strings") == 0)
261 flag_fixed_strings = true;
262 else if (strcmp(arg, "free-space") == 0)
263 flag_free_space = true;
264 else if (strcmp(arg, "help") == 0)
265 help();
266 else if (strcmp(arg, "ignore-case") == 0)
267 flag_ignore_case = true;
268 else if (strcmp(arg, "invert-match") == 0)
269 flag_invert_match = true;
270 else if (strcmp(arg, "line-number") == 0)
271 flag_line_number = true;
272 else if (strcmp(arg, "line-regexp") == 0)
273 flag_line_regexp = true;
274 else if (strcmp(arg, "no-filename") == 0)
275 flag_no_filename = true;
276 else if (strcmp(arg, "ungroup") == 0)
277 flag_ungroup = true;
278 else if (strcmp(arg, "no-messages") == 0)
279 flag_no_messages = true;
280 else if (strcmp(arg, "only-matching") == 0)
281 flag_only_matching = true;
282 else if (strcmp(arg, "quiet") == 0 || strcmp(arg, "silent") == 0)
283 flag_quiet = true;
284 else if (strncmp(arg, "regexp=", 7) == 0)
285 regex.append(arg + 7).push_back('|');
286 else if (strncmp(arg, "tabs=", 5) == 0)
287 flag_tabs = atoi(arg + 5);
288 else if (strcmp(arg, "version") == 0)
289 version();
290 else if (strcmp(arg, "word-regexp") == 0)
291 flag_word_regexp = true;
292 else
293 help("invalid option --", arg);
294 is_grouped = false;
295 break;
296
297 case 'b':
298 flag_byte_offset = true;
299 break;
300
301 case 'c':
302 flag_count = true;
303 break;
304
305 case 'E':
306 break;
307
308 case 'e':
309 ++arg;
310 if (*arg)
311 regex.append(&arg[*arg == '=']).push_back('|');
312 else if (++i < argc)
313 regex.append(argv[i]).push_back('|');
314 else
315 help("missing pattern for option -e");
316 is_grouped = false;
317 break;
318
319 case 'F':
320 flag_fixed_strings = true;
321 break;
322
323 case 'H':
324 flag_filename = true;
325 flag_no_filename = false;
326 break;
327
328 case 'h':
329 flag_filename = false;
330 flag_no_filename = true;
331 break;
332
333 case 'i':
334 flag_ignore_case = true;
335 break;
336
337 case 'k':
338 flag_column_number = true;
339 break;
340
341 case 'n':
342 flag_line_number = true;
343 break;
344
345 case 'o':
346 flag_only_matching = true;
347 break;
348
349 case 'q':
350 flag_quiet = true;
351 break;
352
353 case 's':
354 flag_no_messages = true;
355 break;
356
357 case 'u':
358 flag_ungroup = true;
359 break;
360
361 case 'V':
362 version();
363 break;
364
365 case 'v':
366 flag_invert_match = true;
367 break;
368
369 case 'w':
370 flag_word_regexp = true;
371 break;
372
373 case 'x':
374 flag_line_regexp = true;
375 break;
376
377 case '?':
378 help();
379 break;
380
381 default:
382 help("invalid option -", arg);
383 }
384 }
385 }
386 else
387 {
388 // parse a ugrep command-line argument
389 if (regex.empty())
390 {
391 // no regex pattern specified yet, so assign it to the regex string
392 regex.assign(arg).push_back('|');
393 }
394 else
395 {
396 // otherwise add the file argument to the list of files
397 infiles.push_back(arg);
398 }
399 }
400 }
401
402 // if no regex pattern was specified then exit
403 if (regex.empty())
404 help();
405
406 // remove the ending '|' from the |-concatenated regexes in the regex string
407 regex.pop_back();
408
409 if (regex.empty())
410 {
411 // if the specified regex is empty then it matches every line
412 regex.assign(".*");
413 }
414 else
415 {
416 // if -F --fixed-strings: make regex literal with \Q and \E
417 if (flag_fixed_strings)
418 regex.insert(0, "\\Q").append("\\E");
419
420 // if -w or -x: make the regex word- or line-anchored, respectively
421 if (flag_word_regexp)
422 regex.insert(0, "\\<(").append(")\\>");
423 else if (flag_line_regexp)
424 regex.insert(0, "^(").append(")$");
425 }
426
427 // if -v invert-match: options -u --ungroup and -o --only-matching options cannot be used
428 if (flag_invert_match)
429 {
430 flag_ungroup = false;
431 flag_only_matching = false;
432 }
433
434 // input is line-buffered if options -c --count -o --only-matching -q --quiet are not specified
435 if (!flag_count && !flag_only_matching && !flag_quiet)
436 flag_line_buffered = true;
437
438 // display file name if more than one input file is specified and option -h --no-filename is not specified
439 if (infiles.size() > 1 && !flag_no_filename)
440 flag_filename = true;
441
442 // (re)set grep_color depending on color_term, isatty(), and the ugrep --color option
443 if (!flag_color || strcmp(flag_color, "never") == 0)
444 {
445 grep_color = NULL;
446 }
447 else if (strcmp(flag_color, "always") == 0)
448 {
449 if (!grep_color)
450 grep_color = "1";
451 }
452 else if (strcmp(flag_color, "auto") == 0)
453 {
454 if (!color_term || !isatty(1))
455 grep_color = NULL;
456 else if (!grep_color)
457 grep_color = "1";
458 }
459 else
460 {
461 help("unknown --color=when value");
462 }
463
464 // if any match was found in any of the input files then we set found==true
465 bool found = false;
466
467 try
468 {
469 reflex::Input::file_encoding_type encoding = reflex::Input::file_encoding::plain;
470
471 // parse ugrep option --file-format=format
472 if (flag_file_format)
473 {
474 int i;
475
476 // scan the format_table[] for a matching format
477 for (i = 0; format_table[i].format != NULL; ++i)
478 if (strcmp(flag_file_format, format_table[i].format) == 0)
479 break;
480
481 if (format_table[i].format == NULL)
482 help("unknown --file-format=format encoding");
483
484 // encoding is the file format used by all input files, if no BOM is present
485 encoding = format_table[i].encoding;
486 }
487
488 std::string modifiers = "(?m";
489 if (flag_ignore_case)
490 modifiers.append("i");
491 if (flag_free_space)
492 modifiers.append("x");
493 modifiers.append(")");
494
495 std::string pattern_options;
496 if (flag_tabs)
497 {
498 if (flag_tabs == 1 || flag_tabs == 2 || flag_tabs == 4 || flag_tabs == 8)
499 pattern_options.assign("T=").push_back(flag_tabs + '0');
500 else
501 help("invalid value for option --tabs");
502 }
503
504 reflex::Pattern pattern(modifiers + reflex::Matcher::convert(regex, reflex::convert_flag::notnewline | reflex::convert_flag::unicode), pattern_options);
505
506 if (infiles.empty())
507 {
508 // read standard input to find pattern matches
509 found |= ugrep(pattern, stdin, encoding, "(standard input)");
510 }
511 else
512 {
513 // read each file to find pattern matches
514 for (auto infile : infiles)
515 {
516 FILE *file = fopen(infile, "r");
517
518 if (file == NULL)
519 {
520 if (flag_no_messages)
521 continue;
522
523 perror("Cannot open file for reading");
524 exit(EXIT_ERROR);
525 }
526
527 found |= ugrep(pattern, file, encoding, infile);
528
529 fclose(file);
530 }
531 }
532 }
533 catch (reflex::regex_error& error)
534 {
535 std::cerr << error.what();
536 exit(EXIT_ERROR);
537 }
538
539 exit(found ? EXIT_OK : EXIT_FAIL);
540 }
541
542 // Search file, display pattern matches, return true when pattern matched anywhere
ugrep(reflex::Pattern & pattern,FILE * file,reflex::Input::file_encoding_type encoding,const char * infile)543 bool ugrep(reflex::Pattern& pattern, FILE *file, reflex::Input::file_encoding_type encoding, const char *infile)
544 {
545 bool found = false;
546
547 std::string label, mark, unmark;
548
549 if (flag_filename && infile)
550 label.assign(infile).append(":");
551
552 if (grep_color)
553 {
554 mark.assign("\033[").append(grep_color).append("m");
555 unmark.assign("\033[0m");
556 }
557
558 // create an input object to read the file (or stdin) using the given file format encoding
559 reflex::Input input(file, encoding);
560
561 if (flag_quiet)
562 {
563 // -q quiet mode: report if a single pattern match was found in the input
564
565 found = reflex::Matcher(pattern, input).find();
566
567 if (flag_invert_match)
568 found = !found;
569 }
570 else if (flag_count)
571 {
572 // -c count mode: count the number of lines/patterns matched
573
574 if (flag_invert_match)
575 {
576 size_t lines = 0;
577 std::string line;
578
579 // -c count mode w/ -v: count the number of non-matching lines
580 while (input)
581 {
582 int ch;
583
584 // read the next line
585 line.clear();
586 while ((ch = input.get()) != EOF && ch != '\n')
587 line.push_back(ch);
588 if (ch == EOF && line.empty())
589 break;
590
591 // count this line if not matched
592 if (!reflex::Matcher(pattern, line).find())
593 {
594 found = true;
595 ++lines;
596 }
597 }
598
599 std::cout << label << lines << std::endl;
600 }
601 else if (flag_ungroup)
602 {
603 // -c count mode w/ -u: count the number of patterns matched in the file
604
605 reflex::Matcher matcher(pattern, input);
606 size_t matches = std::distance(matcher.find.begin(), matcher.find.end());
607
608 std::cout << label << matches << std::endl;
609 found = matches > 0;
610 }
611 else
612 {
613 // -c count mode w/o -u: count the number of matching lines
614
615 size_t lineno = 0;
616 size_t lines = 0;
617
618 reflex::Matcher matcher(pattern, input);
619 for (auto& match : matcher.find)
620 {
621 if (lineno != match.lineno())
622 {
623 lineno = match.lineno();
624 ++lines;
625 }
626 }
627
628 std::cout << label << lines << std::endl;
629 found = lines > 0;
630 }
631 }
632 else if (flag_line_buffered)
633 {
634 // line-buffered: display lines that matched the pattern
635
636 #if defined(WITH_SPAN)
637 if (!flag_ungroup && !flag_invert_match)
638 {
639 size_t lineno = 0;
640
641 reflex::Matcher matcher(pattern, input);
642 for (auto& match : matcher.find)
643 {
644 if (lineno != match.lineno())
645 {
646 lineno = match.lineno();
647 std::cout << label;
648 if (flag_line_number)
649 std::cout << match.lineno() << ":";
650 if (flag_column_number)
651 std::cout << match.columno() + 1 << ":";
652 if (flag_byte_offset)
653 std::cout << match.first() << ":";
654 std::cout << mark << match.span() << unmark << std::endl;
655 found = true;
656 }
657 }
658 }
659 else
660 #endif
661 {
662 size_t byte_offset = 0;
663 size_t lineno = 1;
664 std::string line;
665
666 while (input)
667 {
668 int ch;
669
670 // read the next line
671 line.clear();
672 while ((ch = input.get()) != EOF && ch != '\n')
673 line.push_back(ch);
674 if (ch == EOF && line.empty())
675 break;
676
677 if (flag_invert_match)
678 {
679 // -v invert match: display non-matching line
680
681 if (!reflex::Matcher(pattern, line).find())
682 {
683 std::cout << label;
684 if (flag_line_number)
685 std::cout << lineno << ":";
686 if (flag_byte_offset)
687 std::cout << byte_offset << ":";
688 std::cout << line << std::endl;
689 found = true;
690 }
691 }
692 else if (flag_ungroup)
693 {
694 // search the line for pattern matches and display the line again (with exact offset) for each pattern match
695
696 reflex::Matcher matcher(pattern, line);
697 for (auto& match : matcher.find)
698 {
699 std::cout << label;
700 if (flag_line_number)
701 std::cout << lineno << ":";
702 if (flag_column_number)
703 std::cout << match.columno() + 1 << ":";
704 if (flag_byte_offset)
705 std::cout << byte_offset << ":";
706 std::cout << line.substr(0, match.first()) << mark << match.text() << unmark << line.substr(match.last()) << std::endl;
707 found = true;
708 }
709 }
710 else
711 {
712 // search the line for pattern matches and display the line just once with all matches
713
714 size_t last = 0;
715
716 reflex::Matcher matcher(pattern, line);
717 for (auto& match : matcher.find)
718 {
719 if (last == 0)
720 {
721 std::cout << label;
722 if (flag_line_number)
723 std::cout << lineno << ":";
724 if (flag_column_number)
725 std::cout << match.columno() + 1 << ":";
726 if (flag_byte_offset)
727 std::cout << byte_offset + match.first() << ":";
728 std::cout << line.substr(0, match.first()) << mark << match.text() << unmark;
729 last = match.last();
730 found = true;
731 }
732 else
733 {
734 std::cout << line.substr(last, match.first() - last) << mark << match.text() << unmark;
735 last = match.last();
736 }
737 }
738
739 if (last > 0)
740 std::cout << line.substr(last) << std::endl;
741 }
742
743 // update byte offset and line number
744 byte_offset += line.size() + 1;
745 ++lineno;
746 }
747 }
748 }
749 else
750 {
751 // block-buffered: display pattern matches
752
753 size_t lineno = 0;
754
755 reflex::Matcher matcher(pattern, input);
756 for (auto& match : matcher.find)
757 {
758 if (flag_ungroup || lineno != match.lineno())
759 {
760 lineno = match.lineno();
761 std::cout << label;
762 if (flag_line_number)
763 std::cout << lineno << ":";
764 if (flag_column_number)
765 std::cout << match.columno() + 1 << ":";
766 if (flag_byte_offset)
767 std::cout << match.first() << ":";
768 }
769 std::cout << mark << match.text() << unmark << std::endl;
770 found = true;
771 }
772 }
773
774 return found;
775 }
776
777 // Display help information with an optional diagnostic message and exit
help(const char * message,const char * arg)778 void help(const char *message, const char *arg)
779 {
780 if (message)
781 std::cout << "ugrep: " << message << (arg != NULL ? arg : "") << std::endl;
782 std::cout << "Usage: ugrep [-bcEFgHhiknoqsVvwx] [--colour[=when]|--color[=when]] [-e pattern] [pattern] [file ...]\n\
783 \n\
784 -b, --byte-offset\n\
785 The offset in bytes of a matched pattern is displayed in front of\n\
786 the respective matched line.\n\
787 -c, --count\n\
788 Only a count of selected lines is written to standard output.\n\
789 When used with option -u, counts the number of patterns matched.\n\
790 --colour[=when], --color[=when]\n\
791 Mark up the matching text with the expression stored in the\n\
792 GREP_COLOR environment variable. The possible values of when can\n\
793 be `never', `always' or `auto'.\n\
794 -E, --extended-regexp\n\
795 Ignored, intended for grep compatibility.\n\
796 -e pattern, --regexp=pattern\n\
797 Specify a pattern used during the search of the input: an input\n\
798 line is selected if it matches any of the specified patterns.\n\
799 This option is most useful when multiple -e options are used to\n\
800 specify multiple patterns, or when a pattern begins with a dash\n\
801 (`-').\n\
802 --file-format=format\n\
803 The input file format. The possible values of format can be:";
804 for (int i = 0; format_table[i].format != NULL; ++i)
805 std::cout << (i % 5 ? " " : "\n ") << format_table[i].format;
806 std::cout << "\n\
807 -F, --fixed-strings\n\
808 Interpret pattern as a set of fixed strings (i.e. force ugrep to\n\
809 behave as fgrep).\n\
810 --free-space\n\
811 Spacing (blanks and tabs) in regular expressions are ignored.\n\
812 -H\n\
813 Always print filename headers with output lines.\n\
814 -h, --no-filename\n\
815 Never print filename headers (i.e. filenames) with output lines.\n\
816 -?, --help\n\
817 Print a help message.\n\
818 -i, --ignore-case\n\
819 Perform case insensitive matching. This option applies\n\
820 case-insensitive matching of ASCII characters in the input.\n\
821 By default, ugrep is case sensitive.\n\
822 -k, --column-number\n\
823 The column number of a matched pattern is displayed in front of\n\
824 the respective matched line, starting at column 1. Tabs are\n\
825 expanded before columns are counted.\n\
826 -n, --line-number\n\
827 Each output line is preceded by its relative line number in the\n\
828 file, starting at line 1. The line number counter is reset for\n\
829 each file processed.\n\
830 -o, --only-matching\n\
831 Prints only the matching part of the lines. Allows a pattern\n\
832 match to span multiple lines.\n\
833 -q, --quiet, --silent\n\
834 Quiet mode: suppress normal output. ugrep will only search a file\n\
835 until a match has been found, making searches potentially less\n\
836 expensive. Allows a pattern match to span multiple lines.\n\
837 -s, --no-messages\n\
838 Silent mode. Nonexistent and unreadable files are ignored (i.e.\n\
839 their error messages are suppressed).\n\
840 --tabs=size\n\
841 Set the tab size to 1, 2, 4, or 8 to expand tabs for option -k.\n\
842 -u, --ungroup\n\
843 Do not group pattern matches on the same line. Display the\n\
844 matched line again for each additional pattern match.\n\
845 -V, --version\n\
846 Display version information and exit.\n\
847 -v, --invert-match\n\
848 Selected lines are those not matching any of the specified\n\
849 patterns.\n\
850 -w, --word-regexp\n\
851 The pattern is searched for as a word (as if surrounded by\n\
852 `\\<' and `\\>').\n\
853 -x, --line-regexp\n\
854 Only input lines selected against an entire pattern are considered\n\
855 to be matching lines (as if surrounded by ^ and $).\n\
856 \n\
857 The ugrep utility exits with one of the following values:\n\
858 \n\
859 0 One or more lines were selected.\n\
860 1 No lines were selected.\n\
861 >1 An error occurred.\n\
862 " << std::endl;
863 exit(EXIT_ERROR);
864 }
865
866 // Display version info
version()867 void version()
868 {
869 std::cout << "ugrep (simple) " VERSION " " PLATFORM << std::endl;
870 exit(EXIT_OK);
871 }
872