1 /******************************************************************************\
2 * Copyright (c) 2019, Robert van Engelen, Genivia Inc. All rights reserved.    *
3 *                                                                              *
4 * Redistribution and use in source and binary forms, with or without           *
5 * modification, are permitted provided that the following conditions are met:  *
6 *                                                                              *
7 *   (1) Redistributions of source code must retain the above copyright notice, *
8 *       this list of conditions and the following disclaimer.                  *
9 *                                                                              *
10 *   (2) Redistributions in binary form must reproduce the above copyright      *
11 *       notice, this list of conditions and the following disclaimer in the    *
12 *       documentation and/or other materials provided with the distribution.   *
13 *                                                                              *
14 *   (3) The name of the author may not be used to endorse or promote products  *
15 *       derived from this software without specific prior written permission.  *
16 *                                                                              *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED *
18 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF         *
19 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO   *
20 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,       *
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;  *
23 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,     *
24 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR      *
25 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF       *
26 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                   *
27 \******************************************************************************/
28 
29 /**
30 @file      ugrep.cpp
31 @brief     a file search utility like grep
32 @author    Robert van Engelen - engelen@genivia.com
33 @copyright (c) 2019-2019, Robert van Engelen, Genivia Inc. All rights reserved.
34 @copyright (c) BSD-3 License - see LICENSE.txt
35 
36 Find patterns in files encoded in UTF-8/16/32, ASCII, ISO-8859-1, EBCDIC, code
37 pages 437, 850, 1250 to 1258, and other file formats.
38 
39 For the latest fully-featured high-performance version of ugrep, please visit:
40 
41   https://github.com/Genivia/ugrep
42 
43 This simple version features:
44 
45   - Searches the specified files only, no directory recursion.
46   - Patterns are ERE POSIX syntax compliant, extended with RE/flex pattern syntax.
47   - Unicode support for \p{} character categories, bracket list classes, etc.
48   - File encoding support for UTF-8/16/32, EBCDIC, and many other code pages.
49   - ugrep command-line options are the same as grep, simulates grep behavior.
50 
51 Examples:
52 
53   # display the lines in places.txt that contain capitalized Unicode words
54   ugrep '\p{Upper}\p{Lower}*' places.txt
55 
56   # display the lines in places.txt with capitalized Unicode words color-highlighted
57   ugrep --color=auto '\p{Upper}\p{Lower}*' places.txt
58 
59   # list all capitalized Unicode words in places.txt
60   ugrep -o '\p{Upper}\p{Lower}*' places.txt
61 
62   # list all laughing face emojis (Unicode code points U+1F600 to U+1F60F) in birthday.txt
  ugrep -o '[😀-😏]' birthday.txt
64 
65   # list all laughing face emojis (Unicode code points U+1F600 to U+1F60F) in birthday.txt
66   ugrep -o '[\x{1F600}-\x{1F60F}]' birthday.txt
67 
68   # display lines containing the names Gödel (or Goedel), Escher, or Bach in GEB.txt and wiki.txt
69   ugrep 'G(ö|oe)del|Escher|Bach' GEB.txt wiki.txt
70 
71   # display lines that do not contain the names Gödel (or Goedel), Escher, or Bach in GEB.txt and wiki.txt
72   ugrep -v 'G(ö|oe)del|Escher|Bach' GEB.txt wiki.txt
73 
74   # count the number of lines containing the names Gödel (or Goedel), Escher, or Bach in GEB.txt and wiki.txt
75   ugrep -c 'G(ö|oe)del|Escher|Bach' GEB.txt wiki.txt
76 
77   # count the number of occurrences of the names Gödel (or Goedel), Escher, or Bach in GEB.txt and wiki.txt
78   ugrep -c -u 'G(ö|oe)del|Escher|Bach' GEB.txt wiki.txt
79 
80   # check if some.txt file contains any non-ASCII (i.e. Unicode) characters
81   ugrep -q '[^[:ascii:]]' some.txt && echo "some.txt contains Unicode"
82 
83   # display word-anchored 'lorem' in UTF-16 formatted file utf16lorem.txt that contains a UTF-16 BOM
84   ugrep -w -i 'lorem' utf16lorem.txt
85 
86   # display word-anchored 'lorem' in UTF-16 formatted file utf16lorem.txt that does not contain a UTF-16 BOM
87   ugrep --file-format=UTF-16 -w -i 'lorem' utf16lorem.txt
88 
89   # list the lines to fix in a C/C++ source file by looking for the word TODO while skipping any TODO in quoted strings by using a negative pattern `(?^X)' to ignore quoted strings:
90   ugrep -n -o -e 'TODO' -e '(?^"(\\.|\\\r?\n|[^\\\n"])*")' file.cpp
91 
92   # check if 'main' is defined in a C/C++ source file, skipping the word 'main' in comments and strings:
93   ugrep -q -e '\<main\>' -e '(?^"(\\.|\\\r?\n|[^\\\n"])*"|//.*|/[*](.|\n)*?[*]/)' file.cpp
94 
95 Compile:
96 
97   c++ -std=c++11 -o ugrep ugrep.cpp -lreflex
98 
99 */
100 
101 #include <reflex/matcher.h>
102 
// Detect a Windows build environment from the compiler's predefined macros.
// OS_WIN gates the platform-specific code in this file (the isatty()
// substitute, the PLATFORM default, the TERM/GREP_COLOR lookup, and the '/'
// option prefix).
#if defined(__WIN32__) || defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || defined(__MINGW32__) || defined(__MINGW64__) || defined(__BORLANDC__)
# define OS_WIN
#endif

// On Windows <unistd.h> is not included, so isatty() is substituted by a macro
// that reports only file descriptor 1 (standard output) as a terminal
#ifdef OS_WIN
#define isatty(fildes) ((fildes) == 1)
#else
#include <unistd.h>
#endif

// ugrep version string displayed by version()
#define VERSION "1.0.0"

// ugrep platform string displayed by version() -- see configure.ac; when not
// predefined by the build it defaults to "WIN" on Windows and to "" elsewhere
#if !defined(PLATFORM)
# if defined(OS_WIN)
#  define PLATFORM "WIN"
# else
#  define PLATFORM ""
# endif
#endif

// ugrep exit codes passed to exit()
#define EXIT_OK    0 // One or more lines were selected
#define EXIT_FAIL  1 // No lines were selected
#define EXIT_ERROR 2 // An error occurred
131 
// Color parameters placed between "\033[" and "m" to mark up matches, taken
// from the GREP_COLOR environment variable (not read on Windows) or defaulted
// to "1" by main(); NULL means that matches are not colored
const char *grep_color = NULL;

// ugrep command-line options, set by main() while parsing the command line
bool flag_filename           = false; // -H, also set when more than one input file is given without -h
bool flag_no_filename        = false; // -h, --no-filename
bool flag_no_messages        = false; // -s, --no-messages
bool flag_byte_offset        = false; // -b, --byte-offset
bool flag_count              = false; // -c, --count
bool flag_fixed_strings      = false; // -F, --fixed-strings
bool flag_free_space         = false; // --free-space
bool flag_ignore_case        = false; // -i, --ignore-case
bool flag_invert_match       = false; // -v, --invert-match
bool flag_column_number      = false; // -k, --column-number
bool flag_line_number        = false; // -n, --line-number
bool flag_line_buffered      = false; // not an option: set by main() when -c, -o, and -q are not in effect
bool flag_only_matching      = false; // -o, --only-matching (reset by main() when -v is specified)
bool flag_quiet              = false; // -q, --quiet, --silent
bool flag_ungroup            = false; // -u, --ungroup (reset by main() when -v is specified)
bool flag_word_regexp        = false; // -w, --word-regexp
bool flag_line_regexp        = false; // -x, --line-regexp
const char *flag_color       = NULL;  // --color[=when], --colour[=when]: "auto" when no value is given
const char *flag_file_format = NULL;  // --file-format=format: a format name looked up in format_table[]
int flag_tabs                = 8;     // --tabs=size: tab size, main() accepts 1, 2, 4, or 8

// function protos

// search one input for the pattern and display the results, returns true when a match was found
bool ugrep(reflex::Pattern& pattern, FILE *file, reflex::Input::file_encoding_type encoding, const char *infile);
// display an optional diagnostic message followed by the help text, then exit
void help(const char *message = NULL, const char *arg = NULL);
// display the version information, then exit
void version();
161 
162 // table of file formats for ugrep option --file-format
163 const struct { const char *format; reflex::Input::file_encoding_type encoding; } format_table[] = {
164   { "binary",      reflex::Input::file_encoding::plain      },
165   { "ASCII",       reflex::Input::file_encoding::utf8       },
166   { "UTF-8",       reflex::Input::file_encoding::utf8       },
167   { "UTF-16",      reflex::Input::file_encoding::utf16be    },
168   { "UTF-16BE",    reflex::Input::file_encoding::utf16be    },
169   { "UTF-16LE",    reflex::Input::file_encoding::utf16le    },
170   { "UTF-32",      reflex::Input::file_encoding::utf32be    },
171   { "UTF-32BE",    reflex::Input::file_encoding::utf32be    },
172   { "UTF-32LE",    reflex::Input::file_encoding::utf32le    },
173   { "ISO-8859-1",  reflex::Input::file_encoding::latin      },
174   { "ISO-8869-2",  reflex::Input::file_encoding::iso8859_2  },
175   { "ISO-8869-3",  reflex::Input::file_encoding::iso8859_3  },
176   { "ISO-8869-4",  reflex::Input::file_encoding::iso8859_4  },
177   { "ISO-8869-5",  reflex::Input::file_encoding::iso8859_5  },
178   { "ISO-8869-6",  reflex::Input::file_encoding::iso8859_6  },
179   { "ISO-8869-7",  reflex::Input::file_encoding::iso8859_7  },
180   { "ISO-8869-8",  reflex::Input::file_encoding::iso8859_8  },
181   { "ISO-8869-9",  reflex::Input::file_encoding::iso8859_9  },
182   { "ISO-8869-10", reflex::Input::file_encoding::iso8859_10 },
183   { "ISO-8869-11", reflex::Input::file_encoding::iso8859_11 },
184   { "ISO-8869-13", reflex::Input::file_encoding::iso8859_13 },
185   { "ISO-8869-14", reflex::Input::file_encoding::iso8859_14 },
186   { "ISO-8869-15", reflex::Input::file_encoding::iso8859_15 },
187   { "ISO-8869-16", reflex::Input::file_encoding::iso8859_16 },
188   { "MAC",         reflex::Input::file_encoding::macroman   },
189   { "MACROMAN",    reflex::Input::file_encoding::macroman   },
190   { "EBCDIC",      reflex::Input::file_encoding::ebcdic     },
191   { "CP437",       reflex::Input::file_encoding::cp437      },
192   { "CP850",       reflex::Input::file_encoding::cp850      },
193   { "CP858",       reflex::Input::file_encoding::cp858      },
194   { "CP1250",      reflex::Input::file_encoding::cp1250     },
195   { "CP1251",      reflex::Input::file_encoding::cp1251     },
196   { "CP1252",      reflex::Input::file_encoding::cp1252     },
197   { "CP1253",      reflex::Input::file_encoding::cp1253     },
198   { "CP1254",      reflex::Input::file_encoding::cp1254     },
199   { "CP1255",      reflex::Input::file_encoding::cp1255     },
200   { "CP1256",      reflex::Input::file_encoding::cp1256     },
201   { "CP1257",      reflex::Input::file_encoding::cp1257     },
202   { "CP1258",      reflex::Input::file_encoding::cp1258     },
203   { "KOI8-R",      reflex::Input::file_encoding::koi8_r     },
204   { "KOI8-U",      reflex::Input::file_encoding::koi8_u     },
205   { "KOI8-RU",     reflex::Input::file_encoding::koi8_ru    },
206   { NULL, 0 }
207 };
208 
209 // ugrep main()
main(int argc,char ** argv)210 int main(int argc, char **argv)
211 {
212   std::string regex;
213   std::vector<const char*> infiles;
214 
215   bool color_term = false;
216 
217 #ifndef OS_WIN
218   // check whether we have a color terminal
219   const char *term = getenv("TERM");
220   color_term = term && (strstr(term, "ansi") || strstr(term, "xterm") || strstr(term, "color"));
221   grep_color = getenv("GREP_COLOR");
222 #endif
223 
224   // parse ugrep command-line options and arguments
225   for (int i = 1; i < argc; ++i)
226   {
227     const char *arg = argv[i];
228 
229     if (*arg == '-'
230 #ifdef OS_WIN
231      || *arg == '/'
232 #endif
233      )
234     {
235       bool is_grouped = true;
236 
237       // parse a ugrep command-line option
238       while (is_grouped && *++arg)
239       {
240         switch (*arg)
241         {
242           case '-':
243             ++arg;
244             if (strcmp(arg, "byte-offset") == 0)
245               flag_byte_offset = true;
246             else if (strcmp(arg, "color") == 0 || strcmp(arg, "colour") == 0)
247               flag_color = "auto";
248             else if (strncmp(arg, "color=", 6) == 0)
249               flag_color = arg + 6;
250             else if (strncmp(arg, "colour=", 7) == 0)
251               flag_color = arg + 7;
252             else if (strcmp(arg, "column-number") == 0)
253               flag_column_number = true;
254             else if (strcmp(arg, "count") == 0)
255               flag_count = true;
256             else if (strcmp(arg, "extended-regexp") == 0)
257               ;
258             else if (strncmp(arg, "file-format=", 12) == 0)
259               flag_file_format = arg + 12;
260             else if (strcmp(arg, "fixed-strings") == 0)
261               flag_fixed_strings = true;
262             else if (strcmp(arg, "free-space") == 0)
263               flag_free_space = true;
264             else if (strcmp(arg, "help") == 0)
265               help();
266             else if (strcmp(arg, "ignore-case") == 0)
267               flag_ignore_case = true;
268             else if (strcmp(arg, "invert-match") == 0)
269               flag_invert_match = true;
270             else if (strcmp(arg, "line-number") == 0)
271               flag_line_number = true;
272             else if (strcmp(arg, "line-regexp") == 0)
273               flag_line_regexp = true;
274             else if (strcmp(arg, "no-filename") == 0)
275               flag_no_filename = true;
276             else if (strcmp(arg, "ungroup") == 0)
277               flag_ungroup = true;
278             else if (strcmp(arg, "no-messages") == 0)
279               flag_no_messages = true;
280             else if (strcmp(arg, "only-matching") == 0)
281               flag_only_matching = true;
282             else if (strcmp(arg, "quiet") == 0 || strcmp(arg, "silent") == 0)
283               flag_quiet = true;
284             else if (strncmp(arg, "regexp=", 7) == 0)
285               regex.append(arg + 7).push_back('|');
286             else if (strncmp(arg, "tabs=", 5) == 0)
287               flag_tabs = atoi(arg + 5);
288             else if (strcmp(arg, "version") == 0)
289               version();
290             else if (strcmp(arg, "word-regexp") == 0)
291               flag_word_regexp = true;
292             else
293               help("invalid option --", arg);
294             is_grouped = false;
295             break;
296 
297           case 'b':
298             flag_byte_offset = true;
299             break;
300 
301           case 'c':
302             flag_count = true;
303             break;
304 
305           case 'E':
306             break;
307 
308           case 'e':
309             ++arg;
310             if (*arg)
311               regex.append(&arg[*arg == '=']).push_back('|');
312             else if (++i < argc)
313               regex.append(argv[i]).push_back('|');
314             else
315               help("missing pattern for option -e");
316             is_grouped = false;
317             break;
318 
319           case 'F':
320             flag_fixed_strings = true;
321             break;
322 
323           case 'H':
324             flag_filename = true;
325             flag_no_filename = false;
326             break;
327 
328           case 'h':
329             flag_filename = false;
330             flag_no_filename = true;
331             break;
332 
333           case 'i':
334             flag_ignore_case = true;
335             break;
336 
337           case 'k':
338             flag_column_number = true;
339             break;
340 
341           case 'n':
342             flag_line_number = true;
343             break;
344 
345           case 'o':
346             flag_only_matching = true;
347             break;
348 
349           case 'q':
350             flag_quiet = true;
351             break;
352 
353           case 's':
354             flag_no_messages = true;
355             break;
356 
357           case 'u':
358             flag_ungroup = true;
359             break;
360 
361           case 'V':
362             version();
363             break;
364 
365           case 'v':
366             flag_invert_match = true;
367             break;
368 
369           case 'w':
370             flag_word_regexp = true;
371             break;
372 
373           case 'x':
374             flag_line_regexp = true;
375             break;
376 
377           case '?':
378             help();
379             break;
380 
381           default:
382             help("invalid option -", arg);
383         }
384       }
385     }
386     else
387     {
388       // parse a ugrep command-line argument
389       if (regex.empty())
390       {
391         // no regex pattern specified yet, so assign it to the regex string
392         regex.assign(arg).push_back('|');
393       }
394       else
395       {
396         // otherwise add the file argument to the list of files
397         infiles.push_back(arg);
398       }
399     }
400   }
401 
402   // if no regex pattern was specified then exit
403   if (regex.empty())
404     help();
405 
406   // remove the ending '|' from the |-concatenated regexes in the regex string
407   regex.pop_back();
408 
409   if (regex.empty())
410   {
411     // if the specified regex is empty then it matches every line
412     regex.assign(".*");
413   }
414   else
415   {
416     // if -F --fixed-strings: make regex literal with \Q and \E
417     if (flag_fixed_strings)
418       regex.insert(0, "\\Q").append("\\E");
419 
420     // if -w or -x: make the regex word- or line-anchored, respectively
421     if (flag_word_regexp)
422       regex.insert(0, "\\<(").append(")\\>");
423     else if (flag_line_regexp)
424       regex.insert(0, "^(").append(")$");
425   }
426 
427   // if -v invert-match: options -u --ungroup and -o --only-matching options cannot be used
428   if (flag_invert_match)
429   {
430     flag_ungroup = false;
431     flag_only_matching = false;
432   }
433 
434   // input is line-buffered if options -c --count -o --only-matching -q --quiet are not specified
435   if (!flag_count && !flag_only_matching && !flag_quiet)
436     flag_line_buffered = true;
437 
438   // display file name if more than one input file is specified and option -h --no-filename is not specified
439   if (infiles.size() > 1 && !flag_no_filename)
440     flag_filename = true;
441 
442   // (re)set grep_color depending on color_term, isatty(), and the ugrep --color option
443   if (!flag_color || strcmp(flag_color, "never") == 0)
444   {
445     grep_color = NULL;
446   }
447   else if (strcmp(flag_color, "always") == 0)
448   {
449     if (!grep_color)
450       grep_color = "1";
451   }
452   else if (strcmp(flag_color, "auto") == 0)
453   {
454     if (!color_term || !isatty(1))
455       grep_color = NULL;
456     else if (!grep_color)
457       grep_color = "1";
458   }
459   else
460   {
461     help("unknown --color=when value");
462   }
463 
464   // if any match was found in any of the input files then we set found==true
465   bool found = false;
466 
467   try
468   {
469     reflex::Input::file_encoding_type encoding = reflex::Input::file_encoding::plain;
470 
471     // parse ugrep option --file-format=format
472     if (flag_file_format)
473     {
474       int i;
475 
476       // scan the format_table[] for a matching format
477       for (i = 0; format_table[i].format != NULL; ++i)
478         if (strcmp(flag_file_format, format_table[i].format) == 0)
479           break;
480 
481       if (format_table[i].format == NULL)
482         help("unknown --file-format=format encoding");
483 
484       // encoding is the file format used by all input files, if no BOM is present
485       encoding = format_table[i].encoding;
486     }
487 
488     std::string modifiers = "(?m";
489     if (flag_ignore_case)
490       modifiers.append("i");
491     if (flag_free_space)
492       modifiers.append("x");
493     modifiers.append(")");
494 
495     std::string pattern_options;
496     if (flag_tabs)
497     {
498       if (flag_tabs == 1 || flag_tabs == 2 || flag_tabs == 4 || flag_tabs == 8)
499         pattern_options.assign("T=").push_back(flag_tabs + '0');
500       else
501         help("invalid value for option --tabs");
502     }
503 
504     reflex::Pattern pattern(modifiers + reflex::Matcher::convert(regex, reflex::convert_flag::notnewline | reflex::convert_flag::unicode), pattern_options);
505 
506     if (infiles.empty())
507     {
508       // read standard input to find pattern matches
509       found |= ugrep(pattern, stdin, encoding, "(standard input)");
510     }
511     else
512     {
513       // read each file to find pattern matches
514       for (auto infile : infiles)
515       {
516         FILE *file = fopen(infile, "r");
517 
518         if (file == NULL)
519         {
520           if (flag_no_messages)
521             continue;
522 
523           perror("Cannot open file for reading");
524           exit(EXIT_ERROR);
525         }
526 
527         found |= ugrep(pattern, file, encoding, infile);
528 
529         fclose(file);
530       }
531     }
532   }
533   catch (reflex::regex_error& error)
534   {
535     std::cerr << error.what();
536     exit(EXIT_ERROR);
537   }
538 
539   exit(found ? EXIT_OK : EXIT_FAIL);
540 }
541 
542 // Search file, display pattern matches, return true when pattern matched anywhere
ugrep(reflex::Pattern & pattern,FILE * file,reflex::Input::file_encoding_type encoding,const char * infile)543 bool ugrep(reflex::Pattern& pattern, FILE *file, reflex::Input::file_encoding_type encoding, const char *infile)
544 {
545   bool found = false;
546 
547   std::string label, mark, unmark;
548 
549   if (flag_filename && infile)
550     label.assign(infile).append(":");
551 
552   if (grep_color)
553   {
554     mark.assign("\033[").append(grep_color).append("m");
555     unmark.assign("\033[0m");
556   }
557 
558   // create an input object to read the file (or stdin) using the given file format encoding
559   reflex::Input input(file, encoding);
560 
561   if (flag_quiet)
562   {
563     // -q quiet mode: report if a single pattern match was found in the input
564 
565     found = reflex::Matcher(pattern, input).find();
566 
567     if (flag_invert_match)
568       found = !found;
569   }
570   else if (flag_count)
571   {
572     // -c count mode: count the number of lines/patterns matched
573 
574     if (flag_invert_match)
575     {
576       size_t lines = 0;
577       std::string line;
578 
579       // -c count mode w/ -v: count the number of non-matching lines
580       while (input)
581       {
582         int ch;
583 
584         // read the next line
585         line.clear();
586         while ((ch = input.get()) != EOF && ch != '\n')
587           line.push_back(ch);
588         if (ch == EOF && line.empty())
589           break;
590 
591         // count this line if not matched
592         if (!reflex::Matcher(pattern, line).find())
593         {
594           found = true;
595           ++lines;
596         }
597       }
598 
599       std::cout << label << lines << std::endl;
600     }
601     else if (flag_ungroup)
602     {
603       // -c count mode w/ -u: count the number of patterns matched in the file
604 
605       reflex::Matcher matcher(pattern, input);
606       size_t matches = std::distance(matcher.find.begin(), matcher.find.end());
607 
608       std::cout << label << matches << std::endl;
609       found = matches > 0;
610     }
611     else
612     {
613       // -c count mode w/o -u: count the number of matching lines
614 
615       size_t lineno = 0;
616       size_t lines = 0;
617 
618       reflex::Matcher matcher(pattern, input);
619       for (auto& match : matcher.find)
620       {
621         if (lineno != match.lineno())
622         {
623           lineno = match.lineno();
624           ++lines;
625         }
626       }
627 
628       std::cout << label << lines << std::endl;
629       found = lines > 0;
630     }
631   }
632   else if (flag_line_buffered)
633   {
634     // line-buffered: display lines that matched the pattern
635 
636 #if defined(WITH_SPAN)
637     if (!flag_ungroup && !flag_invert_match)
638     {
639       size_t lineno = 0;
640 
641       reflex::Matcher matcher(pattern, input);
642       for (auto& match : matcher.find)
643       {
644         if (lineno != match.lineno())
645         {
646           lineno = match.lineno();
647           std::cout << label;
648           if (flag_line_number)
649             std::cout << match.lineno() << ":";
650           if (flag_column_number)
651             std::cout << match.columno() + 1 << ":";
652           if (flag_byte_offset)
653             std::cout << match.first() << ":";
654           std::cout << mark << match.span() << unmark << std::endl;
655           found = true;
656         }
657       }
658     }
659     else
660 #endif
661     {
662       size_t byte_offset = 0;
663       size_t lineno = 1;
664       std::string line;
665 
666       while (input)
667       {
668         int ch;
669 
670         // read the next line
671         line.clear();
672         while ((ch = input.get()) != EOF && ch != '\n')
673           line.push_back(ch);
674         if (ch == EOF && line.empty())
675           break;
676 
677         if (flag_invert_match)
678         {
679           // -v invert match: display non-matching line
680 
681           if (!reflex::Matcher(pattern, line).find())
682           {
683             std::cout << label;
684             if (flag_line_number)
685               std::cout << lineno << ":";
686             if (flag_byte_offset)
687               std::cout << byte_offset << ":";
688             std::cout << line << std::endl;
689             found = true;
690           }
691         }
692         else if (flag_ungroup)
693         {
694           // search the line for pattern matches and display the line again (with exact offset) for each pattern match
695 
696           reflex::Matcher matcher(pattern, line);
697           for (auto& match : matcher.find)
698           {
699             std::cout << label;
700             if (flag_line_number)
701               std::cout << lineno << ":";
702             if (flag_column_number)
703               std::cout << match.columno() + 1 << ":";
704             if (flag_byte_offset)
705               std::cout << byte_offset << ":";
706             std::cout << line.substr(0, match.first()) << mark << match.text() << unmark << line.substr(match.last()) << std::endl;
707             found = true;
708           }
709         }
710         else
711         {
712           // search the line for pattern matches and display the line just once with all matches
713 
714           size_t last = 0;
715 
716           reflex::Matcher matcher(pattern, line);
717           for (auto& match : matcher.find)
718           {
719             if (last == 0)
720             {
721               std::cout << label;
722               if (flag_line_number)
723                 std::cout << lineno << ":";
724               if (flag_column_number)
725                 std::cout << match.columno() + 1 << ":";
726               if (flag_byte_offset)
727                 std::cout << byte_offset + match.first() << ":";
728               std::cout << line.substr(0, match.first()) << mark << match.text() << unmark;
729               last = match.last();
730               found = true;
731             }
732             else
733             {
734               std::cout << line.substr(last, match.first() - last) << mark << match.text() << unmark;
735               last = match.last();
736             }
737           }
738 
739           if (last > 0)
740             std::cout << line.substr(last) << std::endl;
741         }
742 
743         // update byte offset and line number
744         byte_offset += line.size() + 1;
745         ++lineno;
746       }
747     }
748   }
749   else
750   {
751     // block-buffered: display pattern matches
752 
753     size_t lineno = 0;
754 
755     reflex::Matcher matcher(pattern, input);
756     for (auto& match : matcher.find)
757     {
758       if (flag_ungroup || lineno != match.lineno())
759       {
760         lineno = match.lineno();
761         std::cout << label;
762         if (flag_line_number)
763           std::cout << lineno << ":";
764         if (flag_column_number)
765           std::cout << match.columno() + 1 << ":";
766         if (flag_byte_offset)
767           std::cout << match.first() << ":";
768       }
769       std::cout << mark << match.text() << unmark << std::endl;
770       found = true;
771     }
772   }
773 
774   return found;
775 }
776 
777 // Display help information with an optional diagnostic message and exit
help(const char * message,const char * arg)778 void help(const char *message, const char *arg)
779 {
780   if (message)
781     std::cout << "ugrep: " << message << (arg != NULL ? arg : "") << std::endl;
782   std::cout << "Usage: ugrep [-bcEFgHhiknoqsVvwx] [--colour[=when]|--color[=when]] [-e pattern] [pattern] [file ...]\n\
783 \n\
784     -b, --byte-offset\n\
785             The offset in bytes of a matched pattern is displayed in front of\n\
786             the respective matched line.\n\
787     -c, --count\n\
788             Only a count of selected lines is written to standard output.\n\
789             When used with option -u, counts the number of patterns matched.\n\
790     --colour[=when], --color[=when]\n\
791             Mark up the matching text with the expression stored in the\n\
792             GREP_COLOR environment variable.  The possible values of when can\n\
793             be `never', `always' or `auto'.\n\
794     -E, --extended-regexp\n\
795             Ignored, intended for grep compatibility.\n\
796     -e pattern, --regexp=pattern\n\
797             Specify a pattern used during the search of the input: an input\n\
798             line is selected if it matches any of the specified patterns.\n\
799             This option is most useful when multiple -e options are used to\n\
800             specify multiple patterns, or when a pattern begins with a dash\n\
801             (`-').\n\
802     --file-format=format\n\
803             The input file format.  The possible values of format can be:";
804   for (int i = 0; format_table[i].format != NULL; ++i)
805     std::cout << (i % 5 ? " " : "\n            ") << format_table[i].format;
806   std::cout << "\n\
807     -F, --fixed-strings\n\
808             Interpret pattern as a set of fixed strings (i.e. force ugrep to\n\
809             behave as fgrep).\n\
810     --free-space\n\
811             Spacing (blanks and tabs) in regular expressions are ignored.\n\
812     -H\n\
813             Always print filename headers with output lines.\n\
814     -h, --no-filename\n\
815             Never print filename headers (i.e. filenames) with output lines.\n\
816     -?, --help\n\
817             Print a help message.\n\
818     -i, --ignore-case\n\
819             Perform case insensitive matching. This option applies\n\
820             case-insensitive matching of ASCII characters in the input.\n\
821             By default, ugrep is case sensitive.\n\
822     -k, --column-number\n\
823             The column number of a matched pattern is displayed in front of\n\
824             the respective matched line, starting at column 1.  Tabs are\n\
825             expanded before columns are counted.\n\
826     -n, --line-number\n\
827             Each output line is preceded by its relative line number in the\n\
828             file, starting at line 1.  The line number counter is reset for\n\
829             each file processed.\n\
830     -o, --only-matching\n\
831             Prints only the matching part of the lines.  Allows a pattern\n\
832             match to span multiple lines.\n\
833     -q, --quiet, --silent\n\
834             Quiet mode: suppress normal output.  ugrep will only search a file\n\
835             until a match has been found, making searches potentially less\n\
836             expensive.  Allows a pattern match to span multiple lines.\n\
837     -s, --no-messages\n\
838             Silent mode.  Nonexistent and unreadable files are ignored (i.e.\n\
839             their error messages are suppressed).\n\
840     --tabs=size\n\
841             Set the tab size to 1, 2, 4, or 8 to expand tabs for option -k.\n\
842     -u, --ungroup\n\
843             Do not group pattern matches on the same line.  Display the\n\
844             matched line again for each additional pattern match.\n\
845     -V, --version\n\
846             Display version information and exit.\n\
847     -v, --invert-match\n\
848             Selected lines are those not matching any of the specified\n\
849             patterns.\n\
850     -w, --word-regexp\n\
851             The pattern is searched for as a word (as if surrounded by\n\
852             `\\<' and `\\>').\n\
853     -x, --line-regexp\n\
854             Only input lines selected against an entire pattern are considered\n\
855             to be matching lines (as if surrounded by ^ and $).\n\
856 \n\
857     The ugrep utility exits with one of the following values:\n\
858 \n\
859     0       One or more lines were selected.\n\
860     1       No lines were selected.\n\
861     >1      An error occurred.\n\
862 " << std::endl;
863   exit(EXIT_ERROR);
864 }
865 
866 // Display version info
version()867 void version()
868 {
869   std::cout << "ugrep (simple) " VERSION " " PLATFORM << std::endl;
870   exit(EXIT_OK);
871 }
872