1 /*************************************************************************
2 *
3 *   © 2016 and later: Unicode, Inc. and others.
4 *   License & terms of use: http://www.unicode.org/copyright.html
5 *
6 **************************************************************************
7 **************************************************************************
8 *
9 *   Copyright (C) 2002-2010, International Business Machines
10 *   Corporation and others.  All Rights Reserved.
11 *
12 ***************************************************************************
13 */
14 
15 //
16 //   ugrep  - an ICU sample program illustrating the use of ICU Regular Expressions.
17 //
18 //            The use of the ICU Regex API all occurs within the main()
19 //            function.  The rest of the code deals with opening files,
20 //            encoding conversions, printing results, etc.
21 //
22 //            This is not a full-featured grep program.  The command line options
23 //            have been kept to a minimum to avoid complicating the sample code.
24 //
25 
26 
27 
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 
32 #include "unicode/utypes.h"
33 #include "unicode/ustring.h"
34 #include "unicode/regex.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uclean.h"
37 
38 using namespace icu;
39 
40 //
41 //  The following variables contain parameters that may be set from the command line.
42 //
43 const char *pattern = NULL;     // The regular expression
44 int        firstFileNum;        //  argv index of the first file name
45 UBool      displayFileName = false;
46 UBool      displayLineNum  = false;
47 
48 
49 //
50 //  Info regarding the file currently being processed
51 //
52 const char *fileName;
53 int         fileLen;              // Length, in UTF-16 Code Units.
54 
55 UChar      *ucharBuf = 0;         // Buffer, holds converted file.  (Simple minded program, always reads
56                                   //   the whole file at once.
57 
58 char       *charBuf = 0;          // Buffer, for original, unconverted file data.
59 
60 
//
//  State describing the line currently being processed.  nextLine() updates these.
//
int lineStart = 0;   // Index in the file buffer of the first char of the current line
int lineEnd   = 0;   // Index of the char following the new line sequence of the current line
int lineNum   = 0;   // Number of the current line within its file
67 
68 //
69 //  Converter, used on output to convert Unicode data back to char *
70 //             so that it will display in non-Unicode terminal windows.
71 //
72 UConverter  *outConverter = 0;
73 
//
//  Forward declarations of the supporting functions defined below main()
//
void printUsage();
void processOptions(int argc, const char **argv);
void readFile(const char *name);
void nextLine(int startPos);
void printMatch();
82 
83 
84 
85 //------------------------------------------------------------------------------------------
86 //
87 //   main          for ugrep
88 //
89 //           Structurally, all use of the ICU Regular Expression API is in main(),
90 //           and all of the supporting stuff necessary to make a running program, but
91 //           not directly related to regular expressions, is factored out into these other
92 //           functions.
93 //
94 //------------------------------------------------------------------------------------------
main(int argc,const char ** argv)95 int main(int argc, const char** argv) {
96     UBool     matchFound = false;
97 
98     //
99     //  Process the command line options.
100     //
101     processOptions(argc, argv);
102 
103     //
104     // Create a RegexPattern object from the user supplied pattern string.
105     //
106     UErrorCode status = U_ZERO_ERROR;   // All ICU operations report success or failure
107                                         //   in a status variable.
108 
109     UParseError    parseErr;            // In the event of a syntax error in the regex pattern,
110                                         //   this struct will contain the position of the
111                                         //   error.
112 
113     RegexPattern  *rePat = RegexPattern::compile(pattern, parseErr, status);
114                                         // Note that C++ is doing an automatic conversion
115                                         //  of the (char *) pattern to a temporary
116                                         //  UnicodeString object.
117     if (U_FAILURE(status)) {
118         fprintf(stderr, "ugrep:  error in pattern: \"%s\" at position %d\n",
119             u_errorName(status), parseErr.offset);
120         exit(-1);
121     }
122 
123     //
124     // Create a RegexMatcher from the newly created pattern.
125     //
126     UnicodeString empty;
127     RegexMatcher *matcher = rePat->matcher(empty, status);
128     if (U_FAILURE(status)) {
129         fprintf(stderr, "ugrep:  error in creating RegexMatcher: \"%s\"\n",
130             u_errorName(status));
131         exit(-1);
132     }
133 
134     //
135     // Loop, processing each of the input files.
136     //
137     for (int fileNum=firstFileNum; fileNum < argc; fileNum++) {
138         readFile(argv[fileNum]);
139 
140         //
141         //  Loop through the lines of a file, trying to match the regex pattern on each.
142         //
143         for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) {
144             UnicodeString s(false, ucharBuf+lineStart, lineEnd-lineStart);
145             matcher->reset(s);
146             if (matcher->find()) {
147                 matchFound = true;
148                 printMatch();
149             }
150         }
151     }
152 
153     //
154     //  Clean up
155     //
156     delete matcher;
157     delete rePat;
158     free(ucharBuf);
159     free(charBuf);
160     ucnv_close(outConverter);
161 
162     u_cleanup();       // shut down ICU, release any cached data it owns.
163 
164     return matchFound? 0: 1;
165 }
166 
167 
168 
169 //------------------------------------------------------------------------------------------
170 //
171 //   doOptions          Run through the command line options, and set
172 //                      the global variables accordingly.
173 //
174 //                      exit without returning if an error occurred and
175 //                      ugrep should not proceed further.
176 //
177 //------------------------------------------------------------------------------------------
processOptions(int argc,const char ** argv)178 void processOptions(int argc, const char **argv) {
179     int            optInd;
180     UBool          doUsage   = false;
181     UBool          doVersion = false;
182     const char    *arg;
183 
184 
185     for(optInd = 1; optInd < argc; ++optInd) {
186         arg = argv[optInd];
187 
188         /* version info */
189         if(strcmp(arg, "-V") == 0 || strcmp(arg, "--version") == 0) {
190             doVersion = true;
191         }
192         /* usage info */
193         else if(strcmp(arg, "--help") == 0) {
194             doUsage = true;
195         }
196         else if(strcmp(arg, "-n") == 0 || strcmp(arg, "--line-number") == 0) {
197             displayLineNum = true;
198         }
199         /* POSIX.1 says all arguments after -- are not options */
200         else if(strcmp(arg, "--") == 0) {
201             /* skip the -- */
202             ++optInd;
203             break;
204         }
205         /* unrecognized option */
206         else if(strncmp(arg, "-", strlen("-")) == 0) {
207             printf("ugrep: invalid option -- %s\n", arg+1);
208             doUsage = true;
209         }
210         /* done with options */
211         else {
212             break;
213         }
214     }
215 
216     if (doUsage) {
217         printUsage();
218         exit(0);
219     }
220 
221     if (doVersion) {
222         printf("ugrep version 0.01\n");
223         if (optInd == argc) {
224             exit(0);
225         }
226     }
227 
228     int  remainingArgs = argc-optInd;     // pattern file ...
229     if (remainingArgs < 2) {
230         fprintf(stderr, "ugrep:  files or pattern are missing.\n");
231         printUsage();
232         exit(1);
233     }
234 
235     if (remainingArgs > 2) {
236         // More than one file to be processed.   Display file names with match output.
237         displayFileName = true;
238     }
239 
240     pattern      = argv[optInd];
241     firstFileNum = optInd+1;
242 }
243 
//------------------------------------------------------------------------------------------
//
//   printUsage         Write the command line summary to stdout and return.
//
//                      This function deliberately does not exit.  The caller
//                      picks the exit status, so that a usage message printed
//                      because of an error can be followed by a failure status.
//
//------------------------------------------------------------------------------------------
void printUsage() {
    printf("ugrep [options] pattern file...\n"
        "     -V or --version     display version information\n"
        "     --help              display this help and exit\n"
        "     --                  stop further option processing\n"
        "-n,  --line-number       Prefix each line of output with the line number within its input file.\n"
        );
}
258 
259 //------------------------------------------------------------------------------------------
260 //
261 //    readFile          Read a file into memory, and convert it to Unicode.
262 //
263 //                      Since this is just a demo program, take the simple minded approach
264 //                      of always reading the whole file at once.  No intelligent buffering
265 //                      is done.
266 //
267 //------------------------------------------------------------------------------------------
readFile(const char * name)268 void readFile(const char *name) {
269 
270     //
271     //  Initialize global file variables
272     //
273     fileName = name;
274     fileLen  = 0;      // zero length prevents processing in case of errors.
275 
276 
277     //
278     //  Open the file and determine its size.
279     //
280     FILE *file = fopen(name, "rb");
281     if (file == 0 ) {
282         fprintf(stderr, "ugrep: Could not open file \"%s\"\n", fileName);
283         return;
284     }
285     fseek(file, 0, SEEK_END);
286     int rawFileLen = ftell(file);
287     fseek(file, 0, SEEK_SET);
288 
289 
290     //
291     //   Read in the file
292     //
293     charBuf    = (char *)realloc(charBuf, rawFileLen+1);   // Need error checking...
294     int t = static_cast<int>(fread(charBuf, 1, rawFileLen, file));
295     if (t != rawFileLen)  {
296         fprintf(stderr, "Error reading file \"%s\"\n", fileName);
297         fclose(file);
298         return;
299     }
300     charBuf[rawFileLen]=0;
301     fclose(file);
302 
303     //
304     // Look for a Unicode Signature (BOM) in the data
305     //
306     int32_t        signatureLength;
307     const char *   charDataStart = charBuf;
308     UErrorCode     status        = U_ZERO_ERROR;
309     const char*    encoding      = ucnv_detectUnicodeSignature(
310                            charDataStart, rawFileLen, &signatureLength, &status);
311     if (U_FAILURE(status)) {
312         fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n",
313             u_errorName(status));
314         return;
315     }
316     if(encoding!=NULL ){
317         charDataStart  += signatureLength;
318         rawFileLen     -= signatureLength;
319     }
320 
321     //
322     // Open a converter to take the file to UTF-16
323     //
324     UConverter* conv;
325     conv = ucnv_open(encoding, &status);
326     if (U_FAILURE(status)) {
327         fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status));
328         return;
329     }
330 
331     //
332     // Convert the file data to UChar.
333     //  Preflight first to determine required buffer size.
334     //
335     uint32_t destCap = ucnv_toUChars(conv,
336                        NULL,           //  dest,
337                        0,              //  destCapacity,
338                        charDataStart,
339                        rawFileLen,
340                        &status);
341     if (status != U_BUFFER_OVERFLOW_ERROR) {
342         fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
343         return;
344     };
345 
346     status = U_ZERO_ERROR;
347     ucharBuf = (UChar *)realloc(ucharBuf, (destCap+1) * sizeof(UChar));
348     ucnv_toUChars(conv,
349         ucharBuf,           //  dest,
350         destCap+1,
351         charDataStart,
352         rawFileLen,
353         &status);
354     if (U_FAILURE(status)) {
355         fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
356         return;
357     };
358     ucnv_close(conv);
359 
360     //
361     //  Successful conversion.  Set the global size variables so that
362     //     the rest of the processing will proceed for this file.
363     //
364     fileLen = destCap;
365 }
366 
367 
368 
369 
370 
371 //------------------------------------------------------------------------------------------
372 //
373 //   nextLine           Advance the line index variables, starting at the
374 //                      specified position in the input file buffer, by
375 //                      scanning forward until the next end-of-line.
376 //
377 //                      Need to take into account all of the possible Unicode
378 //                      line ending sequences.
379 //
380 //------------------------------------------------------------------------------------------
nextLine(int startPos)381 void nextLine(int  startPos) {
382     if (startPos == 0) {
383         lineNum = 0;
384     } else {
385         lineNum++;
386     }
387     lineStart = lineEnd = startPos;
388 
389     for (;;) {
390         if (lineEnd >= fileLen) {
391             return;
392         }
393         UChar c = ucharBuf[lineEnd];
394         lineEnd++;
395         if (c == 0x0a   ||       // Line Feed
396             c == 0x0c   ||       // Form Feed
397             c == 0x0d   ||       // Carriage Return
398             c == 0x85   ||       // Next Line
399             c == 0x2028 ||       // Line Separator
400             c == 0x2029)         // Paragraph separator
401         {
402             break;
403         }
404     }
405 
406     // Check for CR/LF sequence, and advance over the LF if we're in the middle of one.
407     if (lineEnd < fileLen           &&
408         ucharBuf[lineEnd-1] == 0x0d &&
409         ucharBuf[lineEnd]   == 0x0a)
410     {
411         lineEnd++;
412     }
413 }
414 
415 
416 //------------------------------------------------------------------------------------------
417 //
418 //   printMatch         Called when a matching line has been located.
419 //                      Print out the line from the file with the match, after
420 //                         converting it back to the default code page.
421 //
422 //------------------------------------------------------------------------------------------
printMatch()423 void printMatch() {
424     char                buf[2000];
425     UErrorCode         status       = U_ZERO_ERROR;
426 
427     // If we haven't already created a converter for output, do it now.
428     if (outConverter == 0) {
429         outConverter = ucnv_open(NULL, &status);
430         if (U_FAILURE(status)) {
431             fprintf(stderr, "ugrep:  Error opening default converter: \"%s\"\n",
432                 u_errorName(status));
433             exit(-1);
434         }
435     };
436 
437     // Convert the line to be printed back to the default 8 bit code page.
438     //   If the line is too long for our buffer, just truncate it.
439     ucnv_fromUChars(outConverter,
440                     buf,                   // destination buffer for conversion
441                     sizeof(buf),           // capacity of destination buffer
442                     &ucharBuf[lineStart],   // Input to conversion
443                     lineEnd-lineStart,     // number of UChars to convert
444                     &status);
445     buf[sizeof(buf)-1] = 0;                // Add null for use in case of too long lines.
446                                            // The converter null-terminates its output unless
447                                            //   the buffer completely fills.
448 
449     if (displayFileName) {
450         printf("%s:", fileName);
451     }
452     if (displayLineNum) {
453         printf("%d:", lineNum);
454     }
455     printf("%s", buf);
456 }
457 
458