1 /*
2  * Copyright (c) 2015-2019, Intel Corporation
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *  * Redistributions of source code must retain the above copyright notice,
8  *    this list of conditions and the following disclaimer.
9  *  * Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *  * Neither the name of Intel Corporation nor the names of its contributors
13  *    may be used to endorse or promote products derived from this software
14  *    without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /**
30  * \file
31  * \brief Hyperscan compile dump tool
32  *
33  * Given a set of patterns, dump all available data from the compilation
34  * process into a directory. This tool is intended to assist Hyperscan
 * developers with development and debugging by providing insights into the
36  * built bytecode.
37  *
38  * Note: requires that hyperscan is built with DUMP_SUPPORT enabled.
39  */
40 
41 #include "config.h"
42 
43 #include "cross_compile.h"
44 #include "ExpressionParser.h"
45 #include "expressions.h"
46 #include "expression_path.h"
47 #include "string_util.h"
48 
49 #include "grey.h"
50 #include "hs_compile.h"
51 #include "hs_internal.h"
52 #include "scratch_dump.h"
53 
54 #include <cassert>
55 #include <cerrno>
56 #include <cstdio>
57 #include <cstdlib>
58 #include <string>
59 #include <vector>
60 
61 #ifndef _WIN32
62 #include <getopt.h>
63 #else
64 #include "win_getopt.h"
65 #endif
66 #include <sys/stat.h>
67 
68 #ifndef _WIN32
69 #include <dirent.h>
70 #else
71 #include <direct.h>
72 #define stat _stat
73 #endif
74 
75 #include <boost/ptr_container/ptr_vector.hpp>
76 
77 using namespace std;
78 using namespace ue2;
79 using boost::ptr_vector;
80 
81 namespace /* anonymous */ {
82 
83 // Input pattern file
84 string patternfile;
85 // Output path
86 string dumpbase(".");
87 // Compile with streaming
88 bool streaming = true;
89 bool vectored = false;
90 
91 bool echoSigs = false;
92 bool dump_db = false;
93 bool force_utf8 = false;
94 int force_prefilter = 0;
95 
96 unsigned int onlyId;
97 u32 somFlags = 0;
98 unsigned somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE;
99 
100 bool singleId = false;
101 string signatureFile;
102 
103 unique_ptr<hs_platform_info> plat_info;
104 
105 bool dump_intermediate = true;
106 bool force_edit_distance = false;
107 u32 edit_distance = 0;
108 
109 int use_literal_api = 0;
110 
111 } // namespace
112 
/**
 * \brief Print the command-line help text to stdout.
 *
 * \param name   Program name, substituted into the usage and example lines.
 * \param error  Optional message appended as an "Error: ..." line; nothing is
 *               appended when it is null or empty.
 */
static
void usage(const char *name, const char *error) {
    printf("Usage: %s [OPTIONS...]\n\n", name);
    printf("Options:\n\n");
    printf("  -h, --help      Display help and exit.\n");
    printf("  -G OVERRIDES    Overrides for the grey box.\n");
    printf("  -e PATH         Path to expression directory or file.\n");
    printf("  -s FILE         Signature file to use.\n");
    printf("  -z NUM          Signature ID to use.\n");
    printf("  -N, --block     Compile in block mode"
           " (default: streaming).\n");
    printf("  -V, --vectored  Compile in vectored mode"
           " (default: streaming).\n");
    printf("  -o, --output PATH\n");
    // The data really lands in a "dump" directory below PATH (see dumpData).
    printf("                  Write dump data to PATH/dump"
           " (default: ./dump).\n");
    printf("                  WARNING: existing files in PATH/dump are"
           " deleted.\n");
    printf("  -x NAME         Cross-compile for arch NAME\n");
    printf("  -D, --dump_db   Dump the final database.\n");
    printf("  -P, --print     Echo signature set to stdout.\n");
    printf("  -X, --no_intermediate\n");
    printf("                  Do not dump intermediate data.\n");
    printf("\n");
    printf("Pattern flags:\n");
    printf("  -d, --som-width NUMBER\n");
    printf("                  Set SOM precision mode (default: 8 (large)).\n");
    printf("  -E DISTANCE     Match all patterns within edit distance"
           " DISTANCE.\n");
    printf("  -8, --utf8      Force UTF8 mode on all patterns.\n");
    printf("  -L              Apply HS_FLAG_SOM_LEFTMOST to all patterns.\n");
    printf(" --prefilter      Apply HS_FLAG_PREFILTER to all patterns.\n");
    printf(" --literal-on     Use Hyperscan pure literal matching API.\n");
    printf("\n");
    printf("Example:\n");
    printf("$ %s -e pattern.file -s sigfile\n", name);
    printf("\n");

    // An empty message would only produce a dangling "Error: " line.
    if (error && *error) {
        printf("Error: %s\n", error);
    }
}
154 
155 static
processArgs(int argc,char * argv[],Grey & grey)156 void processArgs(int argc, char *argv[], Grey &grey) {
157     static const char *options = "d:De:E:G:hLNo:Ps:VXx:z:8";
158     static struct option longOptions[] = {
159         {"dump_db",             no_argument,        nullptr, 'D'},
160         {"help",                no_argument,        nullptr, 'h'},
161         {"output",              required_argument,  nullptr, 'o'},
162         {"block",               no_argument,        nullptr, 'N'},
163         {"no_intermediate",     no_argument,        nullptr, 'X'},
164         {"vectored",            no_argument,        nullptr, 'V'},
165         {"print",               no_argument,        nullptr, 'P'},
166         {"utf8",                no_argument,        nullptr, '8'},
167         {"prefilter",           no_argument,        &force_prefilter, 1},
168         {"som-width",           required_argument,  nullptr, 'd'},
169         {"literal-on",          no_argument,        &use_literal_api, 1},
170         {nullptr, 0, nullptr, 0}
171     };
172 
173     for (;;) {
174         int c = getopt_long(argc, argv, options, longOptions, nullptr);
175 
176         if (c < 0) {
177             break;
178         }
179         switch (c) {
180         case 'D':
181             dump_db = true;
182             break;
183 
184         case 'd': {
185             unsigned dist;
186             if (!fromString(optarg, dist)) {
187                 usage(argv[0], "Must provide an integer argument to '-d' flag");
188                 exit(1);
189             }
190             switch (dist) {
191             case 2:
192                 somPrecisionMode = HS_MODE_SOM_HORIZON_SMALL;
193                 break;
194             case 4:
195                 somPrecisionMode = HS_MODE_SOM_HORIZON_MEDIUM;
196                 break;
197             case 8:
198                 somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE;
199                 break;
200             default:
201                 usage(argv[0], "SOM precision must be 2, 4 or 8");
202                 exit(1);
203             }
204             break;
205         }
206 
207         case 'h':
208             usage(argv[0], nullptr);
209             exit(0);
210 
211         case 'e':
212             patternfile = optarg;
213             break;
214 
215 #ifndef RELEASE_BUILD
216         case 'G':
217             applyGreyOverrides(&grey, string(optarg));
218             break;
219 #endif
220 
221         case 'L':
222             somFlags |= HS_FLAG_SOM_LEFTMOST;
223             break;
224 
225         case 'o':
226             dumpbase = optarg;
227             break;
228 
229         case 'P':
230             echoSigs = true;
231             break;
232 
233         case 's':
234             signatureFile.assign(optarg);
235             break;
236 
237         case 'N':
238             streaming = false;
239             break;
240 
241         case 'V':
242             streaming = false;
243             vectored = true;
244             break;
245 
246         case 'X':
247             dump_intermediate = false;
248             break;
249 
250         case 'x':
251             plat_info = xcompileReadMode(optarg);
252             if (!plat_info) {
253                 usage(argv[0], xcompileUsage().c_str());
254                 exit(1);
255             }
256             break;
257 
258         case 'z':
259             if (!fromString(optarg, onlyId)) {
260                 usage(argv[0], "Argument to '-z' flag must be an integer");
261                 exit(1);
262             }
263             singleId = true;
264             break;
265         case 'E': {
266             u32 dist;
267             if (!fromString(optarg, dist)) {
268                 usage(argv[0], "Argument to '-E' flag must be an integer");
269                 exit(1);
270             }
271             force_edit_distance = true;
272             edit_distance = dist;
273             break;
274         }
275         case '8':
276             force_utf8 = true;
277             break;
278         case 0:
279             break;
280         default:
281             usage(argv[0], "");
282             exit(1);
283         }
284     }
285 
286     if (patternfile.empty() && !signatureFile.empty()) {
287         /* attempt to infer an expression directory */
288         patternfile = inferExpressionPath(signatureFile);
289     }
290 
291     if (patternfile.size() == 0) {
292         usage(argv[0], "No pattern file provided");
293         exit(1);
294     }
295     if (dumpbase.size() == 0) {
296         usage(argv[0], "No output directory provided");
297         exit(1);
298     }
299 }
300 
301 static
dumpDb(const struct hs_database * out,const Grey & grey)302 void dumpDb(const struct hs_database *out, const Grey &grey) {
303     char *bytes = nullptr;
304     size_t len = 0;
305     hs_error_t err = hs_serialize_database(out, &bytes, &len);
306     if (err != HS_SUCCESS) {
307         printf("ERROR: hs_serialize_database() failed with error %u\n", err);
308         return;
309     }
310 
311     FILE *f = fopen((grey.dumpPath + "db.raw").c_str(), "w");
312     if (!f) {
313         printf("ERROR: unable to write database out: %s", strerror(errno));
314     } else {
315         fwrite(bytes, 1, len, f);
316         fclose(f);
317     }
318     free(bytes);
319 }
320 
321 static
buildDumpFlags(void)322 u32 buildDumpFlags(void) {
323     u32 flags = 0;
324     flags |= Grey::DUMP_BASICS;
325     flags |= Grey::DUMP_IMPL;
326 
327     if (dump_intermediate) {
328         flags |= Grey::DUMP_PARSE;
329         flags |= Grey::DUMP_INT_GRAPH;
330     }
331 
332     return flags;
333 }
334 
335 #ifndef _WIN32
/**
 * \brief Remove every entry directly inside \p path (POSIX version).
 *
 * Exits with status 1 if the directory cannot be opened. An entry that
 * cannot be unlinked is reported on stdout and skipped.
 *
 * \param path  Directory to empty.
 */
static
void clearDir(const string &path) {
    DIR *handle = opendir(path.c_str());
    if (handle == nullptr) {
        printf("ERROR: couldn't open location %s: %s\n", path.c_str(),
               strerror(errno));
        exit(1);
    }

    for (struct dirent *entry = readdir(handle); entry != nullptr;
         entry = readdir(handle)) {
        const string leaf(entry->d_name);
        // Leave the self and parent links alone.
        if (leaf != "." && leaf != "..") {
            const string victim = path + '/' + leaf;
            if (unlink(victim.c_str()) < 0) {
                printf("ERROR: couldn't remove file %s: %s\n", victim.c_str(),
                       strerror(errno));
            }
        }
    }
    closedir(handle);
}
359 #else // windows
360 static
clearDir(const string & path)361 void clearDir(const string &path) {
362     WIN32_FIND_DATA ffd;
363     HANDLE hFind = INVALID_HANDLE_VALUE;
364     string glob = path + "/*";
365     hFind = FindFirstFile(glob.c_str(), &ffd);
366     if (hFind == INVALID_HANDLE_VALUE) {
367         printf("ERROR: couldn't open location %s\n", path.c_str());
368         exit(1);
369     }
370     do {
371         string basename(ffd.cFileName);
372         string fname(path);
373         fname.push_back('/');
374         fname.append(basename);
375 
376         // Ignore '.' and '..'
377         if (basename == "." || basename == "..") {
378             continue;
379         }
380 
381         if (!DeleteFile(fname.c_str())) {
382             printf("ERROR: couldn't remove file %s\n", fname.c_str());
383         }
384 
385     } while (FindNextFile(hFind, &ffd) != 0);
386     FindClose(hFind);
387 }
388 #endif
389 
/**
 * \brief Create a single directory.
 *
 * On non-Windows builds the directory is created with owner rwx and
 * group/other r-x permission bits (before the process umask is applied).
 *
 * \param dirName  Path of the directory to create.
 * \return The result of mkdir()/_mkdir(); callers treat a negative value as
 *         failure.
 */
static
int makeDirectory(const string &dirName) {
#ifdef _WIN32
    return _mkdir(dirName.c_str());
#else
    const mode_t perms = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
    return mkdir(dirName.c_str(), perms);
#endif
}
400 
401 static
prepareDumpLoc(string parent,string path,u32 flags,Grey & grey)402 void prepareDumpLoc(string parent, string path, u32 flags, Grey &grey) {
403     struct stat st;
404     if (stat(parent.c_str(), &st)) {
405         // Create dump location if not found
406         if (makeDirectory(parent) < 0) {
407             printf("ERROR: could not create dump location %s: %s\n",
408                    parent.c_str(), strerror(errno));
409             exit(1);
410         }
411     }
412 
413     // If not separator terminated, add separator
414     if (parent.back() != '/') {
415         parent.push_back('/');
416     }
417 
418     // Append path to parent
419     path = parent.append(path);
420     if (stat(path.c_str(), &st)) {
421         // Create dump location if not found
422         if (makeDirectory(path) < 0) {
423             printf("ERROR: could not create dump location %s: %s\n",
424                    path.c_str(), strerror(errno));
425             exit(1);
426         }
427     }
428 
429     // remove anything in the dump dir - most likely stale
430     clearDir(path);
431 
432     // If not separator terminated, add separator
433     if (path.back() != '/') {
434         path.push_back('/');
435     }
436 
437     grey.dumpPath = path;
438     grey.dumpFlags = flags;
439 }
440 
441 static
buildMode()442 unsigned buildMode() {
443     unsigned mode = 0;
444     if (streaming) {
445         mode |= HS_MODE_STREAM;
446         mode |= somPrecisionMode;
447         assert(!vectored);
448     } else if (vectored) {
449         mode |= HS_MODE_VECTORED;
450     } else {
451         mode |= HS_MODE_BLOCK;
452     }
453 
454     return mode;
455 }
456 
457 static
dumpScratch(const hs_database_t * db,const Grey & grey)458 void dumpScratch(const hs_database_t *db, const Grey &grey) {
459     hs_scratch_t *scratch = nullptr;
460     hs_error_t err = hs_alloc_scratch(db, &scratch);
461     if (err == HS_SUCCESS) {
462         FILE *f = fopen((grey.dumpPath + "scratch.txt").c_str(), "w");
463         if (f) {
464             dumpScratch(scratch, f);
465             fclose(f);
466         } else {
467             printf("ERROR: could not open %s: %s\n",
468                    (grey.dumpPath + "scratch.txt").c_str(), strerror(errno));
469         }
470     } else {
471         printf("ERROR: hs_alloc_scratch() failed with error %u\n", err);
472     }
473     hs_free_scratch(scratch);
474 }
475 
476 static
dumpInfo(const hs_database_t * db,const Grey & grey)477 void dumpInfo(const hs_database_t *db, const Grey &grey) {
478     char *info = nullptr;
479     hs_error_t err = hs_database_info(db, &info);
480     if (err == HS_SUCCESS) {
481         FILE *f = fopen((grey.dumpPath + "db_info.txt").c_str(), "w");
482         if (f) {
483             fprintf(f, "%s\n", info);
484             fclose(f);
485         } else {
486             printf("ERROR: could not open %s: %s\n",
487                    (grey.dumpPath + "db_info.txt").c_str(), strerror(errno));
488         }
489     } else {
490         printf("ERROR: hs_database_info() failed with error %u\n", err);
491     }
492     free(info);
493 }
494 
495 static
dumpDataMulti(const vector<const char * > & patterns,const vector<unsigned> & flags,const vector<unsigned> & ids,ptr_vector<hs_expr_ext> & ext,const Grey & grey)496 unsigned int dumpDataMulti(const vector<const char *> &patterns,
497                            const vector<unsigned> &flags,
498                            const vector<unsigned> &ids,
499                            ptr_vector<hs_expr_ext> &ext,
500                            const Grey &grey) {
501     unsigned mode = buildMode();
502 
503     printf("Compiling %zu patterns.\n", patterns.size());
504 
505     hs_database_t *db = nullptr;
506     hs_compile_error_t *compile_err;
507 
508     hs_error_t err;
509     const size_t count = patterns.size();
510     if (use_literal_api) {
511         // Compute length of each pattern.
512         vector<size_t> lens(count);
513         for (unsigned int i = 0; i < count; i++) {
514             lens[i] = strlen(patterns[i]);
515         }
516         err = hs_compile_lit_multi_int(patterns.data(), flags.data(),
517                                        ids.data(), ext.c_array(), lens.data(),
518                                        count, mode, plat_info.get(), &db,
519                                        &compile_err, grey);
520     } else {
521         err = hs_compile_multi_int(patterns.data(), flags.data(), ids.data(),
522                                    ext.c_array(), count, mode, plat_info.get(),
523                                    &db, &compile_err, grey);
524     }
525 
526     if (err != HS_SUCCESS) {
527         if (compile_err && compile_err->message) {
528             printf("ERROR: Compile failed: %s\n", compile_err->message);
529         } else {
530             printf("ERROR: hs_compile_multi_int() returned error %u", err);
531         }
532         hs_free_compile_error(compile_err);
533         return 1;
534     }
535 
536     assert(db);
537     dumpScratch(db, grey);
538     dumpInfo(db, grey);
539 
540     if (dump_db) {
541         dumpDb(db, grey);
542     }
543 
544     hs_free_database(db);
545     return 0;
546 }
547 
548 static
dumpData(const ExpressionMap & exprMap,Grey & grey)549 unsigned int dumpData(const ExpressionMap &exprMap, Grey &grey) {
550     u32 dump_flags = buildDumpFlags();
551     string path = "dump";
552     prepareDumpLoc(dumpbase, path, dump_flags, grey);
553     printf("Dumping data for all patterns in '%s' to '%s/%s'\n",
554            patternfile.c_str(), dumpbase.c_str(), path.c_str());
555 
556     string pat_name = grey.dumpPath + "patterns.txt";
557     FILE *pat_out = fopen(pat_name.c_str(), "w");
558     if (!pat_out) {
559         printf("ERROR: unable to open %s\n", pat_name.c_str());
560         return 1;
561     }
562 
563     const size_t numPatterns = exprMap.size();
564     vector<string> expressions(numPatterns);
565     vector<unsigned> ids(numPatterns);
566     vector<unsigned> flags(numPatterns);
567     ptr_vector<hs_expr_ext> ext;
568     ext.reserve(numPatterns);
569 
570     size_t n = 0;
571     for (const auto &elem : exprMap) {
572         const auto &id = elem.first;
573         const auto &regex = elem.second;
574         if (echoSigs) {
575             printf("%u:%s\n", id, regex.c_str());
576         }
577         fprintf(pat_out, "%u:%s\n", id, regex.c_str());
578 
579         ext.push_back(new hs_expr_ext);
580         ids[n] = id;
581         if (!readExpression(regex, expressions[n], &flags[n], &ext[n])) {
582             printf("ERROR: failed to parse expr: %s (id %u)\n",
583                    regex.c_str(), id);
584             fclose(pat_out);
585             return 1;
586         }
587 
588         if (force_edit_distance) {
589             ext[n].flags |= HS_EXT_FLAG_EDIT_DISTANCE;
590             ext[n].edit_distance = edit_distance;
591         }
592 
593         flags[n] |= somFlags;
594         if (force_utf8) {
595             flags[n] |= HS_FLAG_UTF8;
596         }
597         if (force_prefilter) {
598             flags[n] |= HS_FLAG_PREFILTER;
599         }
600 
601         n++;
602     }
603     assert(n);
604 
605     // Our compiler takes an array of plain ol' C strings.
606     vector<const char *> patterns(n);
607     for (size_t i = 0; i < n; i++) {
608         patterns[i] = expressions[i].c_str();
609     }
610 
611     fclose(pat_out);
612     return dumpDataMulti(patterns, flags, ids, ext, grey);
613 }
614 
main(int argc,char * argv[])615 int HS_CDECL main(int argc, char *argv[]) {
616     Grey grey;
617     grey.dumpFlags = Grey::DUMP_BASICS;
618 
619     processArgs(argc, argv, grey);
620 
621     // Load patterns
622     ExpressionMap exprMap;
623     loadExpressions(patternfile, exprMap);
624 
625     if (!signatureFile.empty()) {
626         SignatureSet sigs;
627         loadSignatureList(signatureFile, sigs);
628         exprMap = limitToSignatures(exprMap, sigs);
629     }
630 
631     if (singleId) {
632         exprMap = limitToSignatures(exprMap, {onlyId});
633     }
634 
635     if (exprMap.empty()) {
636         printf("No signatures.\n");
637         return 1;
638     }
639 
640     return dumpData(exprMap, grey);
641 }
642