1 /*
2 * Copyright (c) 2015-2019, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /**
30 * \file
31 * \brief Hyperscan compile dump tool
32 *
33 * Given a set of patterns, dump all available data from the compilation
34 * process into a directory. This tool is intended to assist Hyperscan
 * developers with development and debugging by providing insights into the
36 * built bytecode.
37 *
38 * Note: requires that hyperscan is built with DUMP_SUPPORT enabled.
39 */
40
41 #include "config.h"
42
43 #include "cross_compile.h"
44 #include "ExpressionParser.h"
45 #include "expressions.h"
46 #include "expression_path.h"
47 #include "string_util.h"
48
49 #include "grey.h"
50 #include "hs_compile.h"
51 #include "hs_internal.h"
52 #include "scratch_dump.h"
53
54 #include <cassert>
55 #include <cerrno>
56 #include <cstdio>
57 #include <cstdlib>
58 #include <string>
59 #include <vector>
60
61 #ifndef _WIN32
62 #include <getopt.h>
63 #else
64 #include "win_getopt.h"
65 #endif
66 #include <sys/stat.h>
67
68 #ifndef _WIN32
69 #include <dirent.h>
70 #else
71 #include <direct.h>
72 #define stat _stat
73 #endif
74
75 #include <boost/ptr_container/ptr_vector.hpp>
76
77 using namespace std;
78 using namespace ue2;
79 using boost::ptr_vector;
80
81 namespace /* anonymous */ {
82
83 // Input pattern file
84 string patternfile;
85 // Output path
86 string dumpbase(".");
87 // Compile with streaming
88 bool streaming = true;
89 bool vectored = false;
90
91 bool echoSigs = false;
92 bool dump_db = false;
93 bool force_utf8 = false;
94 int force_prefilter = 0;
95
96 unsigned int onlyId;
97 u32 somFlags = 0;
98 unsigned somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE;
99
100 bool singleId = false;
101 string signatureFile;
102
103 unique_ptr<hs_platform_info> plat_info;
104
105 bool dump_intermediate = true;
106 bool force_edit_distance = false;
107 u32 edit_distance = 0;
108
109 int use_literal_api = 0;
110
111 } // namespace
112
// Print the command-line help to stdout.
//
// name  - program name substituted into the usage and example lines.
// error - optional message appended as "Error: ..."; pass nullptr for none.
static
void usage(const char *name, const char *error) {
    // Fixed help text between the "Usage:" header and the example command.
    static const char *const helpText[] = {
        "Options:\n\n",
        " -h Display help and exit.\n",
        " -G OVERRIDES Overrides for the grey box.\n",
        " -e PATH Path to expression directory or file.\n",
        " -s FILE Signature file to use.\n",
        " -z NUM Signature ID to use.\n",
        " -N, --block Compile in block mode"
        " (default: streaming).\n",
        " -V, --vectored Compile in vectored mode"
        " (default: streaming).\n",
        " -o, --output PATH\n",
        " Use data dump directory PATH (default: dump).\n",
        " WARNING: existing files in output directory are"
        " deleted.\n",
        " -x NAME Cross-compile for arch NAME\n",
        " -D, --dump_db Dump the final database.\n",
        " -P, --print Echo signature set to stdout.\n",
        " -X, --no_intermediate\n",
        " Do not dump intermediate data.\n",
        "\n",
        "Pattern flags:\n",
        " -d NUMBER Set SOM precision mode (default: 8 (large)).\n",
        " -E DISTANCE Match all patterns within edit distance"
        " DISTANCE.\n",
        " -8 Force UTF8 mode on all patterns.\n",
        " -L Apply HS_FLAG_SOM_LEFTMOST to all patterns.\n",
        " --prefilter Apply HS_FLAG_PREFILTER to all patterns.\n",
        " --literal-on Use Hyperscan pure literal matching API.\n",
        "\n",
        "Example:\n",
    };

    printf("Usage: %s [OPTIONS...]\n\n", name);
    for (const char *chunk : helpText) {
        fputs(chunk, stdout);
    }
    printf("$ %s -e pattern.file -s sigfile\n", name);
    fputs("\n", stdout);

    if (error != nullptr) {
        printf("Error: %s\n", error);
    }
}
154
155 static
processArgs(int argc,char * argv[],Grey & grey)156 void processArgs(int argc, char *argv[], Grey &grey) {
157 static const char *options = "d:De:E:G:hLNo:Ps:VXx:z:8";
158 static struct option longOptions[] = {
159 {"dump_db", no_argument, nullptr, 'D'},
160 {"help", no_argument, nullptr, 'h'},
161 {"output", required_argument, nullptr, 'o'},
162 {"block", no_argument, nullptr, 'N'},
163 {"no_intermediate", no_argument, nullptr, 'X'},
164 {"vectored", no_argument, nullptr, 'V'},
165 {"print", no_argument, nullptr, 'P'},
166 {"utf8", no_argument, nullptr, '8'},
167 {"prefilter", no_argument, &force_prefilter, 1},
168 {"som-width", required_argument, nullptr, 'd'},
169 {"literal-on", no_argument, &use_literal_api, 1},
170 {nullptr, 0, nullptr, 0}
171 };
172
173 for (;;) {
174 int c = getopt_long(argc, argv, options, longOptions, nullptr);
175
176 if (c < 0) {
177 break;
178 }
179 switch (c) {
180 case 'D':
181 dump_db = true;
182 break;
183
184 case 'd': {
185 unsigned dist;
186 if (!fromString(optarg, dist)) {
187 usage(argv[0], "Must provide an integer argument to '-d' flag");
188 exit(1);
189 }
190 switch (dist) {
191 case 2:
192 somPrecisionMode = HS_MODE_SOM_HORIZON_SMALL;
193 break;
194 case 4:
195 somPrecisionMode = HS_MODE_SOM_HORIZON_MEDIUM;
196 break;
197 case 8:
198 somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE;
199 break;
200 default:
201 usage(argv[0], "SOM precision must be 2, 4 or 8");
202 exit(1);
203 }
204 break;
205 }
206
207 case 'h':
208 usage(argv[0], nullptr);
209 exit(0);
210
211 case 'e':
212 patternfile = optarg;
213 break;
214
215 #ifndef RELEASE_BUILD
216 case 'G':
217 applyGreyOverrides(&grey, string(optarg));
218 break;
219 #endif
220
221 case 'L':
222 somFlags |= HS_FLAG_SOM_LEFTMOST;
223 break;
224
225 case 'o':
226 dumpbase = optarg;
227 break;
228
229 case 'P':
230 echoSigs = true;
231 break;
232
233 case 's':
234 signatureFile.assign(optarg);
235 break;
236
237 case 'N':
238 streaming = false;
239 break;
240
241 case 'V':
242 streaming = false;
243 vectored = true;
244 break;
245
246 case 'X':
247 dump_intermediate = false;
248 break;
249
250 case 'x':
251 plat_info = xcompileReadMode(optarg);
252 if (!plat_info) {
253 usage(argv[0], xcompileUsage().c_str());
254 exit(1);
255 }
256 break;
257
258 case 'z':
259 if (!fromString(optarg, onlyId)) {
260 usage(argv[0], "Argument to '-z' flag must be an integer");
261 exit(1);
262 }
263 singleId = true;
264 break;
265 case 'E': {
266 u32 dist;
267 if (!fromString(optarg, dist)) {
268 usage(argv[0], "Argument to '-E' flag must be an integer");
269 exit(1);
270 }
271 force_edit_distance = true;
272 edit_distance = dist;
273 break;
274 }
275 case '8':
276 force_utf8 = true;
277 break;
278 case 0:
279 break;
280 default:
281 usage(argv[0], "");
282 exit(1);
283 }
284 }
285
286 if (patternfile.empty() && !signatureFile.empty()) {
287 /* attempt to infer an expression directory */
288 patternfile = inferExpressionPath(signatureFile);
289 }
290
291 if (patternfile.size() == 0) {
292 usage(argv[0], "No pattern file provided");
293 exit(1);
294 }
295 if (dumpbase.size() == 0) {
296 usage(argv[0], "No output directory provided");
297 exit(1);
298 }
299 }
300
301 static
dumpDb(const struct hs_database * out,const Grey & grey)302 void dumpDb(const struct hs_database *out, const Grey &grey) {
303 char *bytes = nullptr;
304 size_t len = 0;
305 hs_error_t err = hs_serialize_database(out, &bytes, &len);
306 if (err != HS_SUCCESS) {
307 printf("ERROR: hs_serialize_database() failed with error %u\n", err);
308 return;
309 }
310
311 FILE *f = fopen((grey.dumpPath + "db.raw").c_str(), "w");
312 if (!f) {
313 printf("ERROR: unable to write database out: %s", strerror(errno));
314 } else {
315 fwrite(bytes, 1, len, f);
316 fclose(f);
317 }
318 free(bytes);
319 }
320
321 static
buildDumpFlags(void)322 u32 buildDumpFlags(void) {
323 u32 flags = 0;
324 flags |= Grey::DUMP_BASICS;
325 flags |= Grey::DUMP_IMPL;
326
327 if (dump_intermediate) {
328 flags |= Grey::DUMP_PARSE;
329 flags |= Grey::DUMP_INT_GRAPH;
330 }
331
332 return flags;
333 }
334
335 #ifndef _WIN32
// Delete every entry directly inside `path` (no recursion).
//
// Exits with status 1 if the directory cannot be opened. A failure to remove
// an individual entry is reported on stdout and the scan continues.
static
void clearDir(const string &path) {
    DIR *handle = opendir(path.c_str());
    if (handle == nullptr) {
        printf("ERROR: couldn't open location %s: %s\n", path.c_str(),
               strerror(errno));
        exit(1);
    }

    for (struct dirent *entry = readdir(handle); entry != nullptr;
         entry = readdir(handle)) {
        const string entryName(entry->d_name);
        const bool selfOrParent = entryName == "." || entryName == "..";
        if (!selfOrParent) {
            const string victim = path + '/' + entryName;
            if (unlink(victim.c_str()) < 0) {
                printf("ERROR: couldn't remove file %s: %s\n", victim.c_str(),
                       strerror(errno));
            }
        }
    }
    closedir(handle);
}
359 #else // windows
360 static
clearDir(const string & path)361 void clearDir(const string &path) {
362 WIN32_FIND_DATA ffd;
363 HANDLE hFind = INVALID_HANDLE_VALUE;
364 string glob = path + "/*";
365 hFind = FindFirstFile(glob.c_str(), &ffd);
366 if (hFind == INVALID_HANDLE_VALUE) {
367 printf("ERROR: couldn't open location %s\n", path.c_str());
368 exit(1);
369 }
370 do {
371 string basename(ffd.cFileName);
372 string fname(path);
373 fname.push_back('/');
374 fname.append(basename);
375
376 // Ignore '.' and '..'
377 if (basename == "." || basename == "..") {
378 continue;
379 }
380
381 if (!DeleteFile(fname.c_str())) {
382 printf("ERROR: couldn't remove file %s\n", fname.c_str());
383 }
384
385 } while (FindNextFile(hFind, &ffd) != 0);
386 FindClose(hFind);
387 }
388 #endif
389
// Create the directory `dirName`.
//
// Returns the result of the platform mkdir call unchanged; callers in this
// file treat a negative value as failure.
static
int makeDirectory(const string &dirName) {
    const char *target = dirName.c_str();
#ifdef _WIN32
    return _mkdir(target);
#else
    // rwx for the owner, r-x for group and others.
    const mode_t perms = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
    return mkdir(target, perms);
#endif
}
400
401 static
prepareDumpLoc(string parent,string path,u32 flags,Grey & grey)402 void prepareDumpLoc(string parent, string path, u32 flags, Grey &grey) {
403 struct stat st;
404 if (stat(parent.c_str(), &st)) {
405 // Create dump location if not found
406 if (makeDirectory(parent) < 0) {
407 printf("ERROR: could not create dump location %s: %s\n",
408 parent.c_str(), strerror(errno));
409 exit(1);
410 }
411 }
412
413 // If not separator terminated, add separator
414 if (parent.back() != '/') {
415 parent.push_back('/');
416 }
417
418 // Append path to parent
419 path = parent.append(path);
420 if (stat(path.c_str(), &st)) {
421 // Create dump location if not found
422 if (makeDirectory(path) < 0) {
423 printf("ERROR: could not create dump location %s: %s\n",
424 path.c_str(), strerror(errno));
425 exit(1);
426 }
427 }
428
429 // remove anything in the dump dir - most likely stale
430 clearDir(path);
431
432 // If not separator terminated, add separator
433 if (path.back() != '/') {
434 path.push_back('/');
435 }
436
437 grey.dumpPath = path;
438 grey.dumpFlags = flags;
439 }
440
441 static
buildMode()442 unsigned buildMode() {
443 unsigned mode = 0;
444 if (streaming) {
445 mode |= HS_MODE_STREAM;
446 mode |= somPrecisionMode;
447 assert(!vectored);
448 } else if (vectored) {
449 mode |= HS_MODE_VECTORED;
450 } else {
451 mode |= HS_MODE_BLOCK;
452 }
453
454 return mode;
455 }
456
457 static
dumpScratch(const hs_database_t * db,const Grey & grey)458 void dumpScratch(const hs_database_t *db, const Grey &grey) {
459 hs_scratch_t *scratch = nullptr;
460 hs_error_t err = hs_alloc_scratch(db, &scratch);
461 if (err == HS_SUCCESS) {
462 FILE *f = fopen((grey.dumpPath + "scratch.txt").c_str(), "w");
463 if (f) {
464 dumpScratch(scratch, f);
465 fclose(f);
466 } else {
467 printf("ERROR: could not open %s: %s\n",
468 (grey.dumpPath + "scratch.txt").c_str(), strerror(errno));
469 }
470 } else {
471 printf("ERROR: hs_alloc_scratch() failed with error %u\n", err);
472 }
473 hs_free_scratch(scratch);
474 }
475
476 static
dumpInfo(const hs_database_t * db,const Grey & grey)477 void dumpInfo(const hs_database_t *db, const Grey &grey) {
478 char *info = nullptr;
479 hs_error_t err = hs_database_info(db, &info);
480 if (err == HS_SUCCESS) {
481 FILE *f = fopen((grey.dumpPath + "db_info.txt").c_str(), "w");
482 if (f) {
483 fprintf(f, "%s\n", info);
484 fclose(f);
485 } else {
486 printf("ERROR: could not open %s: %s\n",
487 (grey.dumpPath + "db_info.txt").c_str(), strerror(errno));
488 }
489 } else {
490 printf("ERROR: hs_database_info() failed with error %u\n", err);
491 }
492 free(info);
493 }
494
495 static
dumpDataMulti(const vector<const char * > & patterns,const vector<unsigned> & flags,const vector<unsigned> & ids,ptr_vector<hs_expr_ext> & ext,const Grey & grey)496 unsigned int dumpDataMulti(const vector<const char *> &patterns,
497 const vector<unsigned> &flags,
498 const vector<unsigned> &ids,
499 ptr_vector<hs_expr_ext> &ext,
500 const Grey &grey) {
501 unsigned mode = buildMode();
502
503 printf("Compiling %zu patterns.\n", patterns.size());
504
505 hs_database_t *db = nullptr;
506 hs_compile_error_t *compile_err;
507
508 hs_error_t err;
509 const size_t count = patterns.size();
510 if (use_literal_api) {
511 // Compute length of each pattern.
512 vector<size_t> lens(count);
513 for (unsigned int i = 0; i < count; i++) {
514 lens[i] = strlen(patterns[i]);
515 }
516 err = hs_compile_lit_multi_int(patterns.data(), flags.data(),
517 ids.data(), ext.c_array(), lens.data(),
518 count, mode, plat_info.get(), &db,
519 &compile_err, grey);
520 } else {
521 err = hs_compile_multi_int(patterns.data(), flags.data(), ids.data(),
522 ext.c_array(), count, mode, plat_info.get(),
523 &db, &compile_err, grey);
524 }
525
526 if (err != HS_SUCCESS) {
527 if (compile_err && compile_err->message) {
528 printf("ERROR: Compile failed: %s\n", compile_err->message);
529 } else {
530 printf("ERROR: hs_compile_multi_int() returned error %u", err);
531 }
532 hs_free_compile_error(compile_err);
533 return 1;
534 }
535
536 assert(db);
537 dumpScratch(db, grey);
538 dumpInfo(db, grey);
539
540 if (dump_db) {
541 dumpDb(db, grey);
542 }
543
544 hs_free_database(db);
545 return 0;
546 }
547
548 static
dumpData(const ExpressionMap & exprMap,Grey & grey)549 unsigned int dumpData(const ExpressionMap &exprMap, Grey &grey) {
550 u32 dump_flags = buildDumpFlags();
551 string path = "dump";
552 prepareDumpLoc(dumpbase, path, dump_flags, grey);
553 printf("Dumping data for all patterns in '%s' to '%s/%s'\n",
554 patternfile.c_str(), dumpbase.c_str(), path.c_str());
555
556 string pat_name = grey.dumpPath + "patterns.txt";
557 FILE *pat_out = fopen(pat_name.c_str(), "w");
558 if (!pat_out) {
559 printf("ERROR: unable to open %s\n", pat_name.c_str());
560 return 1;
561 }
562
563 const size_t numPatterns = exprMap.size();
564 vector<string> expressions(numPatterns);
565 vector<unsigned> ids(numPatterns);
566 vector<unsigned> flags(numPatterns);
567 ptr_vector<hs_expr_ext> ext;
568 ext.reserve(numPatterns);
569
570 size_t n = 0;
571 for (const auto &elem : exprMap) {
572 const auto &id = elem.first;
573 const auto ®ex = elem.second;
574 if (echoSigs) {
575 printf("%u:%s\n", id, regex.c_str());
576 }
577 fprintf(pat_out, "%u:%s\n", id, regex.c_str());
578
579 ext.push_back(new hs_expr_ext);
580 ids[n] = id;
581 if (!readExpression(regex, expressions[n], &flags[n], &ext[n])) {
582 printf("ERROR: failed to parse expr: %s (id %u)\n",
583 regex.c_str(), id);
584 fclose(pat_out);
585 return 1;
586 }
587
588 if (force_edit_distance) {
589 ext[n].flags |= HS_EXT_FLAG_EDIT_DISTANCE;
590 ext[n].edit_distance = edit_distance;
591 }
592
593 flags[n] |= somFlags;
594 if (force_utf8) {
595 flags[n] |= HS_FLAG_UTF8;
596 }
597 if (force_prefilter) {
598 flags[n] |= HS_FLAG_PREFILTER;
599 }
600
601 n++;
602 }
603 assert(n);
604
605 // Our compiler takes an array of plain ol' C strings.
606 vector<const char *> patterns(n);
607 for (size_t i = 0; i < n; i++) {
608 patterns[i] = expressions[i].c_str();
609 }
610
611 fclose(pat_out);
612 return dumpDataMulti(patterns, flags, ids, ext, grey);
613 }
614
main(int argc,char * argv[])615 int HS_CDECL main(int argc, char *argv[]) {
616 Grey grey;
617 grey.dumpFlags = Grey::DUMP_BASICS;
618
619 processArgs(argc, argv, grey);
620
621 // Load patterns
622 ExpressionMap exprMap;
623 loadExpressions(patternfile, exprMap);
624
625 if (!signatureFile.empty()) {
626 SignatureSet sigs;
627 loadSignatureList(signatureFile, sigs);
628 exprMap = limitToSignatures(exprMap, sigs);
629 }
630
631 if (singleId) {
632 exprMap = limitToSignatures(exprMap, {onlyId});
633 }
634
635 if (exprMap.empty()) {
636 printf("No signatures.\n");
637 return 1;
638 }
639
640 return dumpData(exprMap, grey);
641 }
642