1 /*
2 * Copyright (c) 2015-2019, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 #include "config.h"
30
31 #include "BoundedQueue.h"
32 #include "DatabaseProxy.h"
33 #include "FileCorpora.h"
34 #include "GraphTruth.h"
35 #include "GroundTruth.h"
36 #include "NfaGeneratedCorpora.h"
37 #include "Thread.h"
38 #include "UltimateTruth.h"
39 #include "args.h"
40 #include "common.h"
41 #include "cross_compile.h"
42 #include "expressions.h"
43 #include "limit.h"
44 #include "ng_corpus_properties.h"
45 #include "sig.h"
46 #include "simple_timer.h"
47 #include "util/expression_path.h"
48 #include "util/string_util.h"
49
50 #include "grey.h"
51 #include "hs.h"
52 #include "parser/utf8_validate.h"
53 #include "ue2common.h"
54 #include "util/container.h"
55 #include "util/make_unique.h"
56
57 #include <algorithm>
58 #include <cassert>
59 #include <cctype>
60 #include <cstdio>
61 #include <cstdlib>
62 #include <fstream>
63 #include <iostream>
64 #include <iterator>
65 #include <mutex>
66 #include <queue>
67 #include <string>
68 #include <thread>
69 #include <vector>
70
71 #include <errno.h>
72 #include <time.h>
73 #include <sys/types.h>
74 #include <sys/stat.h>
75
76 using namespace std;
77 using namespace ue2;
78
// Run-time configuration shared across the collider. The initialisers below
// are the built-in defaults; setDefaults() adjusts a few of them at startup.
// NOTE(review): the remainder are presumably overwritten by command-line
// parsing elsewhere -- confirm against the argument handling code.

unsigned int numThreads = 1; // setDefaults() raises this to 2x the core count
unsigned int numScannerThreads = 1;
unsigned int numGeneratorThreads = 1;
enum ColliderMode colliderMode = MODE_BLOCK; // scan mode, reported by printMode()
bool echo_matches = false;
int g_quiet = 0; // non-zero hides per-failure messages; >1 picks the terse summary
bool g_verbose = false; // emit a PASSED/FAILED line for every alignment
bool g_allSignatures = false;
string g_exprPath;
vector<string> g_signatureFiles;
string g_cmdline; // echoed by exit_with_fail()
bool g_ue2CompileAll = false; // printMode(): false => "Single", true => "Multi"
unsigned g_streamBlocks = 0; // printed after "Streaming-"/"Vectored-" by printMode()
unsigned long long g_streamOffset = 0; // non-zero disables reuse of stored corpus matches
unsigned multicompile_bands = 0; // non-zero is printed as "Multi-<n>" by printMode()
vector<unsigned> g_signatures;
unsigned long int g_matchLimit = DEFAULT_PCRE_MATCH_LIMIT;
unsigned long int g_matchLimitRecursion = DEFAULT_PCRE_MATCH_RECURSION_LIMIT;
string g_corpora_prefix; // prepended to each corpus just before it is scanned
string g_corpora_suffix; // appended to each corpus just before it is scanned
size_t g_memoryLimit = 1000; // megabytes per thread
unsigned int somFlags = 0;
bool loadDatabases = false; // constructDatabase() loads from disk instead of compiling
bool saveDatabases = false; // constructDatabase() writes compiled databases to disk
bool saveCorpora = false; // corpora with their match offsets go to corporaOut
string saveCorporaFile;
string serializePath;
bool force_utf8 = false; // when set, stored corpus matches are not trusted
int force_prefilter = 0;
int no_groups = 0;
unsigned somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE;
unsigned limit_matches = 0; // 0 means no match limit in the result checks
unsigned randomSeed = 0; // seeded from the wall clock by setDefaults()
bool use_random_alignment = false;
bool use_PCRE = true;
bool use_NFA = true;
bool use_UE2 = true;
bool use_copy_scratch = false;
bool use_copy_stream = false;
bool use_mangle_scratch = false;
bool use_compress_expand = false;
bool use_compress_reset_expand = false;
bool use_literal_api = false;
int abort_on_failure = 0; // exit_with_fail() calls abort() rather than exit(1)
int no_signal_handler = 0; // forced to 1 by setDefaults() in ASan builds
size_t max_scan_queue_len = 25000;
size_t max_generator_queue_len = 25000;
bool force_edit_distance = false; // when set, stored corpus matches are not trusted
unsigned edit_distance = 0;
CorpusProperties corpus_gen_prop;

// Semi constants
// UE2 is scanned at every alignment in [min_ue2_align, max_ue2_align).
unsigned min_ue2_align = 0;
unsigned max_ue2_align = MAX_MAX_UE2_ALIGN;

// When defined, duplicate UE2 matches are listed by printDifference() and
// counted as a failure by checkTestResults().
#define DEDUPE_MATCHES
135
// Returns the hardware thread count reported by the standard library, or 1
// when the library reports 0 (i.e. the value could not be determined).
static
unsigned countCores() {
    const unsigned detected = std::thread::hardware_concurrency();
    if (detected == 0) {
        return 1;
    }
    return detected;
}
141
// Detect the Address Sanitizer with either GCC or Clang, defining
// BUILT_WITH_ASAN when it is active (consumed by setDefaults()).
//  - __SANITIZE_ADDRESS__ is the predefined macro checked first.
//  - Otherwise, where the compiler provides __has_feature, the
//    address_sanitizer feature is queried. That query lives in a nested #if so
//    that it is only evaluated once __has_feature is known to exist.
#if defined(__SANITIZE_ADDRESS__)
#  define BUILT_WITH_ASAN
#elif defined(__has_feature)
#  if __has_feature(address_sanitizer)
#    define BUILT_WITH_ASAN
#  endif
#endif
150
151 // Set the default params that can be overridden with commandline args
152 static
setDefaults()153 void setDefaults() {
154 // Seed random number generator for corpora
155 randomSeed = time(nullptr);
156 // Overcommit since we have generators and scanners running.
157 numThreads = countCores() * 2;
158
159 #ifdef BUILT_WITH_ASAN
160 cout << "NOTE: Built with AddressSanitizer.\n"
161 << "Defaulting to no memory limit and no signal handler.\n"
162 << endl;
163 g_memoryLimit = 0;
164 no_signal_handler = 1;
165 #endif
166 }
167
168 static
exit_with_fail(void)169 void exit_with_fail(void) {
170 cout << "Failing cmdline was:\n " << g_cmdline << endl;
171 if (abort_on_failure) {
172 cout << "Calling abort()" << endl;
173 abort();
174 }
175 exit(1);
176 }
177
178 namespace /* anonymous */ {
179
// Sink for corpora saved via the -w flag. Writes can arrive from several
// threads, so every write is serialised through a mutex.
class CorpusWriter {
public:
    // Opens the named file for output, truncating any existing content.
    explicit CorpusWriter(const std::string &filename)
        : out(filename.c_str(), std::ios_base::trunc) {}

    // Writes str and flushes the stream while holding the lock.
    void write(const std::string &str) {
        std::lock_guard<std::mutex> guard(mutex);
        out << str;
        out.flush();
    }

private:
    std::ofstream out;
    std::mutex mutex;
};

// Writer used by writeCorpus(), which asserts that it has been set. Starts
// out null.
std::unique_ptr<CorpusWriter> corporaOut;
198
// Counters and id sets describing the outcome of a run. Instances can be
// combined with merge().
struct TestSummary {
    unsigned totalCorpora = 0;
    unsigned totalExpressions = 0;
    unsigned failCorpora = 0;
    unsigned failPcreCompile = 0;
    unsigned failNGCompile = 0;
    unsigned failUe2Compile = 0;
    unsigned failCompileDifference = 0; // failed in pcre but not ue2
    unsigned failPcreScan = 0;
    unsigned failNGScan = 0;
    unsigned failUe2Scan = 0;
    unsigned failDiff = 0;
    unsigned failNoGroundTruth = 0;
    std::set<unsigned> failIds; // ids of expressions with failures
    std::set<unsigned> nogtIds; // ids of expressions lacking ground truth

    // A run has failed if there was any match difference, any failing
    // expression id, or any compile difference.
    bool hasFailure() const {
        const bool clean = failDiff == 0 && failIds.empty() &&
                           failCompileDifference == 0;
        return !clean;
    }

    // Folds another summary into this one: counters are added and the id
    // sets are unioned.
    void merge(const TestSummary &a) {
        // Corpus/expression totals.
        totalCorpora += a.totalCorpora;
        totalExpressions += a.totalExpressions;
        failCorpora += a.failCorpora;

        // Compile-stage failures.
        failPcreCompile += a.failPcreCompile;
        failNGCompile += a.failNGCompile;
        failUe2Compile += a.failUe2Compile;
        failCompileDifference += a.failCompileDifference;

        // Scan-stage failures.
        failPcreScan += a.failPcreScan;
        failNGScan += a.failNGScan;
        failUe2Scan += a.failUe2Scan;

        // Result comparison.
        failDiff += a.failDiff;
        failNoGroundTruth += a.failNoGroundTruth;

        failIds.insert(a.failIds.begin(), a.failIds.end());
        nogtIds.insert(a.nogtIds.begin(), a.nogtIds.end());
    }
};
238
// Outcome recorded in TestUnit::result. The notes describe how runTestUnit()
// assigns each value.
enum TestResult {
    TEST_NO_GROUND_TRUTH, // initial value; also set when no PCRE/NFA reference result was obtained
    TEST_PASSED,          // UE2 results were accepted at every alignment
    TEST_SKIPPED,         // no database available (it previously failed compilation)
    TEST_FAILED_COMPILE,  // acquiring the UE2 database raised a compile failure
    TEST_FAILED           // a UE2 scan failed or the results differed
};
246
247 struct TestUnit {
248 shared_ptr<CompiledPcre> pcre; // libpcre bytecode
249 shared_ptr<CNGInfo> cngi; // NFA graph info (compilation is deferred)
250 shared_ptr<DatabaseProxy> ue2; // ue2 bytecode
251 Corpus corpus; // a local copy, as we may modify it
252
253 unsigned id; // expression id
254 unsigned corpus_id; // corpus id
255 bool highlander; // single match flag
256 bool prefilter; // prefilter flag
257 bool som; // start of match flag
258 bool multi; // if false, we're in single mode.
259 bool utf8; // at least one of our patterns is utf8
260
261 enum TestResult result;
262
TestUnit__anonce675f310111::TestUnit263 TestUnit(unsigned sig_id, unsigned c_id, const Corpus &c,
264 shared_ptr<CompiledPcre> pcre_in, shared_ptr<CNGInfo> cngi_in,
265 shared_ptr<DatabaseProxy> ue2_in, bool multi_in, bool utf8_in,
266 bool highlander_in, bool prefilter_in, bool som_in)
267 : pcre(pcre_in), cngi(cngi_in), ue2(ue2_in), corpus(c), id(sig_id),
268 corpus_id(c_id), highlander(highlander_in), prefilter(prefilter_in),
269 som(som_in), multi(multi_in), utf8(utf8_in),
270 result(TEST_NO_GROUND_TRUTH) {}
271 };
272
273 } // namespace
274
275 // For ease of printing match sets
276 static
operator <<(std::ostream & os,const set<MatchResult> & v)277 std::ostream &operator<<(std::ostream &os, const set<MatchResult> &v) {
278 auto vi = v.begin(), ve = v.end();
279 while (vi != ve) {
280 // match offsets
281 os << '(' << vi->from << ',' << vi->to << ')';
282 if (++vi != ve) {
283 os << ", ";
284 }
285 }
286 return os;
287 }
288
289 static
printCorpus(ostream & out,const Corpus & corpus)290 void printCorpus(ostream &out, const Corpus &corpus) {
291 // Print the offending corpus
292 string corpus_data(corpus.data.begin() + g_corpora_prefix.size(),
293 corpus.data.end() - g_corpora_suffix.size());
294 bool trimmed = false;
295 if (corpus_data.size() > 1000) {
296 corpus_data.resize(1000);
297 trimmed = true;
298 }
299 out << " Corpus data: '" << printable(corpus_data) << "'";
300 if (trimmed) {
301 out << " ...";
302 }
303 out << "\n";
304 }
305
306 static
printGroundTruthDifference(ostream & out,const ExpressionMap & exprMap,const TestUnit & unit,const ResultSet & pcre_results,const ResultSet & ngw_results)307 void printGroundTruthDifference(ostream &out, const ExpressionMap &exprMap,
308 const TestUnit &unit,
309 const ResultSet &pcre_results,
310 const ResultSet &ngw_results) {
311 assert(contains(exprMap, unit.id));
312 // Print the expression itself
313 out << " Expression: '" << exprMap.at(unit.id) << "'\n";
314 printCorpus(out, unit.corpus);
315 out << " PCRE matches: " << pcre_results.matches << "\n";
316 out << " NFA matches: " << ngw_results.matches << "\n";
317
318 vector<MatchResult> diff;
319
320 set_difference(pcre_results.matches.begin(), pcre_results.matches.end(),
321 ngw_results.matches.begin(), ngw_results.matches.end(),
322 back_inserter(diff));
323
324 for (const auto &match : diff) {
325 out << " PCRE only: match (" << match.from << "," << match.to << ")\n";
326 }
327
328 diff.clear();
329
330 set_difference(ngw_results.matches.begin(), ngw_results.matches.end(),
331 pcre_results.matches.begin(), pcre_results.matches.end(),
332 back_inserter(diff));
333
334 for (const auto &match : diff) {
335 out << " NFA only: match (" << match.from << "," << match.to << ")\n";
336 }
337 out.flush();
338 }
339
340 // Report the difference information when a pattern causes different matches in
341 // our engines.
342 static
printDifference(ostream & out,const ExpressionMap & exprMap,const TestUnit & unit,const ResultSet & gt_results,const vector<ResultSet> & ue2_results,const vector<bool> & pass)343 void printDifference(ostream &out, const ExpressionMap &exprMap,
344 const TestUnit &unit, const ResultSet >_results,
345 const vector<ResultSet> &ue2_results,
346 const vector<bool> &pass) {
347 assert(contains(exprMap, unit.id));
348 // Print the expression itself
349 out << " Expression: '" << exprMap.at(unit.id) << "'\n";
350 printCorpus(out, unit.corpus);
351 out << " " << gt_results.src << " matches: " << gt_results.matches << endl;
352
353 for (u32 align = min_ue2_align; align < max_ue2_align; align++) {
354 if (pass[align]) {
355 continue;
356 }
357
358 u32 align_in = align;
359 out << " UE2 (" << align;
360 while (align + 1 < max_ue2_align) {
361 if (pass[align + 1] ||
362 ue2_results[align] != ue2_results[align + 1]) {
363 break;
364 }
365 align++;
366 }
367
368 if (align != align_in) {
369 out << " - " << align;
370 }
371
372 out << ") matches: " << ue2_results[align].matches;
373 out << endl;
374
375 vector<MatchResult> only;
376
377 // Print matches only returned by ground truth
378 set_difference(gt_results.matches.begin(),
379 gt_results.matches.end(),
380 ue2_results[align].matches.begin(),
381 ue2_results[align].matches.end(),
382 back_inserter(only));
383 for (const auto &match : only) {
384 out << " " << gt_results.src << " only: match ("
385 << match.from << "," << match.to << ')' << endl;
386 }
387
388 // Print matches only returned by UE2
389 only.clear();
390
391 set_difference(ue2_results[align].matches.begin(),
392 ue2_results[align].matches.end(),
393 gt_results.matches.begin(),
394 gt_results.matches.end(),
395 back_inserter(only));
396
397 for (const auto &match : only) {
398 out << " UE2 only: match (" << match.from << "," << match.to << ')'
399 << endl;
400 }
401
402 #ifdef DEDUPE_MATCHES
403 for (const auto &match : ue2_results[align].dupe_matches) {
404 out << " UE2 dupe: match (" << match.from << "," << match.to
405 << ')' << endl;
406 }
407 #endif
408
409 if (ue2_results[align].uoom) {
410 out << " *** UE2 produced matches out of order" << endl;
411 }
412 if (ue2_results[align].match_after_halt) {
413 out << " *** UE2 produced matches after termination" << endl;
414 }
415 if (ue2_results[align].invalid_id) {
416 out << " *** UE2 produced matches for invalid ids" << endl;
417 }
418 }
419 }
420
421 static
printMode(void)422 void printMode(void) {
423 if (!g_ue2CompileAll) {
424 cout << "Single/";
425 } else if (!multicompile_bands) {
426 cout << "Multi/";
427 } else {
428 cout << "Multi-" << multicompile_bands << "/";
429 }
430
431 switch (colliderMode) {
432 case MODE_BLOCK:
433 cout << "Block";
434 break;
435 case MODE_STREAMING:
436 cout << "Streaming-" << g_streamBlocks;
437 if (g_streamOffset) {
438 cout << " offset " << g_streamOffset;
439 }
440 if (use_copy_stream) {
441 cout << " [copy stream]";
442 }
443 if (use_compress_expand) {
444 cout << " [compress]";
445 }
446 if (use_compress_reset_expand) {
447 cout << " [compress+reset]";
448 }
449 break;
450 case MODE_VECTORED:
451 cout << "Vectored-" << g_streamBlocks;
452 break;
453 case MODE_HYBRID:
454 cout << "Hybrid";
455 break;
456 }
457
458 if (use_copy_scratch) {
459 cout << " [copy scratch]";
460 }
461 if (use_mangle_scratch) {
462 cout << " [mangle]";
463 }
464 cout << endl;
465 }
466
467 static
printSummaryV(const TestSummary & sum)468 void printSummaryV(const TestSummary &sum) {
469 cout << endl;
470 cout << "Summary:" << endl;
471 cout << "Mode: ";
472 printMode();
473 cout << "=========" << endl;
474 cout << "Expressions processed: " << sum.totalExpressions << endl;
475 cout << "Corpora processed: " << sum.totalCorpora << endl;
476 cout << "Expressions with failures: " << sum.failIds.size() << endl;
477 cout << " Corpora generation failures: " << sum.failCorpora << endl;
478 cout << " Compilation failures: ";
479 cout << "pcre:" << sum.failPcreCompile << ", ";
480 cout << "ng:" << sum.failNGCompile << ", ";
481 cout << "ue2:" << sum.failUe2Compile << endl;
482
483 cout << " Matching failures: ";
484 cout << "pcre:" << sum.failPcreScan << ", ";
485 cout << "ng:" << sum.failNGScan << ", ";
486 cout << "ue2:" << sum.failUe2Scan << endl;
487 cout << " Match differences: " << sum.failIds.size() << endl;
488 cout << " No ground truth: " << sum.nogtIds.size() << endl;
489 cout << "Total match differences: " << sum.failDiff << endl;
490 }
491
492 static
printSummaryQ(const TestSummary & sum)493 void printSummaryQ(const TestSummary &sum) {
494 cout << "Summary: ";
495 printMode();
496
497 cout << "Processed: " << sum.totalExpressions << " expressions, "
498 << sum.totalCorpora << " corpora" << endl;
499 cout << "Failures: " << sum.failIds.size()
500 << " (corpora: " << sum.failCorpora << "; compile: ";
501 cout << "pcre:" << sum.failPcreCompile << ", ";
502 cout << "ng:" << sum.failNGCompile << ", ";
503 cout << "ue2:" << sum.failUe2Compile << "; match: ";
504
505 cout << "pcre:" << sum.failPcreScan << ", ";
506 cout << "ng:" << sum.failNGScan << ", ";
507 cout << "ue2:" << sum.failUe2Scan << ")" << endl;
508 cout << "Differences: " << sum.failIds.size() << " expressions, "
509 << sum.failDiff << " total" << endl;
510 cout << "No ground truth: " << sum.nogtIds.size() << " expressions" << endl;
511 }
512
513 static
printSummary(const TestSummary & sum)514 void printSummary(const TestSummary &sum) {
515 if (g_quiet > 1) {
516 printSummaryQ(sum);
517 } else {
518 printSummaryV(sum);
519 }
520 }
521
522 // Returns true if this Highlander mode test succeeded.
523 static
checkSingleMatch(const ResultSet & ground_truth,const ResultSet & ue2)524 bool checkSingleMatch(const ResultSet &ground_truth, const ResultSet &ue2) {
525 // In Highlander (single-match) mode, UE2 must return only one of the
526 // matches returned by PCRE/GraphTruth. It need not be the earliest one.
527 if (ground_truth.matches.empty()) {
528 return ue2.matches.empty();
529 } else if (ue2.matches.size() != 1) {
530 return false;
531 } else {
532 return contains(ground_truth.matches, *ue2.matches.begin());
533 }
534 }
535
536 // Returns true if this prefiltering mode test succeeded.
537 static
checkPrefilterMatch(const ResultSet & ground_truth,const ResultSet & ue2,bool highlander)538 bool checkPrefilterMatch(const ResultSet &ground_truth, const ResultSet &ue2,
539 bool highlander) {
540 if (highlander) {
541 // Highlander + prefilter is tricky. Best we can do is say that if PCRE
542 // returns matches, UE2 must return a match, though it may not be one
543 // of the ones returned by PCRE (it may be an earlier match).
544 if (!ground_truth.matches.empty()) {
545 return ue2.matches.size() == 1;
546 }
547 // We can't verify anything more.
548 return true;
549 } else if (!limit_matches || ue2.matches.size() < limit_matches) {
550 // In prefilter mode, every match found by PCRE must be found by UE2,
551 // but the UE2 set may be a superset of the PCRE match set.
552 return std::includes(ue2.matches.begin(), ue2.matches.end(),
553 ground_truth.matches.begin(), ground_truth.matches.end());
554 }
555
556 // Otherwise, we've hit our match limit. Prefilter mode is quite difficult
557 // to verify in this case, so we just verify that "something happened".
558 return true;
559 }
560
561 static
makeEndOfMatchOnly(const ResultSet & rs)562 ResultSet makeEndOfMatchOnly(const ResultSet &rs) {
563 ResultSet out(rs.src);
564 for (const auto &match : rs.matches) {
565 out.addMatch(0, match.to);
566 }
567 return out;
568 }
569
570 static
checkMultiMatch(const ResultSet & ground_truth,const ResultSet & ue2)571 bool checkMultiMatch(const ResultSet &ground_truth, const ResultSet &ue2) {
572 // If we had out-of-order matches or matches after termination, we have a
573 // bug!
574 if (ue2.uoom || ue2.match_after_halt || ue2.invalid_id) {
575 return false;
576 }
577
578 // If we have more UE2 matches than our limit, we have a bug!
579 if (limit_matches && ue2.matches.size() > limit_matches) {
580 return false;
581 }
582
583 // If we have more UE2 matches than PCRE matches, we have a bug!
584 if (ue2.matches.size() > ground_truth.matches.size()) {
585 return false;
586 }
587
588 // If we've got fewer matches than our limit to test, then the match sets
589 // must be identical.
590 if (!limit_matches || ground_truth.matches.size() < limit_matches) {
591 return ground_truth == ue2;
592 }
593
594 // We're in limit_matches mode _and_ we have hit the limit. Every match in
595 // 'ue2' must be in 'pcre'. (We can't just trim pcre and do an equality
596 // test as matches may come out of UE2 a little out of order.)
597
598 // In streaming mode, the limit may mean that we get a different SOM from
599 // the leftmost one. So we compare only end offsets.
600 if (colliderMode == MODE_STREAMING || colliderMode == MODE_VECTORED) {
601 ResultSet gt_eom = makeEndOfMatchOnly(ground_truth);
602 ResultSet ue2_eom = makeEndOfMatchOnly(ue2);
603 return std::includes(gt_eom.matches.begin(), gt_eom.matches.end(),
604 ue2_eom.matches.begin(), ue2_eom.matches.end());
605 }
606
607 return std::includes(ground_truth.matches.begin(),
608 ground_truth.matches.end(),
609 ue2.matches.begin(), ue2.matches.end());
610 }
611
612 // Check results, returns true if there has any failure.
613 static
checkTestResults(ostream & out,TestSummary & summary,const ExpressionMap & exprMap,TestUnit & unit,const ResultSet & gt_results,const vector<ResultSet> & ue2_results)614 bool checkTestResults(ostream &out, TestSummary &summary,
615 const ExpressionMap &exprMap, TestUnit &unit,
616 const ResultSet >_results,
617 const vector<ResultSet> &ue2_results) {
618 bool failed = false;
619 bool any_fail = false;
620 vector<bool> pass(max_ue2_align, false);
621
622 for (unsigned align = min_ue2_align; align != max_ue2_align; ++align) {
623 if (unit.prefilter) {
624 failed = !checkPrefilterMatch(gt_results, ue2_results[align],
625 unit.highlander);
626 } else if (unit.highlander) {
627 failed = !checkSingleMatch(gt_results, ue2_results[align]);
628 } else {
629 // In non-Highlander mode, the two result sets MUST be equal
630 // don't check PCRE if the scan didn't succeed
631 failed = !checkMultiMatch(gt_results, ue2_results[align]);
632 }
633
634 #ifdef DEDUPE_MATCHES
635 if (!failed) {
636 failed |= !ue2_results[align].dupe_matches.empty();
637 }
638 #endif
639
640 pass[align] = !failed;
641
642 any_fail |= failed;
643
644 summary.failDiff += failed ? 1 : 0;
645
646 if (g_verbose) {
647 if (failed) {
648 out << "FAILED: id " << unit.id << ", alignment " << align
649 << ", corpus " << unit.corpus_id << ", results differ"
650 << endl;
651 } else {
652 out << "PASSED: id " << unit.id << ", alignment " << align
653 << ", corpus " << unit.corpus_id
654 << " (matched "<< gt_results.src << ":"
655 << gt_results.matches.size()
656 << ", ue2:" << ue2_results[align].matches.size() << ")"
657 << endl;
658 }
659 }
660 }
661
662 if (!any_fail) {
663 return false;
664 }
665
666 if (!g_verbose) {
667 out << "FAILED: id " << unit.id << ", alignment";
668 for (unsigned align = min_ue2_align; align != max_ue2_align; ++align) {
669 if (!pass[align]) {
670 out << " " << align;
671
672 if (align + 1 < max_ue2_align && !pass[align + 1]) {
673 while (align + 1 < max_ue2_align && !pass[align + 1]) {
674 align++;
675 }
676
677 out << "-" << align;
678 }
679 }
680 }
681
682 out << ", corpus " << unit.corpus_id << ", results differ" << endl;
683 }
684 printDifference(out, exprMap, unit, gt_results, ue2_results, pass);
685
686 return true;
687 }
688
689 // Construct a UE2 database, taking care of loading/saving to disk when
690 // appropriate
691 static
constructDatabase(const set<unsigned int> & ids,const UltimateTruth & ultimate)692 shared_ptr<DatabaseProxy> constructDatabase(const set<unsigned int> &ids,
693 const UltimateTruth &ultimate) {
694 assert(!ids.empty());
695
696 if (loadDatabases) {
697 string filename = ultimate.dbFilename(ids);
698 shared_ptr<BaseDB> db = ultimate.loadDatabase(filename, ids);
699 if (!db) {
700 if (!g_quiet) {
701 cout << "FAILED: could not load database " << filename << endl;
702 }
703 return nullptr;
704 }
705 return make_shared<DatabaseProxy>(db);
706 }
707
708 shared_ptr<DatabaseProxy> ue2 = make_shared<DatabaseProxy>(ids);
709
710 try {
711 // If we're not runnable (i.e. we're cross-compiling), let's at least
712 // try to build the database.
713 if (!ultimate.runnable()) {
714 shared_ptr<BaseDB> db = ue2->get(ultimate);
715 assert(db); // throws otherwise
716 }
717
718 // Compile and save if we've been told to.
719 if (saveDatabases) {
720 string filename = ultimate.dbFilename(ids);
721 if (!ultimate.saveDatabase(*(ue2->get(ultimate)),
722 filename.c_str())) {
723 cout << "FAILED: could not save database to file: " << filename
724 << endl;
725 }
726 }
727 } catch (const CompileFailed &fail) {
728 if (!g_quiet) {
729 cout << "FAILED: ue2 compile failed for " << *ids.begin() << ": "
730 << fail.error << endl;
731 }
732 // Return null database to indicate failure.
733 ue2 = nullptr;
734 }
735
736 return ue2;
737 }
738
739 static
getGraphTruth(ostream & out,CNGInfo & cngi,GraphTruth & graph,TestUnit & unit,ResultSet & ngw_results,TestSummary & summary,const string & expression)740 bool getGraphTruth(ostream &out, CNGInfo &cngi, GraphTruth &graph,
741 TestUnit &unit, ResultSet &ngw_results,
742 TestSummary &summary, const string &expression) {
743 debug_stage = STAGE_GRAPH_RUN;
744
745 // Skip patterns we've previously marked as bad.
746 if (cngi.is_bad()) {
747 summary.failNGScan++;
748 return false;
749 }
750
751 // If we already have match information for this corpus, we don't need to
752 // run PCRE at all. At the moment our on-disk format for corpora with match
753 // information only includes the end-of-match offset, so we only use these
754 // in non-som modes. If edit distance is forced, all bets are off so we
755 // ignore this as well.
756 if (!g_streamOffset && unit.corpus.hasMatches && !force_utf8 && !cngi.som &&
757 !force_edit_distance) {
758 if (g_verbose) {
759 out << "Using corpus match set rather than NFA graph" << endl;
760 }
761 ngw_results = ResultSet(unit.corpus.matches, RESULT_FROM_GRAPH);
762 } else {
763 // compile the actual graph
764 const CompiledNG *cng;
765 try {
766 debug_stage = STAGE_GRAPH_COMPILE;
767 cng = cngi.get();
768 debug_stage = STAGE_UNDEFINED;
769 }
770 catch (const NGCompileFailure &err) {
771 debug_stage = STAGE_UNDEFINED;
772 summary.failNGCompile++;
773 summary.failNGScan++;
774 cngi.mark_bad();
775 if (!g_quiet) {
776 cout << "FAILED: id " << unit.id
777 << ", NFA graph compile failed (" << err.msg << ")"
778 << endl;
779 }
780 return false;
781 }
782 debug_stage = STAGE_GRAPH_RUN;
783
784 // Run NFA graph and collect match information.
785 string error;
786 assert(cng);
787 if (!graph.run(unit.id, *cng, cngi, unit.corpus.data, ngw_results,
788 error)) {
789 if (!g_quiet) {
790 out << "FAILED: id " << unit.id
791 << ", NFA graph scan failed: " << error << "\n"
792 << " Expression: '" << expression << "'\n"
793 << " Corpus data: '" << printable(unit.corpus.data)
794 << "'\n"
795 << " (note: marking bad, skipping subsequent tests)"
796 << endl;
797 }
798 summary.failNGScan++;
799 cngi.mark_bad();
800 return false;
801 }
802 }
803
804 return true;
805 }
806
807 static
getGroundTruth(ostream & out,CompiledPcre & cpcre,GroundTruth & ground,TestUnit & unit,ResultSet & pcre_results,TestSummary & summary)808 bool getGroundTruth(ostream &out, CompiledPcre &cpcre, GroundTruth &ground,
809 TestUnit &unit, ResultSet &pcre_results,
810 TestSummary &summary) {
811 debug_stage = STAGE_PCRE_RUN;
812
813 // Skip patterns we've previously marked as bad.
814 if (cpcre.is_bad()) {
815 summary.failPcreScan++;
816 return false;
817 }
818
819 // If we already have match information for this corpus, we don't need to
820 // run PCRE at all. At the moment our on-disk format for corpora with match
821 // information only includes the end-of-match offset, so we only use these
822 // in non-som modes. Also, we can't trust corpus matches if there was an
823 // edit distance requested for all patterns.
824 if (!g_streamOffset && unit.corpus.hasMatches && !force_utf8 && !cpcre.som
825 && !force_edit_distance) {
826 if (g_verbose) {
827 out << "Using corpus match set rather than PCRE" << endl;
828 }
829 pcre_results = ResultSet(unit.corpus.matches, RESULT_FROM_PCRE);
830 } else {
831 // Run PCRE and collect match information.
832 string error;
833 if (!ground.run(unit.id, cpcre, unit.corpus.data, pcre_results,
834 error)) {
835 if (!g_quiet) {
836 out << "FAILED: id " << unit.id
837 << ", libpcre scan failed: " << error << "\n"
838 << " Expression: '" << cpcre.expression << "'\n"
839 << " Corpus data: '" << printable(unit.corpus.data)
840 << "'\n"
841 << " (note: marking PCRE bad, skipping subsequent tests)"
842 << endl;
843 }
844 summary.failPcreScan++;
845 cpcre.mark_bad();
846 return false;
847 }
848 }
849
850 return true;
851 }
852
853 static
writeCorpus(unsigned id,const Corpus & corpus,const ResultSet & results)854 void writeCorpus(unsigned id, const Corpus &corpus, const ResultSet &results) {
855 assert(corporaOut);
856 ostringstream oss;
857 oss << id << "=\"" << printable(corpus.data) << "\": ";
858
859 auto vi = results.matches.begin();
860 auto ve = results.matches.end();
861
862 // Print match end offsets only.
863 while (vi != ve) {
864 oss << vi->to;
865 if (++vi != ve) {
866 oss << ",";
867 }
868 }
869 oss << "\n";
870 corporaOut->write(oss.str());
871 }
872
873 static
runTestUnit(ostream & out,GroundTruth & ground,GraphTruth & graph,UltimateTruth & ultimate,TestUnit & unit,TestSummary & summary,const ExpressionMap & exprMap)874 void runTestUnit(ostream &out, GroundTruth &ground, GraphTruth &graph,
875 UltimateTruth &ultimate, TestUnit &unit, TestSummary &summary,
876 const ExpressionMap &exprMap) {
877 assert(use_UE2);
878 Corpus &corpus = unit.corpus;
879
880 shared_ptr<const BaseDB> db;
881 if (use_UE2) {
882 // Acquire UE2 database.
883 debug_stage = STAGE_UE2_COMPILE;
884 try {
885 db = unit.ue2->get(ultimate);
886 } catch (const CompileFailed &fail) {
887 summary.failUe2Compile++;
888 if (!g_quiet) {
889 out << "FAILED: ue2 compile failed for " << unit.id << ": "
890 << fail.error << endl;
891 unit.result = TEST_FAILED_COMPILE;
892 debug_stage = STAGE_UNDEFINED;
893 return;
894 }
895 }
896 debug_stage = STAGE_UNDEFINED;
897
898 if (!db) {
899 // Database previously failed compilation.
900 unit.result = TEST_SKIPPED;
901 return;
902 }
903 }
904
905 // If the user has specified that they want prefix/suffix data added to
906 // their corpora, we do it here; this is as local as possible to the
907 // test, so we don't keep piles of HUGE corpora hanging around.
908 if (!g_corpora_prefix.empty()) {
909 corpus.data.insert(0, g_corpora_prefix);
910 corpus.hasMatches = false;
911 }
912 if (!g_corpora_suffix.empty()) {
913 corpus.data.append(g_corpora_suffix);
914 corpus.hasMatches = false;
915 }
916
917 ResultSet gt_results(RESULT_FROM_PCRE);
918 vector<ResultSet> ue2_results(max_ue2_align, ResultSet(RESULT_FROM_UE2));
919
920 bool gt_done = false;
921
922 // run PCRE test if enabled and if compile succeeded
923 if (unit.pcre && use_PCRE) {
924 gt_done = getGroundTruth(out, *unit.pcre, ground, unit, gt_results,
925 summary);
926 }
927
928 // run NFA if PCRE failed (or wasn't run), or if we don't run UE2
929 if (unit.cngi && (use_NFA && !gt_done)) {
930 gt_done = getGraphTruth(out, *unit.cngi, graph, unit, gt_results,
931 summary, exprMap.find(unit.id)->second);
932 }
933
934 // both ground truth methods either failed or didn't run
935 if (!gt_done) {
936 unit.result = TEST_NO_GROUND_TRUTH;
937 return;
938 }
939
940 // Write out corpora if we've been told to
941 if (saveCorpora) {
942 writeCorpus(unit.id, unit.corpus, gt_results);
943 }
944
945 debug_stage = STAGE_UE2_RUN;
946 for (unsigned int align = min_ue2_align; align != max_ue2_align; ++align) {
947 bool ok = ultimate.run(unit.id, db, corpus.data, !unit.multi, align,
948 ue2_results[align]);
949
950 if (!ok) {
951 if (!g_quiet) {
952 out << "FAILED: id " << unit.id << ", ue2 scan at alignment "
953 << align << " failed" << endl;
954 }
955 unit.result = TEST_FAILED;
956 debug_stage = STAGE_UNDEFINED;
957 return;
958 }
959 }
960
961 // if we're using UE2, check all the different results modes
962 if (checkTestResults(out, summary, exprMap, unit, gt_results,
963 ue2_results)) {
964 unit.result = TEST_FAILED;
965 } else {
966 unit.result = TEST_PASSED;
967 }
968
969 debug_stage = STAGE_UNDEFINED;
970 }
971
972 /* Used for testing the graph truth agains PCE */
973 static
runGroundCompTestUnit(ostream & out,GroundTruth & ground,GraphTruth & graph,TestUnit & unit,TestSummary & summary,const ExpressionMap & exprMap)974 void runGroundCompTestUnit(ostream &out, GroundTruth &ground, GraphTruth &graph,
975 TestUnit &unit, TestSummary &summary,
976 const ExpressionMap &exprMap) {
977 assert(!use_UE2);
978 assert(use_PCRE);
979 assert(use_NFA);
980 Corpus &corpus = unit.corpus;
981
982 // If the user has specified that they want prefix/suffix data added to
983 // their corpora, we do it here; this is as local as possible to the
984 // test, so we don't keep piles of HUGE corpora hanging around.
985 if (!g_corpora_prefix.empty()) {
986 corpus.data.insert(0, g_corpora_prefix);
987 corpus.hasMatches = false;
988 }
989 if (!g_corpora_suffix.empty()) {
990 corpus.data.append(g_corpora_suffix);
991 corpus.hasMatches = false;
992 }
993
994 ResultSet pcre_results(RESULT_FROM_PCRE);
995 ResultSet ngw_results(RESULT_FROM_GRAPH);
996
997 bool pcreResult = false;
998 bool graphResult = false;
999
1000 if (unit.pcre) {
1001 pcreResult = getGroundTruth(out, *unit.pcre, ground, unit, pcre_results,
1002 summary);
1003 }
1004
1005 if (unit.cngi) {
1006 graphResult = getGraphTruth(out, *unit.cngi, graph, unit, ngw_results,
1007 summary, exprMap.find(unit.id)->second);
1008 }
1009
1010 // no ground truth found either NFA or PCRE failed
1011 if (!pcreResult || !graphResult) {
1012 unit.result = TEST_NO_GROUND_TRUTH;
1013 return;
1014 }
1015
1016 // Write out corpora if we've been told to
1017 if (saveCorpora) {
1018 writeCorpus(unit.id, unit.corpus, pcre_results);
1019 }
1020
1021 if (pcre_results.matches != ngw_results.matches) {
1022 unit.result = TEST_FAILED;
1023 out << "FAILED: id " << unit.id << ", corpus " << unit.corpus_id
1024 << ", results differ" << endl;
1025
1026 printGroundTruthDifference(out, exprMap, unit, pcre_results,
1027 ngw_results);
1028 } else {
1029 unit.result = TEST_PASSED;
1030 if (g_verbose) {
1031 out << "PASSED: id " << unit.id << ", corpus " << unit.corpus_id
1032 << " (matched pcre:" << pcre_results.matches.size()
1033 << ", matched ng:" << ngw_results.matches.size() << ")" << endl;
1034 }
1035 }
1036
1037 debug_stage = STAGE_UNDEFINED;
1038 }
1039
1040 static
addCorporaToQueue(ostream & out,BoundedQueue<TestUnit> & testq,unsigned id,CorporaSource & corpora,TestSummary & summary,shared_ptr<CompiledPcre> cpcre,shared_ptr<CNGInfo> cngi,shared_ptr<DatabaseProxy> ue2,bool multi,bool utf8)1041 void addCorporaToQueue(ostream &out, BoundedQueue<TestUnit> &testq, unsigned id,
1042 CorporaSource &corpora, TestSummary &summary,
1043 shared_ptr<CompiledPcre> cpcre, shared_ptr<CNGInfo> cngi,
1044 shared_ptr<DatabaseProxy> ue2, bool multi, bool utf8) {
1045 // build corpora
1046 vector<Corpus> c;
1047 try {
1048 corpora.generate(id, c);
1049 }
1050 catch (CorpusFailure &err) {
1051 if (!g_quiet) {
1052 out << "FAILED: id " << id << ", corpora failure: " << err.message
1053 << endl;
1054 }
1055 summary.failCorpora++;
1056 return;
1057 }
1058
1059 const bool som = cpcre ? cpcre->som : cngi->som;
1060 const bool prefilter = cpcre ? cpcre->prefilter : cngi->prefilter;
1061 const bool highlander = cpcre ? cpcre->highlander : cngi->highlander;
1062
1063 // If we're in UTF-8 mode and the corpus isn't valid UTF-8, skip it:
1064 // Hyperscan's behaviour when scanning invalid UTF-8 data in UTF-8 mode
1065 // is undefined.
1066 if (utf8) {
1067 auto is_invalid_utf8 = [](const Corpus &corpus) {
1068 return !isValidUtf8(corpus.data.c_str(), corpus.data.size());
1069 };
1070 c.erase(remove_if(begin(c), end(c), is_invalid_utf8), end(c));
1071 }
1072
1073 // Collect together corpora units in a container so that we don't have to
1074 // repeatedly lock the queue.
1075 vector<unique_ptr<TestUnit>> tests;
1076 tests.reserve(c.size());
1077
1078 size_t corpus_id = 0;
1079 for (const Corpus &corpus : c) {
1080 tests.push_back(ue2::make_unique<TestUnit>(id, corpus_id, corpus, cpcre,
1081 cngi, ue2, multi, utf8,
1082 highlander, prefilter, som));
1083 corpus_id++;
1084 }
1085
1086 testq.push(begin(tests), end(tests));
1087 }
1088
1089 namespace /* anonymous */ {
1090
1091 // A subclass of Thread that stores its own output in a stringstream, flushing
1092 // it to cout when necessary.
1093 class OutputThread : public Thread {
1094 public:
OutputThread(size_t id)1095 OutputThread(size_t id) : Thread(id) {}
~OutputThread()1096 ~OutputThread() override {
1097 flush_output();
1098 }
1099
1100 protected:
flush_output()1101 void flush_output() {
1102 const string &s = out.str();
1103 if (!s.empty()) {
1104 cout << s;
1105 out.str(""); // make empty
1106 }
1107 }
1108
1109 // Output stream, flushed to cout after every test unit.
1110 stringstream out;
1111 };
1112
1113 class ScanThread : public OutputThread {
1114 public:
ScanThread(size_t id,BoundedQueue<TestUnit> & testq,const ExpressionMap & e,const hs_platform_info * plat,const Grey & grey)1115 ScanThread(size_t id, BoundedQueue<TestUnit> &testq, const ExpressionMap &e,
1116 const hs_platform_info *plat, const Grey &grey)
1117 : OutputThread(id), q(testq),
1118 ground(out, e, g_matchLimit, g_matchLimitRecursion), graph(out, e),
1119 ultimate(out, e, plat, grey, g_streamBlocks), exprMap(e) {}
1120
run()1121 void run() override {
1122 DEBUG_PRINTF("thread %zu running\n", thread_id);
1123 for (;;) {
1124 const auto unit = q.pop(thread_id);
1125 if (!unit) {
1126 // Sentinel value, indicates that we have run out of units to
1127 // process.
1128 DEBUG_PRINTF("thread %zu stopped\n", thread_id);
1129 break;
1130 }
1131
1132 assert(unit);
1133 assert(exprMap.find(unit->id) != exprMap.end());
1134
1135 // Debug information is stored in TLS and (hopefully) printed out in
1136 // the event of a crash.
1137 debug_expr = unit->id;
1138 debug_corpus = unit->corpus_id;
1139 debug_corpus_ptr = unit->corpus.data.c_str();
1140 debug_corpus_len = unit->corpus.data.size();
1141 debug_expr_ptr = exprMap.find(unit->id)->second.c_str();
1142
1143 if (use_UE2) {
1144 runTestUnit(out, ground, graph, ultimate, *unit, summary,
1145 exprMap);
1146 } else {
1147 runGroundCompTestUnit(out, ground, graph, *unit, summary,
1148 exprMap);
1149 }
1150
1151 if (unit->result == TEST_NO_GROUND_TRUTH) {
1152 summary.nogtIds.insert(unit->id);
1153 // this is fine, continue
1154 } else if (unit->result == TEST_FAILED) {
1155 summary.failIds.insert(unit->id);
1156 }
1157
1158 count++;
1159 summary.totalCorpora++;
1160 flush_output();
1161 }
1162 }
1163
getSummary() const1164 const TestSummary &getSummary() const { return summary; }
1165
1166 public:
1167 size_t count = 0; // number of units processed
1168
1169 private:
1170 // Shared queue.
1171 BoundedQueue<TestUnit> &q;
1172
1173 // Thread-local data.
1174 GroundTruth ground; // independent copy
1175 GraphTruth graph; // independent copy
1176 UltimateTruth ultimate; // independent copy
1177 TestSummary summary;
1178
1179 // Constant shared data.
1180 const ExpressionMap &exprMap;
1181 };
1182
1183 /** Represent a work item for the corpus generation threads. This contains
1184 * all information relating to an expression. The corpus generator will
1185 * generate corpora for this expression and enqueue work items representing
1186 * complete test cases for the scanning threads.
1187 */
1188 struct CorpusGenUnit {
CorpusGenUnit__anonce675f310311::CorpusGenUnit1189 CorpusGenUnit(unique_ptr<CNGInfo> cngi_in, unique_ptr<CompiledPcre> pcre_in,
1190 shared_ptr<DatabaseProxy> ue2_in, unsigned expr_id,
1191 bool multi_in, bool utf8_in)
1192 : cngi(move(cngi_in)), pcre(move(pcre_in)), ue2(ue2_in), id(expr_id),
1193 multi(multi_in), utf8(utf8_in) {}
1194
1195 unique_ptr<CNGInfo> cngi;
1196 unique_ptr<CompiledPcre> pcre;
1197
1198 /* ue2 shared_ptr as in multicompile and banded compile it is shared amongst
1199 * various corpus units (with differing expression ids). */
1200 shared_ptr<DatabaseProxy> ue2;
1201
1202 unsigned id; // expression id
1203 bool multi; // ue2 contains more than one expression
1204 bool utf8; // ue2 can be run against utf8 corpora
1205 };
1206
1207 class CorpusGenThread : public OutputThread {
1208 public:
CorpusGenThread(size_t id,BoundedQueue<TestUnit> & testq_in,BoundedQueue<CorpusGenUnit> & corpq_in,const CorporaSource & corpora_in)1209 CorpusGenThread(size_t id, BoundedQueue<TestUnit> &testq_in,
1210 BoundedQueue<CorpusGenUnit> &corpq_in,
1211 const CorporaSource &corpora_in)
1212 : OutputThread(id), testq(testq_in), corpq(corpq_in),
1213 corpora(corpora_in.clone()) {}
1214
run()1215 void run() override {
1216 DEBUG_PRINTF("thread %zu running\n", thread_id);
1217 for (;;) {
1218 auto c = corpq.pop(thread_id);
1219 if (!c) {
1220 break;
1221 }
1222
1223 addCorporaToQueue(out, testq, c->id, *corpora, summary,
1224 move(c->pcre), move(c->cngi), c->ue2, c->multi,
1225 c->utf8);
1226
1227 count++;
1228 flush_output();
1229 }
1230 }
1231
getSummary() const1232 const TestSummary &getSummary() const { return summary; }
1233
1234 public:
1235 size_t count = 0; // number of units processed
1236
1237 private:
1238 // Output queue, shared between threads.
1239 BoundedQueue<TestUnit> &testq;
1240
1241 // Input queue, shared between corpus generator threads.
1242 BoundedQueue<CorpusGenUnit> &corpq;
1243
1244 // Thread-local data.
1245 const unique_ptr<CorporaSource> corpora; // independent copy
1246 TestSummary summary;
1247 };
1248
1249 } // namespace
1250
1251 static
makeNGInfo(const unsigned id,TestSummary & summary,GraphTruth & graph,UltimateTruth & ultimate,shared_ptr<DatabaseProxy> ue2)1252 unique_ptr<CNGInfo> makeNGInfo(const unsigned id, TestSummary &summary,
1253 GraphTruth &graph, UltimateTruth &ultimate,
1254 shared_ptr<DatabaseProxy> ue2) {
1255 string nfaErr;
1256
1257 try {
1258 debug_stage = STAGE_GRAPH_PREPROCESS;
1259 auto cngi = graph.preprocess(id);
1260 debug_stage = STAGE_UNDEFINED;
1261 return cngi;
1262 }
1263 catch (const NGCompileFailure &err) {
1264 nfaErr = err.msg;
1265 debug_stage = STAGE_UNDEFINED;
1266 // fall through
1267 }
1268 catch (const NGUnsupportedFailure &err) {
1269 // unsupported error happens when the pattern appears to be valid, but
1270 // there are things that we don't yet support (e.g. SOM).
1271 // in this case, try again, suppressing the errors
1272 debug_stage = STAGE_UNDEFINED;
1273 summary.failNGCompile++;
1274
1275 // try again and suppress unsupported errors
1276 try {
1277 debug_stage = STAGE_GRAPH_PREPROCESS;
1278 auto cngi = graph.preprocess(id, true);
1279 debug_stage = STAGE_UNDEFINED;
1280
1281 // preprocess succeeded - that means the pattern itself is valid.
1282 // however, we can't use it, so we have to mark it as bad
1283 // only print the error in the following cases:
1284 // 1) if verbose is specified
1285 // 2) if we are not using UE2 and quiet is NOT specified
1286 if ((!use_UE2 && !g_quiet) || g_verbose) {
1287 cout << "FAILED: id " << id << ", NFA graph preprocess failed ("
1288 << err.msg << ")" << endl;
1289 }
1290 cngi->mark_bad();
1291 return cngi;
1292 }
1293 catch (const NGCompileFailure &e) {
1294 // compile failed
1295 nfaErr = e.msg;
1296 debug_stage = STAGE_UNDEFINED;
1297 // fall through
1298 }
1299 }
1300
1301 // We should ensure that we also fail compilation with UE2, otherwise we
1302 // likely have a pattern support bug.
1303 try {
1304 auto db = ue2->get(ultimate);
1305 if (db) {
1306 // if we made it this far, that means UE2 compile succeeded while
1307 // NFA compile failed.
1308 cout << "FAILED: id " << id << ", NFA graph preprocess failed ("
1309 << nfaErr << ") but UE2 compile succeeded." << endl;
1310 summary.failNGCompile++;
1311 summary.failCompileDifference++;
1312 return nullptr;
1313 }
1314 // If db is nullptr, we have previously failed compilation of this
1315 // database.
1316 }
1317 catch (const CompileFailed &) {
1318 // Everything's OK: compilation failed in Hyperscan as well. Fall
1319 // through.
1320 }
1321 summary.failNGCompile++;
1322 if (!g_quiet) {
1323 cout << "FAILED: id " << id << ", NFA graph preprocess failed ("
1324 << nfaErr << ")" << endl;
1325 }
1326 return nullptr;
1327 }
1328
1329 static
makePcre(const unsigned id,TestSummary & summary,GroundTruth & ground,UltimateTruth & ultimate,shared_ptr<DatabaseProxy> ue2)1330 unique_ptr<CompiledPcre> makePcre(const unsigned id, TestSummary &summary,
1331 GroundTruth &ground, UltimateTruth &ultimate,
1332 shared_ptr<DatabaseProxy> ue2) {
1333 string pcreErr;
1334
1335 try {
1336 debug_stage = STAGE_PCRE_COMPILE;
1337 auto cpcre = ground.compile(id);
1338 debug_stage = STAGE_UNDEFINED;
1339 return cpcre;
1340 }
1341 catch (const SoftPcreCompileFailure &err) {
1342 debug_stage = STAGE_UNDEFINED;
1343 summary.failPcreCompile++;
1344 if (g_verbose) {
1345 cout << "FAILED: id " << id
1346 << ", libpcre compile failed with soft error: " << err.msg
1347 << endl;
1348 }
1349 return nullptr;
1350 }
1351 catch (const PcreCompileFailure &err) {
1352 debug_stage = STAGE_UNDEFINED;
1353 pcreErr = err.msg;
1354 // fall through
1355 }
1356
1357 // We should ensure that we also fail compilation with UE2, otherwise we
1358 // likely have a pattern support bug.
1359 try {
1360 auto db = ue2->get(ultimate);
1361 if (db) {
1362 // OK, so now we have a situation: PCRE failed but UE2 succeeded.
1363 // There is one situation where this is legal: patterns beginning
1364 // with (*UTF8), which will throw an error due to the callback
1365 // wrapping we do for PCRE. We can check these by trying to compile
1366 // an "unwrapped" PCRE.
1367 ground.compile(id, true);
1368 // If we didn't throw, PCRE failed above but succeeded when not
1369 // wrapped in a callback, and UE2 succeeded. Not worth reporting,
1370 // fall through.
1371 }
1372 }
1373 catch (const CompileFailed &) {
1374 // Everything's OK: compilation failed in Hyperscan as well. Fall
1375 // through.
1376 }
1377 catch (const PcreCompileFailure &) {
1378 cout << "FAILED: id " << id << ", libpcre compile failed (" << pcreErr
1379 << ") but UE2 compile succeeded." << endl;
1380 summary.failPcreCompile++;
1381 summary.failCompileDifference++;
1382 return nullptr;
1383 }
1384
1385 if (!g_quiet) {
1386 cout << "FAILED: id " << id << ", libpcre compile failed: " << pcreErr
1387 << endl;
1388 }
1389
1390 summary.failPcreCompile++;
1391 return nullptr;
1392 }
1393
1394 static
drainGenerators(BoundedQueue<CorpusGenUnit> & corpq,vector<unique_ptr<CorpusGenThread>> & generators,TestSummary & summary)1395 void drainGenerators(BoundedQueue<CorpusGenUnit> &corpq,
1396 vector<unique_ptr<CorpusGenThread>> &generators,
1397 TestSummary &summary) {
1398 // Push a sentinel per thread.
1399 for (size_t i = 0; i < generators.size(); i++) {
1400 corpq.push(nullptr);
1401 }
1402
1403 // Wait for workers to end and retrieve their results.
1404 for (auto &c : generators) {
1405 c->join();
1406 summary.merge(c->getSummary());
1407 }
1408 }
1409
1410 // Note: In multi-pattern cases, utf8 is true if any pattern to be run against
1411 // this corpus is in UTF-8 mode.
1412 static
makeCorpusGenUnit(unsigned id,TestSummary & summary,GroundTruth & ground,GraphTruth & graph,UltimateTruth & ultimate,shared_ptr<DatabaseProxy> ue2,bool multi,bool utf8)1413 unique_ptr<CorpusGenUnit> makeCorpusGenUnit(unsigned id, TestSummary &summary,
1414 GroundTruth &ground,
1415 GraphTruth &graph,
1416 UltimateTruth &ultimate,
1417 shared_ptr<DatabaseProxy> ue2,
1418 bool multi, bool utf8) {
1419 unique_ptr<CompiledPcre> cpcre;
1420 unique_ptr<CNGInfo> cngi;
1421
1422 // compile PCRE bytecode
1423 if (use_PCRE) {
1424 cpcre = makePcre(id, summary, ground, ultimate, ue2);
1425 }
1426 if (use_NFA) {
1427 cngi = makeNGInfo(id, summary, graph, ultimate, ue2);
1428 }
1429
1430 // if both compiles failed, skip the test
1431 if (!cpcre && !cngi) {
1432 return nullptr;
1433 }
1434
1435 // Caller may already have set the UTF-8 property (in multi cases)
1436 utf8 |= cpcre ? cpcre->utf8 : cngi->utf8;
1437
1438 return ue2::make_unique<CorpusGenUnit>(move(cngi), move(cpcre), ue2, id,
1439 multi, utf8);
1440 }
1441
1442 static
hasUTF8Pattern(GroundTruth & ground,ExpressionMap::const_iterator it,ExpressionMap::const_iterator end)1443 bool hasUTF8Pattern(GroundTruth &ground, ExpressionMap::const_iterator it,
1444 ExpressionMap::const_iterator end) {
1445 /* note: we cannot just check the flags as utf8 can be enabled in the
1446 * pattern itself with (*UTF) */
1447 debug_stage = STAGE_PCRE_COMPILE;
1448 for (; it != end; ++it) {
1449 try {
1450 auto cpcre = ground.compile(it->first);
1451 assert(cpcre); // Would have thrown PcreCompileFailure otherwise.
1452 if (cpcre->utf8) {
1453 DEBUG_PRINTF("UTF8 mode\n");
1454 debug_stage = STAGE_UNDEFINED;
1455 return true;
1456 }
1457 }
1458 catch (const PcreCompileFailure &) {
1459 continue;
1460 }
1461 }
1462 debug_stage = STAGE_UNDEFINED;
1463 return false;
1464 }
1465
1466 // Fill a test queue with single-pattern tests.
1467 static
buildSingle(BoundedQueue<CorpusGenUnit> & corpq,TestSummary & summary,GroundTruth & ground,GraphTruth & graph,UltimateTruth & ultimate,const ExpressionMap & exprMap)1468 void buildSingle(BoundedQueue<CorpusGenUnit> &corpq, TestSummary &summary,
1469 GroundTruth &ground, GraphTruth &graph,
1470 UltimateTruth &ultimate, const ExpressionMap &exprMap) {
1471 for (const auto &m : exprMap) {
1472 unsigned id = m.first;
1473 debug_expr = id;
1474 debug_expr_ptr = m.second.c_str();
1475
1476 shared_ptr<DatabaseProxy> ue2 = constructDatabase({id}, ultimate);
1477 if (!ue2) {
1478 summary.failUe2Compile++;
1479 continue;
1480 }
1481
1482 // if we're cross-compiling, then we don't bother building PCRE and
1483 // running scans, we're just going to output the database bytecode.
1484 if (!ultimate.runnable()) {
1485 continue;
1486 }
1487
1488 bool multi = false;
1489 bool utf8 = false;
1490 auto u = makeCorpusGenUnit(id, summary, ground, graph, ultimate, ue2,
1491 multi, utf8);
1492 if (u) {
1493 corpq.push(move(u));
1494 }
1495 }
1496 }
1497
1498 // Fill a test queue with multi-pattern tests of size N, where N is the band
1499 // size specified on the command line.
1500 static
buildBanded(BoundedQueue<CorpusGenUnit> & corpq,TestSummary & summary,GroundTruth & ground,GraphTruth & graph,UltimateTruth & ultimate,const ExpressionMap & exprMap)1501 void buildBanded(BoundedQueue<CorpusGenUnit> &corpq, TestSummary &summary,
1502 GroundTruth &ground, GraphTruth &graph,
1503 UltimateTruth &ultimate, const ExpressionMap &exprMap) {
1504 for (auto i = exprMap.begin(), e = exprMap.end(); i != e;) {
1505 debug_expr = i->first;
1506 debug_expr_ptr = i->second.c_str();
1507
1508 // Build a set of IDs in this band from the expression map
1509 set<unsigned> bandIds;
1510
1511 if (g_verbose) {
1512 cout << "Building set:";
1513 }
1514
1515 ExpressionMap::const_iterator band_end = i;
1516 for (u32 j = 0; j < multicompile_bands && band_end != e;
1517 j++, ++band_end) {
1518 bandIds.insert(bandIds.end(), band_end->first);
1519 if (g_verbose) {
1520 cout << " " << band_end->first;
1521 }
1522 }
1523
1524 if (g_verbose) {
1525 cout << endl;
1526 }
1527
1528 // compile UE2 bytecode
1529 shared_ptr<DatabaseProxy> ue2 = constructDatabase(bandIds, ultimate);
1530 if (!ue2) {
1531 summary.failUe2Compile++;
1532 i = band_end;
1533 continue;
1534 }
1535
1536 // if we're cross-compiling, then we don't bother building PCRE and
1537 // running scans, we're just going to output the database bytecode.
1538 if (!ultimate.runnable()) {
1539 i = band_end;
1540 continue;
1541 }
1542
1543 bool utf8 = hasUTF8Pattern(ground, i, band_end);
1544
1545 for (; i != band_end; ++i) {
1546 unsigned id = i->first;
1547 bool multi = true;
1548 auto u = makeCorpusGenUnit(id, summary, ground, graph, ultimate,
1549 ue2, multi, utf8);
1550 if (u) {
1551 corpq.push(move(u));
1552 }
1553 }
1554 }
1555 }
1556
1557 // Fill a test queue with multi-pattern tests.
1558 static
buildMulti(BoundedQueue<CorpusGenUnit> & corpq,TestSummary & summary,GroundTruth & ground,GraphTruth & graph,UltimateTruth & ultimate,const ExpressionMap & exprMap)1559 void buildMulti(BoundedQueue<CorpusGenUnit> &corpq, TestSummary &summary,
1560 GroundTruth &ground, GraphTruth &graph, UltimateTruth &ultimate,
1561 const ExpressionMap &exprMap) {
1562 // Build a set of all IDs from the expression map
1563 set<unsigned> idsAll;
1564 for (const auto &e : exprMap) {
1565 idsAll.insert(e.first);
1566 }
1567
1568 // Compile in UE2
1569 shared_ptr<DatabaseProxy> ue2 = constructDatabase(idsAll, ultimate);
1570 if (!ue2) {
1571 summary.failUe2Compile++;
1572 return;
1573 }
1574
1575 // if we're cross-compiling, then we don't bother building PCRE and
1576 // running scans, we're just going to output the database bytecode.
1577 if (!ultimate.runnable()) {
1578 return;
1579 }
1580
1581 bool utf8 = hasUTF8Pattern(ground, exprMap.begin(), exprMap.end());
1582
1583 for (const auto &m : exprMap) {
1584 unsigned id = m.first;
1585 debug_expr = id;
1586 debug_expr_ptr = m.second.c_str();
1587 bool multi = true;
1588 auto u = makeCorpusGenUnit(id, summary, ground, graph, ultimate, ue2,
1589 multi, utf8);
1590 if (u) {
1591 corpq.push(move(u));
1592 }
1593 }
1594 }
1595
1596 static
generateTests(CorporaSource & corpora_src,const ExpressionMap & exprMap,TestSummary & summary,const hs_platform_info * plat,const Grey & grey,BoundedQueue<TestUnit> & testq)1597 void generateTests(CorporaSource &corpora_src, const ExpressionMap &exprMap,
1598 TestSummary &summary, const hs_platform_info *plat,
1599 const Grey &grey, BoundedQueue<TestUnit> &testq) {
1600 GraphTruth graph(cout, exprMap);
1601 GroundTruth ground(cout, exprMap, g_matchLimit, g_matchLimitRecursion);
1602 UltimateTruth ultimate(cout, exprMap, plat, grey, g_streamBlocks);
1603
1604 // Construct corpus generator queue and threads.
1605 BoundedQueue<CorpusGenUnit> corpq(numGeneratorThreads,
1606 max_generator_queue_len);
1607 vector<unique_ptr<CorpusGenThread>> generators;
1608 for (size_t i = 0; i < numGeneratorThreads; i++) {
1609 auto c = make_unique<CorpusGenThread>(i, testq, corpq, corpora_src);
1610 c->start();
1611 generators.push_back(move(c));
1612 }
1613
1614 if (g_ue2CompileAll && multicompile_bands) {
1615 printf("Running single-pattern/banded-multi-compile test for %zu "
1616 "expressions.\n\n", exprMap.size());
1617 buildBanded(corpq, summary, ground, graph, ultimate, exprMap);
1618 } else if (g_ue2CompileAll) {
1619 printf("Running single-pattern/multi-compile test for %zu "
1620 "expressions.\n\n", exprMap.size());
1621 buildMulti(corpq, summary, ground, graph, ultimate, exprMap);
1622 } else {
1623 printf("Running single-pattern/single-compile test for %zu "
1624 "expressions.\n\n", exprMap.size());
1625 buildSingle(corpq, summary, ground, graph, ultimate, exprMap);
1626 }
1627
1628 drainGenerators(corpq, generators, summary);
1629 }
1630
1631 static
printSettingsV(const vector<string> & corporaFiles,const hs_platform_info * platform)1632 void printSettingsV(const vector<string> &corporaFiles,
1633 const hs_platform_info *platform) {
1634 cout << "hscollider: The Pattern Collider Mark II\n\n"
1635 << "Number of threads: " << numThreads << " (" << numScannerThreads
1636 << " scanner, " << numGeneratorThreads << " generator)\n"
1637 << "Expression path: " << g_exprPath << "\n"
1638 << "Signature files: ";
1639 if (g_signatureFiles.empty()) {
1640 cout << "none" << endl;
1641 } else {
1642 for (unsigned i = 0; i < g_signatureFiles.size(); i++) {
1643 string &fname = g_signatureFiles[i];
1644 if (i > 0) {
1645 cout << string(20, ' ');
1646 }
1647 cout << fname << endl;
1648 }
1649 }
1650 cout << "Mode of operation: ";
1651
1652 switch (colliderMode) {
1653 case MODE_BLOCK: cout << "block mode"; break;
1654 case MODE_STREAMING: cout << "streaming mode"; break;
1655 case MODE_VECTORED: cout << "vectored mode"; break;
1656 case MODE_HYBRID: cout << "hybrid mode"; break;
1657 }
1658 cout << endl;
1659
1660 if (limit_matches) {
1661 cout << "Terminate scanning after " << limit_matches << " matches."
1662 << endl;
1663 }
1664
1665 if (platform) {
1666 cout << "Cross-compile for: " << to_string(*platform) << endl;
1667 }
1668
1669 if (loadDatabases) {
1670 cout << "Loading DBs from: " << serializePath << endl;
1671 }
1672 if (saveDatabases) {
1673 cout << "Saving DBs to: " << serializePath << endl;
1674 }
1675 if (colliderMode == MODE_STREAMING) {
1676 cout << "Stream block count: " << g_streamBlocks << endl;
1677 }
1678 if (colliderMode == MODE_VECTORED) {
1679 cout << "Vectored block count: " << g_streamBlocks << endl;
1680 }
1681
1682 if (use_UE2) {
1683 if (max_ue2_align == min_ue2_align + 1) {
1684 cout << "UE2 scan alignment: " << min_ue2_align << endl;
1685 } else {
1686 cout << "UE2 scan alignment: [" << min_ue2_align << ", "
1687 << max_ue2_align << ")" << endl;
1688 }
1689 }
1690
1691 if (!corporaFiles.empty()) {
1692 for (const auto &file : corporaFiles) {
1693 cout << "Corpora read from file: " << file << endl;
1694 }
1695 } else {
1696 cout << "Corpora properties: \n"
1697 << " random seed: " << corpus_gen_prop.getSeed() << "\n"
1698 << " percentages: " << corpus_gen_prop.percentMatch()
1699 << "% match, "
1700 << corpus_gen_prop.percentUnmatch() << "% unmatch, "
1701 << corpus_gen_prop.percentRandom() << "% random" << endl;
1702
1703 // prefix and suffix info
1704 const min_max &prefixSpan = corpus_gen_prop.prefixRange;
1705 const min_max &suffixSpan = corpus_gen_prop.suffixRange;
1706 if (prefixSpan.max) {
1707 cout << " random prefix: " << prefixSpan.min << " to "
1708 << prefixSpan.max << endl;
1709 } else {
1710 cout << " random prefix: none" << endl;
1711 }
1712 if (suffixSpan.max) {
1713 cout << " random suffix: " << suffixSpan.min
1714 << " to " << suffixSpan.max << endl;
1715 } else {
1716 cout << " random suffix: none" << endl;
1717 }
1718
1719 // cycle info
1720 pair<unsigned, unsigned> cycleSpan = corpus_gen_prop.getCycleLimit();
1721 cout << " follow cycles: " << cycleSpan.first << " to "
1722 << cycleSpan.second << " times" << endl;
1723 }
1724
1725 if (saveCorpora) {
1726 cout << "Saving corpora to: " << saveCorporaFile << endl;
1727 }
1728
1729 cout << endl;
1730 }
1731
1732 static
printSettingsQ(const vector<string> & corporaFiles,const hs_platform_info * platform)1733 void printSettingsQ(const vector<string> &corporaFiles,
1734 const hs_platform_info *platform) {
1735 cout << "Number of threads: " << numThreads << endl
1736 << "Expression path: " << g_exprPath << endl
1737 << "Signature files: ";
1738 if (g_signatureFiles.empty()) {
1739 cout << "none" << endl;
1740 } else {
1741 for (unsigned i = 0; i < g_signatureFiles.size(); i++) {
1742 string &fname = g_signatureFiles[i];
1743 if (i > 0) {
1744 cout << string(20, ' ');
1745 }
1746 cout << fname << endl;
1747 }
1748 }
1749 cout << "Mode of operation: ";
1750
1751 switch (colliderMode) {
1752 case MODE_BLOCK: cout << "block mode"; break;
1753 case MODE_STREAMING: cout << "streaming mode"; break;
1754 case MODE_VECTORED: cout << "vectored mode"; break;
1755 case MODE_HYBRID: cout << "hybrid mode"; break;
1756 }
1757 cout << endl;
1758
1759 if (limit_matches) {
1760 cout << "Terminate scanning after " << limit_matches << " matches."
1761 << endl;
1762 }
1763
1764 if (platform) {
1765 cout << "Cross-compile for: " << to_string(*platform) << endl;
1766 }
1767
1768 if (colliderMode == MODE_STREAMING) {
1769 cout << "Stream block count: " << g_streamBlocks << endl;
1770 }
1771 if (colliderMode == MODE_VECTORED) {
1772 cout << "Vectored block count: " << g_streamBlocks << endl;
1773 }
1774
1775 if (max_ue2_align == min_ue2_align + 1) {
1776 cout << "UE2 scan alignment: " << min_ue2_align << endl;
1777 } else {
1778 cout << "UE2 scan alignment: [" << min_ue2_align << ", "
1779 << max_ue2_align << ")" << endl;
1780 }
1781
1782 if (!g_corpora_prefix.empty()) {
1783 cout << "Prefix of " << g_corpora_prefix.size() << "bytes" << endl;
1784 }
1785 if (!g_corpora_suffix.empty()) {
1786 cout << "Suffix of " << g_corpora_suffix.size() << "bytes" << endl;
1787 }
1788
1789 if (!corporaFiles.empty()) {
1790 cout << "Corpora: from file" << endl;
1791 } else {
1792 cout << "Corpora: -R " << corpus_gen_prop.getSeed() << " -p "
1793 << corpus_gen_prop.percentMatch() << ","
1794 << corpus_gen_prop.percentUnmatch() << ","
1795 << corpus_gen_prop.percentRandom();
1796
1797 // prefix and suffix info
1798 const min_max &prefixSpan = corpus_gen_prop.prefixRange;
1799 const min_max &suffixSpan = corpus_gen_prop.suffixRange;
1800 if (prefixSpan.max) {
1801 cout << " -P " << prefixSpan.min << "," << prefixSpan.max;
1802 }
1803 if (suffixSpan.max) {
1804 cout << " -S " << suffixSpan.min << "," << suffixSpan.max;
1805 }
1806
1807 // cycle info
1808 pair<unsigned, unsigned> cycleSpan = corpus_gen_prop.getCycleLimit();
1809 cout << " -C " << cycleSpan.first << "," << cycleSpan.second;
1810 cout << endl;
1811 }
1812 }
1813
1814 static
printSettings(const vector<string> & c,const hs_platform_info * plat)1815 void printSettings(const vector<string> &c, const hs_platform_info *plat) {
1816 if (g_quiet > 1) {
1817 printSettingsQ(c, plat);
1818 } else {
1819 printSettingsV(c, plat);
1820 }
1821 }
1822
1823 static
buildCorpora(const vector<string> & corporaFiles,const ExpressionMap & exprMap)1824 unique_ptr<CorporaSource> buildCorpora(const vector<string> &corporaFiles,
1825 const ExpressionMap &exprMap) {
1826 if (!corporaFiles.empty()) {
1827 auto c = ue2::make_unique<FileCorpora>();
1828 for (const auto &file : corporaFiles) {
1829 if (!c->readFile(file)) {
1830 cout << "Error reading corpora from file: " << file << endl;
1831 exit_with_fail();
1832 }
1833 }
1834 return move(c); /* move allows unique_ptr<CorporaSource> conversion */
1835 } else {
1836 auto c = ue2::make_unique<NfaGeneratedCorpora>(
1837 exprMap, corpus_gen_prop, force_utf8, force_prefilter);
1838 return move(c);
1839 }
1840 }
1841
// Returns true if the argument `s` has to be quoted when it is written back
// out as part of a command line: that is, when it is empty or contains a
// blank character (as classified by isblank).
static
bool needsQuotes(const char *s) {
    const size_t len = strlen(s);
    if (len == 0) {
        return true;
    }

    // The character is taken as unsigned char on every platform: passing a
    // negative plain char to isblank is undefined behaviour, and arguments
    // may contain bytes >= 0x80. Calling std::isblank with one argument
    // inside the lambda also selects the <cctype> function without having to
    // disambiguate it from the <locale> overload.
    const char *const last = s + len;
    const char *const hit = std::find_if(
        s, last, [](unsigned char c) { return std::isblank(c) != 0; });
    return hit != last;
}
1861
1862 static
1863 void storeCmdline(int argc, char **argv) {
1864 for (int i = 0; i < argc; i++) {
1865 const char *s = argv[i];
1866 if (needsQuotes(s)) {
1867 g_cmdline += '"';
1868 g_cmdline += s;
1869 g_cmdline += '"';
1870 } else {
1871 g_cmdline += s;
1872 }
1873 if (i != argc - 1) {
1874 g_cmdline += " ";
1875 }
1876 }
1877 }
1878
1879 static
1880 bool runTests(CorporaSource &corpora_source, const ExpressionMap &exprMap,
1881 const hs_platform_info *plat, const Grey &grey) {
1882 TestSummary summary;
1883 summary.totalExpressions = exprMap.size();
1884 BoundedQueue<TestUnit> testq(numScannerThreads, max_scan_queue_len);
1885
1886 // Start scanning threads.
1887 vector<unique_ptr<ScanThread>> scanners;
1888 for (size_t i = 0; i < numScannerThreads; i++) {
1889 auto s = ue2::make_unique<ScanThread>(i, testq, exprMap, plat, grey);
1890 s->start();
1891 scanners.push_back(move(s));
1892 }
1893
1894 generateTests(corpora_source, exprMap, summary, plat, grey, testq);
1895
1896 // Push a sentinel per scanning thread to ensure that everyone finishes
1897 // work.
1898 for (size_t i = 0; i < scanners.size(); i++) {
1899 testq.push(nullptr);
1900 }
1901
1902 // Wait for consumers to end and retrieve their results.
1903 for (size_t i = 0; i < scanners.size(); i++) {
1904 const auto &s = scanners[i];
1905 s->join();
1906
1907 if (g_verbose) {
1908 cout << "Thread " << i << " processed " << s->count << " units."
1909 << endl;
1910 }
1911
1912 summary.merge(s->getSummary());
1913 }
1914
1915 printSummary(summary);
1916 return !summary.hasFailure();
1917 }
1918
1919 int HS_CDECL main(int argc, char *argv[]) {
1920 Grey grey;
1921 vector<string> corporaFiles;
1922
1923 for (int i = 1; i < argc - 1; i++) {
1924 if (!strcmp(argv[i], "-G")) {
1925 cout << "Override: " << argv[i + 1] << endl;
1926 }
1927 }
1928
1929 setDefaults();
1930 storeCmdline(argc, argv);
1931 unique_ptr<hs_platform_info> plat;
1932 corpus_gen_prop.seed(randomSeed);
1933
1934 processArgs(argc, argv, corpus_gen_prop, &corporaFiles, &grey, &plat);
1935
1936 // If the user has asked for a random alignment, we select it here (after
1937 // random number seed applied).
1938 if (use_random_alignment) {
1939 min_ue2_align = corpus_gen_prop.rand(0, 15);
1940 max_ue2_align = min_ue2_align + 1;
1941 }
1942
1943 // Limit memory usage, unless the user has specified zero on the command
1944 // line or in a config file.
1945 if (g_memoryLimit) {
1946 setMemoryLimit(g_memoryLimit * numThreads);
1947 }
1948
1949 // Split threads available up amongst scanner and generator threads.
1950 numGeneratorThreads = max(1u, static_cast<unsigned int>(numThreads * 0.5));
1951 numScannerThreads = max(1u, numThreads - numGeneratorThreads);
1952
1953 ExpressionMap exprMap;
1954 loadExpressions(g_exprPath, exprMap);
1955
1956 if (!g_allSignatures) {
1957 SignatureSet signatures;
1958 if (!g_signatureFiles.empty()) {
1959 for (string &fname : g_signatureFiles) {
1960 loadSignatureList(fname, signatures);
1961 }
1962 } else {
1963 signatures.insert(signatures.end(), g_signatures.begin(),
1964 g_signatures.end());
1965 }
1966
1967 exprMap = limitToSignatures(exprMap, signatures);
1968 }
1969
1970 printSettings(corporaFiles, plat.get());
1971
1972 if (exprMap.empty()) {
1973 cout << "Warning: no signatures to scan. Exiting." << endl;
1974 exit(0);
1975 }
1976
1977 if (!no_signal_handler) {
1978 installSignalHandler();
1979 }
1980
1981 if (saveDatabases || loadDatabases) {
1982 struct stat st;
1983 if (stat(serializePath.c_str(), &st) < 0) {
1984 cout << "Unable to stat serialize path '" << serializePath
1985 << "': " << strerror(errno) << endl;
1986 exit_with_fail();
1987 }
1988 }
1989
1990 // If we're saving corpora out, truncate the output file.
1991 if (saveCorpora) {
1992 corporaOut = ue2::make_unique<CorpusWriter>(saveCorporaFile);
1993 }
1994
1995 GroundTruth::global_prep();
1996
1997 auto corpora_source = buildCorpora(corporaFiles, exprMap);
1998
1999 if (!g_verbose && g_quiet < 2) {
2000 cout << "Only failed tests are displayed." << endl;
2001 }
2002
2003 SimpleTimer timer;
2004 bool success = runTests(*corpora_source, exprMap, plat.get(), grey);
2005 cout << "\nTotal elapsed time: " << timer.elapsed() << " secs." << endl;
2006 exprMap.clear();
2007
2008 if (!success) {
2009 exit_with_fail();
2010 }
2011
2012 return 0;
2013 }
2014