1 /*
2  * Copyright (c) 2015-2019, Intel Corporation
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *  * Redistributions of source code must retain the above copyright notice,
8  *    this list of conditions and the following disclaimer.
9  *  * Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *  * Neither the name of Intel Corporation nor the names of its contributors
13  *    may be used to endorse or promote products derived from this software
14  *    without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include "config.h"
30 
31 #include "BoundedQueue.h"
32 #include "DatabaseProxy.h"
33 #include "FileCorpora.h"
34 #include "GraphTruth.h"
35 #include "GroundTruth.h"
36 #include "NfaGeneratedCorpora.h"
37 #include "Thread.h"
38 #include "UltimateTruth.h"
39 #include "args.h"
40 #include "common.h"
41 #include "cross_compile.h"
42 #include "expressions.h"
43 #include "limit.h"
44 #include "ng_corpus_properties.h"
45 #include "sig.h"
46 #include "simple_timer.h"
47 #include "util/expression_path.h"
48 #include "util/string_util.h"
49 
50 #include "grey.h"
51 #include "hs.h"
52 #include "parser/utf8_validate.h"
53 #include "ue2common.h"
54 #include "util/container.h"
55 #include "util/make_unique.h"
56 
57 #include <algorithm>
58 #include <cassert>
59 #include <cctype>
60 #include <cstdio>
61 #include <cstdlib>
62 #include <fstream>
63 #include <iostream>
64 #include <iterator>
65 #include <mutex>
66 #include <queue>
67 #include <string>
68 #include <thread>
69 #include <vector>
70 
71 #include <errno.h>
72 #include <time.h>
73 #include <sys/types.h>
74 #include <sys/stat.h>
75 
76 using namespace std;
77 using namespace ue2;
78 
79 unsigned int numThreads = 1;
80 unsigned int numScannerThreads = 1;
81 unsigned int numGeneratorThreads = 1;
82 enum ColliderMode colliderMode = MODE_BLOCK;
83 bool echo_matches = false;
84 int g_quiet = 0;
85 bool g_verbose = false;
86 bool g_allSignatures = false;
87 string g_exprPath;
88 vector<string> g_signatureFiles;
89 string g_cmdline;
90 bool g_ue2CompileAll = false;
91 unsigned g_streamBlocks = 0;
92 unsigned long long g_streamOffset = 0;
93 unsigned multicompile_bands = 0;
94 vector<unsigned> g_signatures;
95 unsigned long int g_matchLimit = DEFAULT_PCRE_MATCH_LIMIT;
96 unsigned long int g_matchLimitRecursion = DEFAULT_PCRE_MATCH_RECURSION_LIMIT;
97 string g_corpora_prefix;
98 string g_corpora_suffix;
99 size_t g_memoryLimit = 1000; // megabytes per thread
100 unsigned int somFlags = 0;
101 bool loadDatabases = false;
102 bool saveDatabases = false;
103 bool saveCorpora = false;
104 string saveCorporaFile;
105 string serializePath;
106 bool force_utf8 = false;
107 int force_prefilter = 0;
108 int no_groups = 0;
109 unsigned somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE;
110 unsigned limit_matches = 0;
111 unsigned randomSeed = 0;
112 bool use_random_alignment = false;
113 bool use_PCRE = true;
114 bool use_NFA = true;
115 bool use_UE2 = true;
116 bool use_copy_scratch = false;
117 bool use_copy_stream = false;
118 bool use_mangle_scratch = false;
119 bool use_compress_expand = false;
120 bool use_compress_reset_expand = false;
121 bool use_literal_api = false;
122 int abort_on_failure = 0;
123 int no_signal_handler = 0;
124 size_t max_scan_queue_len = 25000;
125 size_t max_generator_queue_len = 25000;
126 bool force_edit_distance = false;
127 unsigned edit_distance = 0;
128 CorpusProperties corpus_gen_prop;
129 
130 // Semi constants
131 unsigned min_ue2_align = 0;
132 unsigned max_ue2_align = MAX_MAX_UE2_ALIGN;
133 
134 #define DEDUPE_MATCHES
135 
// Number of hardware threads reported by the standard library, but never
// less than one: a zero report from hardware_concurrency() is mapped to 1.
static
unsigned countCores() {
    const unsigned detected = std::thread::hardware_concurrency();
    if (detected == 0) {
        return 1;
    }
    return detected;
}
141 
// Define BUILT_WITH_ASAN when this translation unit is compiled with the
// Address Sanitizer, using whichever probe the compiler (GCC or Clang)
// provides: the __SANITIZE_ADDRESS__ macro or __has_feature().
#ifdef __SANITIZE_ADDRESS__
#  define BUILT_WITH_ASAN
#else
#  ifdef __has_feature
#    if __has_feature(address_sanitizer)
#      define BUILT_WITH_ASAN
#    endif
#  endif
#endif
150 
151 // Set the default params that can be overridden with commandline args
152 static
setDefaults()153 void setDefaults() {
154     // Seed random number generator for corpora
155     randomSeed = time(nullptr);
156     // Overcommit since we have generators and scanners running.
157     numThreads = countCores() * 2;
158 
159 #ifdef BUILT_WITH_ASAN
160     cout << "NOTE: Built with AddressSanitizer.\n"
161          << "Defaulting to no memory limit and no signal handler.\n"
162          << endl;
163     g_memoryLimit = 0;
164     no_signal_handler = 1;
165 #endif
166 }
167 
168 static
exit_with_fail(void)169 void exit_with_fail(void) {
170     cout << "Failing cmdline was:\n  " << g_cmdline << endl;
171     if (abort_on_failure) {
172         cout << "Calling abort()" << endl;
173         abort();
174     }
175     exit(1);
176 }
177 
178 namespace /* anonymous */ {
179 
// Writes corpora out to a file when the -w flag is specified. Several
// threads may call write(), so each call holds a mutex for its duration.
class CorpusWriter {
public:
    // Opens (and truncates) the named file for output.
    explicit CorpusWriter(const std::string &filename)
        : out(filename, std::ios_base::trunc) {}

    // Appends str to the file and flushes it, all under the lock.
    void write(const std::string &str) {
        std::lock_guard<std::mutex> guard(write_lock);
        out << str;
        out.flush();
    }

private:
    std::ofstream out;
    std::mutex write_lock;
};

// The writer instance; starts out null.
std::unique_ptr<CorpusWriter> corporaOut;
198 
// Running totals reported at the end of a test run. Every counter starts at
// zero; merge() folds another summary into this one.
struct TestSummary {
    // Volume processed.
    unsigned totalCorpora = 0;
    unsigned totalExpressions = 0;

    // Corpus generation failures.
    unsigned failCorpora = 0;

    // Compile failures per engine.
    unsigned failPcreCompile = 0;
    unsigned failNGCompile = 0;
    unsigned failUe2Compile = 0;
    unsigned failCompileDifference = 0; // failed in pcre but not ue2

    // Scan failures per engine.
    unsigned failPcreScan = 0;
    unsigned failNGScan = 0;
    unsigned failUe2Scan = 0;

    // Result comparison.
    unsigned failDiff = 0;
    unsigned failNoGroundTruth = 0;

    // Expression ids with failures / with no ground truth.
    std::set<unsigned> failIds;
    std::set<unsigned> nogtIds;

    // True when a match difference, a failing expression id or a compile
    // difference has been recorded.
    bool hasFailure() const {
        if (failDiff != 0) {
            return true;
        }
        if (failCompileDifference != 0) {
            return true;
        }
        return !failIds.empty();
    }

    // Add every counter of 'a' to ours and take the union of the id sets.
    void merge(const TestSummary &a) {
        totalCorpora += a.totalCorpora;
        totalExpressions += a.totalExpressions;

        failCorpora += a.failCorpora;

        failPcreCompile += a.failPcreCompile;
        failNGCompile += a.failNGCompile;
        failUe2Compile += a.failUe2Compile;
        failCompileDifference += a.failCompileDifference;

        failPcreScan += a.failPcreScan;
        failNGScan += a.failNGScan;
        failUe2Scan += a.failUe2Scan;

        failDiff += a.failDiff;
        failNoGroundTruth += a.failNoGroundTruth;

        failIds.insert(a.failIds.begin(), a.failIds.end());
        nogtIds.insert(a.nogtIds.begin(), a.nogtIds.end());
    }
};
238 
// Outcome recorded on a TestUnit. The explicit values spell out the numbering
// the enumerators already had implicitly.
enum TestResult {
    TEST_NO_GROUND_TRUTH = 0, // the state every TestUnit is constructed with
    TEST_PASSED = 1,
    TEST_SKIPPED = 2,
    TEST_FAILED_COMPILE = 3,
    TEST_FAILED = 4
};
246 
247 struct TestUnit {
248     shared_ptr<CompiledPcre> pcre; // libpcre bytecode
249     shared_ptr<CNGInfo> cngi; // NFA graph info (compilation is deferred)
250     shared_ptr<DatabaseProxy> ue2; // ue2 bytecode
251     Corpus corpus; // a local copy, as we may modify it
252 
253     unsigned id; // expression id
254     unsigned corpus_id; // corpus id
255     bool highlander; // single match flag
256     bool prefilter; // prefilter flag
257     bool som; // start of match flag
258     bool multi; // if false, we're in single mode.
259     bool utf8; // at least one of our patterns is utf8
260 
261     enum TestResult result;
262 
TestUnit__anonce675f310111::TestUnit263     TestUnit(unsigned sig_id, unsigned c_id, const Corpus &c,
264              shared_ptr<CompiledPcre> pcre_in, shared_ptr<CNGInfo> cngi_in,
265              shared_ptr<DatabaseProxy> ue2_in, bool multi_in, bool utf8_in,
266              bool highlander_in, bool prefilter_in, bool som_in)
267         : pcre(pcre_in), cngi(cngi_in), ue2(ue2_in), corpus(c), id(sig_id),
268           corpus_id(c_id), highlander(highlander_in), prefilter(prefilter_in),
269           som(som_in), multi(multi_in), utf8(utf8_in),
270           result(TEST_NO_GROUND_TRUTH) {}
271 };
272 
273 } // namespace
274 
275 // For ease of printing match sets
276 static
operator <<(std::ostream & os,const set<MatchResult> & v)277 std::ostream &operator<<(std::ostream &os, const set<MatchResult> &v) {
278     auto vi = v.begin(), ve = v.end();
279     while (vi != ve) {
280         // match offsets
281         os << '(' << vi->from << ',' << vi->to << ')';
282         if (++vi != ve) {
283             os << ", ";
284         }
285     }
286     return os;
287 }
288 
289 static
printCorpus(ostream & out,const Corpus & corpus)290 void printCorpus(ostream &out, const Corpus &corpus) {
291     // Print the offending corpus
292     string corpus_data(corpus.data.begin() + g_corpora_prefix.size(),
293                        corpus.data.end()   - g_corpora_suffix.size());
294     bool trimmed = false;
295     if (corpus_data.size() > 1000) {
296         corpus_data.resize(1000);
297         trimmed = true;
298     }
299     out << "  Corpus data: '" << printable(corpus_data) << "'";
300     if (trimmed) {
301         out << " ...";
302     }
303     out << "\n";
304 }
305 
306 static
printGroundTruthDifference(ostream & out,const ExpressionMap & exprMap,const TestUnit & unit,const ResultSet & pcre_results,const ResultSet & ngw_results)307 void printGroundTruthDifference(ostream &out, const ExpressionMap &exprMap,
308                                 const TestUnit &unit,
309                                 const ResultSet &pcre_results,
310                                 const ResultSet &ngw_results) {
311     assert(contains(exprMap, unit.id));
312     // Print the expression itself
313     out << "  Expression: '" << exprMap.at(unit.id) << "'\n";
314     printCorpus(out, unit.corpus);
315     out << "  PCRE matches: " << pcre_results.matches << "\n";
316     out << "  NFA matches: " << ngw_results.matches << "\n";
317 
318     vector<MatchResult> diff;
319 
320     set_difference(pcre_results.matches.begin(), pcre_results.matches.end(),
321                    ngw_results.matches.begin(), ngw_results.matches.end(),
322                    back_inserter(diff));
323 
324     for (const auto &match : diff) {
325         out << "  PCRE only: match (" << match.from << "," << match.to << ")\n";
326     }
327 
328     diff.clear();
329 
330     set_difference(ngw_results.matches.begin(), ngw_results.matches.end(),
331                    pcre_results.matches.begin(), pcre_results.matches.end(),
332                    back_inserter(diff));
333 
334     for (const auto &match : diff) {
335         out << "  NFA only: match (" << match.from << "," << match.to << ")\n";
336     }
337     out.flush();
338 }
339 
340 // Report the difference information when a pattern causes different matches in
341 // our engines.
342 static
printDifference(ostream & out,const ExpressionMap & exprMap,const TestUnit & unit,const ResultSet & gt_results,const vector<ResultSet> & ue2_results,const vector<bool> & pass)343 void printDifference(ostream &out, const ExpressionMap &exprMap,
344                      const TestUnit &unit, const ResultSet &gt_results,
345                      const vector<ResultSet> &ue2_results,
346                      const vector<bool> &pass) {
347     assert(contains(exprMap, unit.id));
348     // Print the expression itself
349     out << "  Expression: '" << exprMap.at(unit.id) << "'\n";
350     printCorpus(out, unit.corpus);
351     out << "  " << gt_results.src << " matches: " << gt_results.matches << endl;
352 
353     for (u32 align = min_ue2_align; align < max_ue2_align; align++) {
354         if (pass[align]) {
355             continue;
356         }
357 
358         u32 align_in = align;
359         out << "  UE2 (" << align;
360         while (align + 1 < max_ue2_align) {
361             if (pass[align + 1] ||
362                 ue2_results[align] != ue2_results[align + 1]) {
363                 break;
364             }
365             align++;
366         }
367 
368         if (align != align_in) {
369             out << " - " << align;
370         }
371 
372         out << ") matches: " << ue2_results[align].matches;
373         out << endl;
374 
375         vector<MatchResult> only;
376 
377         // Print matches only returned by ground truth
378         set_difference(gt_results.matches.begin(),
379                        gt_results.matches.end(),
380                        ue2_results[align].matches.begin(),
381                        ue2_results[align].matches.end(),
382                        back_inserter(only));
383         for (const auto &match : only) {
384             out << "  " << gt_results.src << " only: match ("
385                 << match.from << "," << match.to << ')' << endl;
386         }
387 
388         // Print matches only returned by UE2
389         only.clear();
390 
391         set_difference(ue2_results[align].matches.begin(),
392                        ue2_results[align].matches.end(),
393                        gt_results.matches.begin(),
394                        gt_results.matches.end(),
395                        back_inserter(only));
396 
397         for (const auto &match : only) {
398             out << "  UE2 only: match (" << match.from << "," << match.to << ')'
399                 << endl;
400         }
401 
402 #ifdef DEDUPE_MATCHES
403         for (const auto &match : ue2_results[align].dupe_matches) {
404             out << "  UE2 dupe:  match (" << match.from << "," << match.to
405                 << ')' << endl;
406         }
407 #endif
408 
409         if (ue2_results[align].uoom) {
410             out << "  *** UE2 produced matches out of order" << endl;
411         }
412         if (ue2_results[align].match_after_halt) {
413             out << "  *** UE2 produced matches after termination" << endl;
414         }
415         if (ue2_results[align].invalid_id) {
416             out << "  *** UE2 produced matches for invalid ids" << endl;
417         }
418     }
419 }
420 
421 static
printMode(void)422 void printMode(void) {
423     if (!g_ue2CompileAll) {
424         cout << "Single/";
425     } else if  (!multicompile_bands) {
426         cout << "Multi/";
427     } else {
428         cout << "Multi-" << multicompile_bands << "/";
429     }
430 
431     switch (colliderMode) {
432         case MODE_BLOCK:
433             cout << "Block";
434             break;
435         case MODE_STREAMING:
436             cout << "Streaming-" << g_streamBlocks;
437             if (g_streamOffset) {
438                 cout << " offset " << g_streamOffset;
439             }
440             if (use_copy_stream) {
441                 cout << " [copy stream]";
442             }
443             if (use_compress_expand) {
444                 cout << " [compress]";
445             }
446             if (use_compress_reset_expand) {
447                 cout << " [compress+reset]";
448             }
449             break;
450         case MODE_VECTORED:
451             cout << "Vectored-" << g_streamBlocks;
452             break;
453         case MODE_HYBRID:
454             cout << "Hybrid";
455             break;
456     }
457 
458     if (use_copy_scratch) {
459         cout << " [copy scratch]";
460     }
461     if (use_mangle_scratch) {
462         cout << " [mangle]";
463     }
464     cout << endl;
465 }
466 
467 static
printSummaryV(const TestSummary & sum)468 void printSummaryV(const TestSummary &sum) {
469     cout << endl;
470     cout << "Summary:" << endl;
471     cout << "Mode:                           ";
472     printMode();
473     cout << "=========" << endl;
474     cout << "Expressions processed:          " << sum.totalExpressions << endl;
475     cout << "Corpora processed:              " << sum.totalCorpora << endl;
476     cout << "Expressions with failures:      " << sum.failIds.size() << endl;
477     cout << "  Corpora generation failures:  " << sum.failCorpora << endl;
478     cout << "  Compilation failures:         ";
479     cout << "pcre:" << sum.failPcreCompile << ", ";
480     cout << "ng:" << sum.failNGCompile << ", ";
481     cout << "ue2:" << sum.failUe2Compile << endl;
482 
483     cout << "  Matching failures:            ";
484     cout << "pcre:" << sum.failPcreScan << ", ";
485     cout << "ng:" << sum.failNGScan << ", ";
486     cout << "ue2:" << sum.failUe2Scan << endl;
487     cout << "  Match differences:            " << sum.failIds.size() << endl;
488     cout << "  No ground truth:              " << sum.nogtIds.size() << endl;
489     cout << "Total match differences:        " << sum.failDiff << endl;
490 }
491 
492 static
printSummaryQ(const TestSummary & sum)493 void printSummaryQ(const TestSummary &sum) {
494     cout << "Summary:     ";
495     printMode();
496 
497     cout << "Processed:   " << sum.totalExpressions << " expressions, "
498          << sum.totalCorpora << " corpora" << endl;
499     cout << "Failures:    " << sum.failIds.size()
500          << " (corpora:   " << sum.failCorpora << "; compile: ";
501     cout << "pcre:" << sum.failPcreCompile << ", ";
502     cout << "ng:" << sum.failNGCompile << ", ";
503     cout << "ue2:" << sum.failUe2Compile << "; match: ";
504 
505     cout << "pcre:" << sum.failPcreScan << ", ";
506     cout << "ng:" << sum.failNGScan << ", ";
507     cout << "ue2:" << sum.failUe2Scan << ")" << endl;
508     cout << "Differences: " << sum.failIds.size() << " expressions, "
509          << sum.failDiff << " total" << endl;
510     cout << "No ground truth: " << sum.nogtIds.size() << " expressions" << endl;
511 }
512 
513 static
printSummary(const TestSummary & sum)514 void printSummary(const TestSummary &sum) {
515     if (g_quiet > 1) {
516         printSummaryQ(sum);
517     } else {
518         printSummaryV(sum);
519     }
520 }
521 
522 // Returns true if this Highlander mode test succeeded.
523 static
checkSingleMatch(const ResultSet & ground_truth,const ResultSet & ue2)524 bool checkSingleMatch(const ResultSet &ground_truth, const ResultSet &ue2) {
525     // In Highlander (single-match) mode, UE2 must return only one of the
526     // matches returned by PCRE/GraphTruth. It need not be the earliest one.
527     if (ground_truth.matches.empty()) {
528         return ue2.matches.empty();
529     } else if (ue2.matches.size() != 1) {
530         return false;
531     } else {
532         return contains(ground_truth.matches, *ue2.matches.begin());
533     }
534 }
535 
536 // Returns true if this prefiltering mode test succeeded.
537 static
checkPrefilterMatch(const ResultSet & ground_truth,const ResultSet & ue2,bool highlander)538 bool checkPrefilterMatch(const ResultSet &ground_truth, const ResultSet &ue2,
539                          bool highlander) {
540     if (highlander) {
541         // Highlander + prefilter is tricky. Best we can do is say that if PCRE
542         // returns matches, UE2 must return a match, though it may not be one
543         // of the ones returned by PCRE (it may be an earlier match).
544         if (!ground_truth.matches.empty()) {
545             return ue2.matches.size() == 1;
546         }
547         // We can't verify anything more.
548         return true;
549     } else if (!limit_matches || ue2.matches.size() < limit_matches) {
550         // In prefilter mode, every match found by PCRE must be found by UE2,
551         // but the UE2 set may be a superset of the PCRE match set.
552         return std::includes(ue2.matches.begin(), ue2.matches.end(),
553                 ground_truth.matches.begin(), ground_truth.matches.end());
554     }
555 
556     // Otherwise, we've hit our match limit. Prefilter mode is quite difficult
557     // to verify in this case, so we just verify that "something happened".
558     return true;
559 }
560 
561 static
makeEndOfMatchOnly(const ResultSet & rs)562 ResultSet makeEndOfMatchOnly(const ResultSet &rs) {
563     ResultSet out(rs.src);
564     for (const auto &match : rs.matches) {
565         out.addMatch(0, match.to);
566     }
567     return out;
568 }
569 
570 static
checkMultiMatch(const ResultSet & ground_truth,const ResultSet & ue2)571 bool checkMultiMatch(const ResultSet &ground_truth, const ResultSet &ue2) {
572     // If we had out-of-order matches or matches after termination, we have a
573     // bug!
574     if (ue2.uoom || ue2.match_after_halt || ue2.invalid_id) {
575         return false;
576     }
577 
578     // If we have more UE2 matches than our limit, we have a bug!
579     if (limit_matches && ue2.matches.size() > limit_matches) {
580         return false;
581     }
582 
583     // If we have more UE2 matches than PCRE matches, we have a bug!
584     if (ue2.matches.size() > ground_truth.matches.size()) {
585         return false;
586     }
587 
588     // If we've got fewer matches than our limit to test, then the match sets
589     // must be identical.
590     if (!limit_matches || ground_truth.matches.size() < limit_matches) {
591         return ground_truth == ue2;
592     }
593 
594     // We're in limit_matches mode _and_ we have hit the limit.  Every match in
595     // 'ue2' must be in 'pcre'. (We can't just trim pcre and do an equality
596     // test as matches may come out of UE2 a little out of order.)
597 
598     // In streaming mode, the limit may mean that we get a different SOM from
599     // the leftmost one. So we compare only end offsets.
600     if (colliderMode == MODE_STREAMING || colliderMode == MODE_VECTORED) {
601         ResultSet gt_eom = makeEndOfMatchOnly(ground_truth);
602         ResultSet ue2_eom = makeEndOfMatchOnly(ue2);
603         return std::includes(gt_eom.matches.begin(), gt_eom.matches.end(),
604                              ue2_eom.matches.begin(), ue2_eom.matches.end());
605     }
606 
607     return std::includes(ground_truth.matches.begin(),
608                          ground_truth.matches.end(),
609                          ue2.matches.begin(), ue2.matches.end());
610 }
611 
612 // Check results, returns true if there has any failure.
613 static
checkTestResults(ostream & out,TestSummary & summary,const ExpressionMap & exprMap,TestUnit & unit,const ResultSet & gt_results,const vector<ResultSet> & ue2_results)614 bool checkTestResults(ostream &out, TestSummary &summary,
615                       const ExpressionMap &exprMap, TestUnit &unit,
616                       const ResultSet &gt_results,
617                       const vector<ResultSet> &ue2_results) {
618     bool failed = false;
619     bool any_fail = false;
620     vector<bool> pass(max_ue2_align, false);
621 
622     for (unsigned align = min_ue2_align; align != max_ue2_align; ++align) {
623         if (unit.prefilter) {
624             failed = !checkPrefilterMatch(gt_results, ue2_results[align],
625                                           unit.highlander);
626         } else if (unit.highlander) {
627             failed = !checkSingleMatch(gt_results, ue2_results[align]);
628         } else {
629             // In non-Highlander mode, the two result sets MUST be equal
630             // don't check PCRE if the scan didn't succeed
631             failed = !checkMultiMatch(gt_results, ue2_results[align]);
632         }
633 
634 #ifdef DEDUPE_MATCHES
635         if (!failed) {
636             failed |= !ue2_results[align].dupe_matches.empty();
637         }
638 #endif
639 
640         pass[align] = !failed;
641 
642         any_fail |= failed;
643 
644         summary.failDiff += failed ? 1 : 0;
645 
646         if (g_verbose) {
647             if (failed) {
648                 out << "FAILED: id " << unit.id << ", alignment " << align
649                     << ", corpus " << unit.corpus_id << ", results differ"
650                     << endl;
651             } else {
652                 out << "PASSED: id " << unit.id << ", alignment " << align
653                     << ", corpus " << unit.corpus_id
654                     << " (matched "<< gt_results.src << ":"
655                     << gt_results.matches.size()
656                     << ", ue2:" << ue2_results[align].matches.size() << ")"
657                     << endl;
658             }
659         }
660     }
661 
662     if (!any_fail) {
663         return false;
664     }
665 
666     if (!g_verbose) {
667         out << "FAILED: id " << unit.id << ", alignment";
668         for (unsigned align = min_ue2_align; align != max_ue2_align; ++align) {
669             if (!pass[align]) {
670                 out << " " << align;
671 
672                 if (align + 1 < max_ue2_align && !pass[align + 1]) {
673                     while (align + 1 < max_ue2_align && !pass[align + 1]) {
674                         align++;
675                     }
676 
677                     out << "-" << align;
678                 }
679             }
680         }
681 
682         out << ", corpus " << unit.corpus_id << ", results differ" << endl;
683     }
684     printDifference(out, exprMap, unit, gt_results, ue2_results, pass);
685 
686     return true;
687 }
688 
689 // Construct a UE2 database, taking care of loading/saving to disk when
690 // appropriate
691 static
constructDatabase(const set<unsigned int> & ids,const UltimateTruth & ultimate)692 shared_ptr<DatabaseProxy> constructDatabase(const set<unsigned int> &ids,
693                                             const UltimateTruth &ultimate) {
694     assert(!ids.empty());
695 
696     if (loadDatabases) {
697         string filename = ultimate.dbFilename(ids);
698         shared_ptr<BaseDB> db = ultimate.loadDatabase(filename, ids);
699         if (!db) {
700             if (!g_quiet) {
701                 cout << "FAILED: could not load database " << filename << endl;
702             }
703             return nullptr;
704         }
705         return make_shared<DatabaseProxy>(db);
706     }
707 
708     shared_ptr<DatabaseProxy> ue2 = make_shared<DatabaseProxy>(ids);
709 
710     try {
711         // If we're not runnable (i.e. we're cross-compiling), let's at least
712         // try to build the database.
713         if (!ultimate.runnable()) {
714             shared_ptr<BaseDB> db = ue2->get(ultimate);
715             assert(db); // throws otherwise
716         }
717 
718         // Compile and save if we've been told to.
719         if (saveDatabases) {
720             string filename = ultimate.dbFilename(ids);
721             if (!ultimate.saveDatabase(*(ue2->get(ultimate)),
722                                        filename.c_str())) {
723                 cout << "FAILED: could not save database to file: " << filename
724                      << endl;
725             }
726         }
727     } catch (const CompileFailed &fail) {
728         if (!g_quiet) {
729             cout << "FAILED: ue2 compile failed for " << *ids.begin() << ": "
730                  << fail.error << endl;
731         }
732         // Return null database to indicate failure.
733         ue2 = nullptr;
734     }
735 
736     return ue2;
737 }
738 
739 static
getGraphTruth(ostream & out,CNGInfo & cngi,GraphTruth & graph,TestUnit & unit,ResultSet & ngw_results,TestSummary & summary,const string & expression)740 bool getGraphTruth(ostream &out, CNGInfo &cngi, GraphTruth &graph,
741                     TestUnit &unit, ResultSet &ngw_results,
742                     TestSummary &summary, const string &expression) {
743     debug_stage = STAGE_GRAPH_RUN;
744 
745     // Skip patterns we've previously marked as bad.
746     if (cngi.is_bad()) {
747         summary.failNGScan++;
748         return false;
749     }
750 
751     // If we already have match information for this corpus, we don't need to
752     // run PCRE at all. At the moment our on-disk format for corpora with match
753     // information only includes the end-of-match offset, so we only use these
754     // in non-som modes. If edit distance is forced, all bets are off so we
755     // ignore this as well.
756     if (!g_streamOffset && unit.corpus.hasMatches && !force_utf8 && !cngi.som &&
757         !force_edit_distance) {
758         if (g_verbose) {
759             out << "Using corpus match set rather than NFA graph" << endl;
760         }
761         ngw_results = ResultSet(unit.corpus.matches, RESULT_FROM_GRAPH);
762     } else {
763         // compile the actual graph
764         const CompiledNG *cng;
765         try {
766             debug_stage = STAGE_GRAPH_COMPILE;
767             cng = cngi.get();
768             debug_stage = STAGE_UNDEFINED;
769         }
770         catch (const NGCompileFailure &err) {
771             debug_stage = STAGE_UNDEFINED;
772             summary.failNGCompile++;
773             summary.failNGScan++;
774             cngi.mark_bad();
775             if (!g_quiet) {
776                 cout << "FAILED: id " << unit.id
777                      << ", NFA graph compile failed (" << err.msg << ")"
778                      << endl;
779             }
780             return false;
781         }
782         debug_stage = STAGE_GRAPH_RUN;
783 
784         // Run NFA graph and collect match information.
785         string error;
786         assert(cng);
787         if (!graph.run(unit.id, *cng, cngi, unit.corpus.data, ngw_results,
788                        error)) {
789             if (!g_quiet) {
790                 out << "FAILED: id " << unit.id
791                     << ", NFA graph scan failed: " << error << "\n"
792                     << "  Expression: '" << expression << "'\n"
793                     << "  Corpus data: '" << printable(unit.corpus.data)
794                     << "'\n"
795                     << "  (note: marking bad, skipping subsequent tests)"
796                     << endl;
797             }
798             summary.failNGScan++;
799             cngi.mark_bad();
800             return false;
801         }
802     }
803 
804     return true;
805 }
806 
807 static
getGroundTruth(ostream & out,CompiledPcre & cpcre,GroundTruth & ground,TestUnit & unit,ResultSet & pcre_results,TestSummary & summary)808 bool getGroundTruth(ostream &out, CompiledPcre &cpcre, GroundTruth &ground,
809                     TestUnit &unit, ResultSet &pcre_results,
810                     TestSummary &summary) {
811     debug_stage = STAGE_PCRE_RUN;
812 
813     // Skip patterns we've previously marked as bad.
814     if (cpcre.is_bad()) {
815         summary.failPcreScan++;
816         return false;
817     }
818 
819     // If we already have match information for this corpus, we don't need to
820     // run PCRE at all. At the moment our on-disk format for corpora with match
821     // information only includes the end-of-match offset, so we only use these
822     // in non-som modes. Also, we can't trust corpus matches if there was an
823     // edit distance requested for all patterns.
824     if (!g_streamOffset && unit.corpus.hasMatches && !force_utf8 && !cpcre.som
825         && !force_edit_distance) {
826         if (g_verbose) {
827             out << "Using corpus match set rather than PCRE" << endl;
828         }
829         pcre_results = ResultSet(unit.corpus.matches, RESULT_FROM_PCRE);
830     } else {
831         // Run PCRE and collect match information.
832         string error;
833         if (!ground.run(unit.id, cpcre, unit.corpus.data, pcre_results,
834                         error)) {
835             if (!g_quiet) {
836                 out << "FAILED: id " << unit.id
837                     << ", libpcre scan failed: " << error << "\n"
838                     << "  Expression: '" << cpcre.expression << "'\n"
839                     << "  Corpus data: '" << printable(unit.corpus.data)
840                     << "'\n"
841                     << "  (note: marking PCRE bad, skipping subsequent tests)"
842                     << endl;
843             }
844             summary.failPcreScan++;
845             cpcre.mark_bad();
846             return false;
847         }
848     }
849 
850     return true;
851 }
852 
853 static
writeCorpus(unsigned id,const Corpus & corpus,const ResultSet & results)854 void writeCorpus(unsigned id, const Corpus &corpus, const ResultSet &results) {
855         assert(corporaOut);
856         ostringstream oss;
857         oss << id << "=\"" << printable(corpus.data) << "\": ";
858 
859         auto vi = results.matches.begin();
860         auto ve = results.matches.end();
861 
862         // Print match end offsets only.
863         while (vi != ve) {
864             oss << vi->to;
865             if (++vi != ve) {
866                 oss << ",";
867             }
868         }
869         oss << "\n";
870         corporaOut->write(oss.str());
871 }
872 
873 static
runTestUnit(ostream & out,GroundTruth & ground,GraphTruth & graph,UltimateTruth & ultimate,TestUnit & unit,TestSummary & summary,const ExpressionMap & exprMap)874 void runTestUnit(ostream &out, GroundTruth &ground, GraphTruth &graph,
875                  UltimateTruth &ultimate, TestUnit &unit, TestSummary &summary,
876                  const ExpressionMap &exprMap) {
877     assert(use_UE2);
878     Corpus &corpus = unit.corpus;
879 
880     shared_ptr<const BaseDB> db;
881     if (use_UE2) {
882         // Acquire UE2 database.
883         debug_stage = STAGE_UE2_COMPILE;
884         try {
885             db = unit.ue2->get(ultimate);
886         } catch (const CompileFailed &fail) {
887             summary.failUe2Compile++;
888             if (!g_quiet) {
889                 out << "FAILED: ue2 compile failed for " << unit.id << ": "
890                     << fail.error << endl;
891                 unit.result = TEST_FAILED_COMPILE;
892                 debug_stage = STAGE_UNDEFINED;
893                 return;
894             }
895         }
896         debug_stage = STAGE_UNDEFINED;
897 
898         if (!db) {
899             // Database previously failed compilation.
900             unit.result = TEST_SKIPPED;
901             return;
902         }
903     }
904 
905     // If the user has specified that they want prefix/suffix data added to
906     // their corpora, we do it here; this is as local as possible to the
907     // test, so we don't keep piles of HUGE corpora hanging around.
908     if (!g_corpora_prefix.empty()) {
909         corpus.data.insert(0, g_corpora_prefix);
910         corpus.hasMatches = false;
911     }
912     if (!g_corpora_suffix.empty()) {
913         corpus.data.append(g_corpora_suffix);
914         corpus.hasMatches = false;
915     }
916 
917     ResultSet gt_results(RESULT_FROM_PCRE);
918     vector<ResultSet> ue2_results(max_ue2_align, ResultSet(RESULT_FROM_UE2));
919 
920     bool gt_done = false;
921 
922     // run PCRE test if enabled and if compile succeeded
923     if (unit.pcre && use_PCRE) {
924         gt_done = getGroundTruth(out, *unit.pcre, ground, unit, gt_results,
925                                  summary);
926     }
927 
928     // run NFA if PCRE failed (or wasn't run), or if we don't run UE2
929     if (unit.cngi && (use_NFA && !gt_done)) {
930         gt_done = getGraphTruth(out, *unit.cngi, graph, unit, gt_results,
931                                 summary, exprMap.find(unit.id)->second);
932     }
933 
934     // both ground truth methods either failed or didn't run
935     if (!gt_done) {
936         unit.result = TEST_NO_GROUND_TRUTH;
937         return;
938     }
939 
940     // Write out corpora if we've been told to
941     if (saveCorpora) {
942         writeCorpus(unit.id, unit.corpus, gt_results);
943     }
944 
945     debug_stage = STAGE_UE2_RUN;
946     for (unsigned int align = min_ue2_align; align != max_ue2_align; ++align) {
947         bool ok = ultimate.run(unit.id, db, corpus.data, !unit.multi, align,
948                                ue2_results[align]);
949 
950         if (!ok) {
951             if (!g_quiet) {
952                 out << "FAILED: id " << unit.id << ", ue2 scan at alignment "
953                     << align << " failed" << endl;
954             }
955             unit.result = TEST_FAILED;
956             debug_stage = STAGE_UNDEFINED;
957             return;
958         }
959     }
960 
961     // if we're using UE2, check all the different results modes
962     if (checkTestResults(out, summary, exprMap, unit, gt_results,
963                          ue2_results)) {
964         unit.result = TEST_FAILED;
965     } else {
966         unit.result = TEST_PASSED;
967     }
968 
969     debug_stage = STAGE_UNDEFINED;
970 }
971 
972 /* Used for testing the graph truth agains PCE */
973 static
runGroundCompTestUnit(ostream & out,GroundTruth & ground,GraphTruth & graph,TestUnit & unit,TestSummary & summary,const ExpressionMap & exprMap)974 void runGroundCompTestUnit(ostream &out, GroundTruth &ground, GraphTruth &graph,
975                            TestUnit &unit, TestSummary &summary,
976                            const ExpressionMap &exprMap) {
977     assert(!use_UE2);
978     assert(use_PCRE);
979     assert(use_NFA);
980     Corpus &corpus = unit.corpus;
981 
982     // If the user has specified that they want prefix/suffix data added to
983     // their corpora, we do it here; this is as local as possible to the
984     // test, so we don't keep piles of HUGE corpora hanging around.
985     if (!g_corpora_prefix.empty()) {
986         corpus.data.insert(0, g_corpora_prefix);
987         corpus.hasMatches = false;
988     }
989     if (!g_corpora_suffix.empty()) {
990         corpus.data.append(g_corpora_suffix);
991         corpus.hasMatches = false;
992     }
993 
994     ResultSet pcre_results(RESULT_FROM_PCRE);
995     ResultSet ngw_results(RESULT_FROM_GRAPH);
996 
997     bool pcreResult = false;
998     bool graphResult = false;
999 
1000     if (unit.pcre) {
1001         pcreResult = getGroundTruth(out, *unit.pcre, ground, unit, pcre_results,
1002                                     summary);
1003     }
1004 
1005     if (unit.cngi) {
1006         graphResult = getGraphTruth(out, *unit.cngi, graph, unit, ngw_results,
1007                                     summary, exprMap.find(unit.id)->second);
1008     }
1009 
1010     // no ground truth found either NFA or PCRE failed
1011     if (!pcreResult || !graphResult) {
1012         unit.result = TEST_NO_GROUND_TRUTH;
1013         return;
1014     }
1015 
1016     // Write out corpora if we've been told to
1017     if (saveCorpora) {
1018         writeCorpus(unit.id, unit.corpus, pcre_results);
1019     }
1020 
1021     if (pcre_results.matches != ngw_results.matches) {
1022         unit.result = TEST_FAILED;
1023         out << "FAILED: id " << unit.id << ", corpus " << unit.corpus_id
1024             << ", results differ" << endl;
1025 
1026         printGroundTruthDifference(out, exprMap, unit, pcre_results,
1027                                    ngw_results);
1028     } else {
1029         unit.result = TEST_PASSED;
1030         if (g_verbose) {
1031             out << "PASSED: id " << unit.id << ", corpus " << unit.corpus_id
1032                 << " (matched pcre:" << pcre_results.matches.size()
1033                 << ", matched ng:" << ngw_results.matches.size() << ")" << endl;
1034         }
1035     }
1036 
1037     debug_stage = STAGE_UNDEFINED;
1038 }
1039 
1040 static
addCorporaToQueue(ostream & out,BoundedQueue<TestUnit> & testq,unsigned id,CorporaSource & corpora,TestSummary & summary,shared_ptr<CompiledPcre> cpcre,shared_ptr<CNGInfo> cngi,shared_ptr<DatabaseProxy> ue2,bool multi,bool utf8)1041 void addCorporaToQueue(ostream &out, BoundedQueue<TestUnit> &testq, unsigned id,
1042                        CorporaSource &corpora, TestSummary &summary,
1043                        shared_ptr<CompiledPcre> cpcre, shared_ptr<CNGInfo> cngi,
1044                        shared_ptr<DatabaseProxy> ue2, bool multi, bool utf8) {
1045     // build corpora
1046     vector<Corpus> c;
1047     try {
1048         corpora.generate(id, c);
1049     }
1050     catch (CorpusFailure &err) {
1051         if (!g_quiet) {
1052             out << "FAILED: id " << id << ", corpora failure: " << err.message
1053                 << endl;
1054         }
1055         summary.failCorpora++;
1056         return;
1057     }
1058 
1059     const bool som = cpcre ? cpcre->som : cngi->som;
1060     const bool prefilter = cpcre ? cpcre->prefilter : cngi->prefilter;
1061     const bool highlander = cpcre ? cpcre->highlander : cngi->highlander;
1062 
1063     // If we're in UTF-8 mode and the corpus isn't valid UTF-8, skip it:
1064     // Hyperscan's behaviour when scanning invalid UTF-8 data in UTF-8 mode
1065     // is undefined.
1066     if (utf8) {
1067         auto is_invalid_utf8 = [](const Corpus &corpus) {
1068             return !isValidUtf8(corpus.data.c_str(), corpus.data.size());
1069         };
1070         c.erase(remove_if(begin(c), end(c), is_invalid_utf8), end(c));
1071     }
1072 
1073     // Collect together corpora units in a container so that we don't have to
1074     // repeatedly lock the queue.
1075     vector<unique_ptr<TestUnit>> tests;
1076     tests.reserve(c.size());
1077 
1078     size_t corpus_id = 0;
1079     for (const Corpus &corpus : c) {
1080         tests.push_back(ue2::make_unique<TestUnit>(id, corpus_id, corpus, cpcre,
1081                                                    cngi, ue2, multi, utf8,
1082                                                    highlander, prefilter, som));
1083         corpus_id++;
1084     }
1085 
1086     testq.push(begin(tests), end(tests));
1087 }
1088 
1089 namespace /* anonymous */ {
1090 
1091 // A subclass of Thread that stores its own output in a stringstream, flushing
1092 // it to cout when necessary.
1093 class OutputThread : public Thread {
1094 public:
OutputThread(size_t id)1095     OutputThread(size_t id) : Thread(id) {}
~OutputThread()1096     ~OutputThread() override {
1097         flush_output();
1098     }
1099 
1100 protected:
flush_output()1101     void flush_output() {
1102         const string &s = out.str();
1103         if (!s.empty()) {
1104             cout << s;
1105             out.str(""); // make empty
1106         }
1107     }
1108 
1109     // Output stream, flushed to cout after every test unit.
1110     stringstream out;
1111 };
1112 
1113 class ScanThread : public OutputThread {
1114 public:
ScanThread(size_t id,BoundedQueue<TestUnit> & testq,const ExpressionMap & e,const hs_platform_info * plat,const Grey & grey)1115     ScanThread(size_t id, BoundedQueue<TestUnit> &testq, const ExpressionMap &e,
1116                const hs_platform_info *plat, const Grey &grey)
1117         : OutputThread(id), q(testq),
1118           ground(out, e, g_matchLimit, g_matchLimitRecursion), graph(out, e),
1119           ultimate(out, e, plat, grey, g_streamBlocks), exprMap(e) {}
1120 
run()1121     void run() override {
1122         DEBUG_PRINTF("thread %zu running\n", thread_id);
1123         for (;;) {
1124             const auto unit = q.pop(thread_id);
1125             if (!unit) {
1126                 // Sentinel value, indicates that we have run out of units to
1127                 // process.
1128                 DEBUG_PRINTF("thread %zu stopped\n", thread_id);
1129                 break;
1130             }
1131 
1132             assert(unit);
1133             assert(exprMap.find(unit->id) != exprMap.end());
1134 
1135             // Debug information is stored in TLS and (hopefully) printed out in
1136             // the event of a crash.
1137             debug_expr = unit->id;
1138             debug_corpus = unit->corpus_id;
1139             debug_corpus_ptr = unit->corpus.data.c_str();
1140             debug_corpus_len = unit->corpus.data.size();
1141             debug_expr_ptr = exprMap.find(unit->id)->second.c_str();
1142 
1143             if (use_UE2) {
1144                 runTestUnit(out, ground, graph, ultimate, *unit, summary,
1145                             exprMap);
1146             } else {
1147                 runGroundCompTestUnit(out, ground, graph, *unit, summary,
1148                                       exprMap);
1149             }
1150 
1151             if (unit->result == TEST_NO_GROUND_TRUTH) {
1152                 summary.nogtIds.insert(unit->id);
1153                 // this is fine, continue
1154             } else if (unit->result == TEST_FAILED) {
1155                 summary.failIds.insert(unit->id);
1156             }
1157 
1158             count++;
1159             summary.totalCorpora++;
1160             flush_output();
1161         }
1162     }
1163 
getSummary() const1164     const TestSummary &getSummary() const { return summary; }
1165 
1166 public:
1167     size_t count = 0; // number of units processed
1168 
1169 private:
1170     // Shared queue.
1171     BoundedQueue<TestUnit> &q;
1172 
1173     // Thread-local data.
1174     GroundTruth ground; // independent copy
1175     GraphTruth graph; // independent copy
1176     UltimateTruth ultimate; // independent copy
1177     TestSummary summary;
1178 
1179     // Constant shared data.
1180     const ExpressionMap &exprMap;
1181 };
1182 
1183 /** Represent a work item for the corpus generation threads. This contains
1184  *  all information relating to an expression. The corpus generator will
1185  *  generate corpora for this expression and enqueue work items representing
1186  *  complete test cases for the scanning threads.
1187  */
1188 struct CorpusGenUnit {
CorpusGenUnit__anonce675f310311::CorpusGenUnit1189     CorpusGenUnit(unique_ptr<CNGInfo> cngi_in, unique_ptr<CompiledPcre> pcre_in,
1190                shared_ptr<DatabaseProxy> ue2_in, unsigned expr_id,
1191                bool multi_in, bool utf8_in)
1192         : cngi(move(cngi_in)), pcre(move(pcre_in)), ue2(ue2_in), id(expr_id),
1193           multi(multi_in), utf8(utf8_in) {}
1194 
1195     unique_ptr<CNGInfo> cngi;
1196     unique_ptr<CompiledPcre> pcre;
1197 
1198     /* ue2 shared_ptr as in multicompile and banded compile it is shared amongst
1199      * various corpus units (with differing expression ids). */
1200     shared_ptr<DatabaseProxy> ue2;
1201 
1202     unsigned id; // expression id
1203     bool multi; // ue2 contains more than one expression
1204     bool utf8; // ue2 can be run against utf8 corpora
1205 };
1206 
1207 class CorpusGenThread : public OutputThread {
1208 public:
CorpusGenThread(size_t id,BoundedQueue<TestUnit> & testq_in,BoundedQueue<CorpusGenUnit> & corpq_in,const CorporaSource & corpora_in)1209     CorpusGenThread(size_t id, BoundedQueue<TestUnit> &testq_in,
1210                     BoundedQueue<CorpusGenUnit> &corpq_in,
1211                     const CorporaSource &corpora_in)
1212         : OutputThread(id), testq(testq_in), corpq(corpq_in),
1213           corpora(corpora_in.clone()) {}
1214 
run()1215     void run() override {
1216         DEBUG_PRINTF("thread %zu running\n", thread_id);
1217         for (;;) {
1218             auto c = corpq.pop(thread_id);
1219             if (!c) {
1220                 break;
1221             }
1222 
1223             addCorporaToQueue(out, testq, c->id, *corpora, summary,
1224                               move(c->pcre), move(c->cngi), c->ue2, c->multi,
1225                               c->utf8);
1226 
1227             count++;
1228             flush_output();
1229         }
1230     }
1231 
getSummary() const1232     const TestSummary &getSummary() const { return summary; }
1233 
1234 public:
1235     size_t count = 0; // number of units processed
1236 
1237 private:
1238     // Output queue, shared between threads.
1239     BoundedQueue<TestUnit> &testq;
1240 
1241     // Input queue, shared between corpus generator threads.
1242     BoundedQueue<CorpusGenUnit> &corpq;
1243 
1244     // Thread-local data.
1245     const unique_ptr<CorporaSource> corpora; // independent copy
1246     TestSummary summary;
1247 };
1248 
1249 } // namespace
1250 
1251 static
makeNGInfo(const unsigned id,TestSummary & summary,GraphTruth & graph,UltimateTruth & ultimate,shared_ptr<DatabaseProxy> ue2)1252 unique_ptr<CNGInfo> makeNGInfo(const unsigned id, TestSummary &summary,
1253                                GraphTruth &graph, UltimateTruth &ultimate,
1254                                shared_ptr<DatabaseProxy> ue2) {
1255     string nfaErr;
1256 
1257     try {
1258         debug_stage = STAGE_GRAPH_PREPROCESS;
1259         auto cngi = graph.preprocess(id);
1260         debug_stage = STAGE_UNDEFINED;
1261         return cngi;
1262     }
1263     catch (const NGCompileFailure &err) {
1264         nfaErr = err.msg;
1265         debug_stage = STAGE_UNDEFINED;
1266         // fall through
1267     }
1268     catch (const NGUnsupportedFailure &err) {
1269         // unsupported error happens when the pattern appears to be valid, but
1270         // there are things that we don't yet support (e.g. SOM).
1271         // in this case, try again, suppressing the errors
1272         debug_stage = STAGE_UNDEFINED;
1273         summary.failNGCompile++;
1274 
1275         // try again and suppress unsupported errors
1276         try {
1277             debug_stage = STAGE_GRAPH_PREPROCESS;
1278             auto cngi = graph.preprocess(id, true);
1279             debug_stage = STAGE_UNDEFINED;
1280 
1281             // preprocess succeeded - that means the pattern itself is valid.
1282             // however, we can't use it, so we have to mark it as bad
1283             // only print the error in the following cases:
1284             // 1) if verbose is specified
1285             // 2) if we are not using UE2 and quiet is NOT specified
1286             if ((!use_UE2 && !g_quiet) || g_verbose) {
1287                 cout << "FAILED: id " << id << ", NFA graph preprocess failed ("
1288                      << err.msg << ")" << endl;
1289             }
1290             cngi->mark_bad();
1291             return cngi;
1292         }
1293         catch (const NGCompileFailure &e) {
1294             // compile failed
1295             nfaErr = e.msg;
1296             debug_stage = STAGE_UNDEFINED;
1297             // fall through
1298         }
1299     }
1300 
1301     // We should ensure that we also fail compilation with UE2, otherwise we
1302     // likely have a pattern support bug.
1303     try {
1304         auto db = ue2->get(ultimate);
1305         if (db) {
1306             // if we made it this far, that means UE2 compile succeeded while
1307             // NFA compile failed.
1308             cout << "FAILED: id " << id << ", NFA graph preprocess failed ("
1309                  << nfaErr << ") but UE2 compile succeeded." << endl;
1310             summary.failNGCompile++;
1311             summary.failCompileDifference++;
1312             return nullptr;
1313         }
1314         // If db is nullptr, we have previously failed compilation of this
1315         // database.
1316     }
1317     catch (const CompileFailed &) {
1318         // Everything's OK: compilation failed in Hyperscan as well. Fall
1319         // through.
1320     }
1321     summary.failNGCompile++;
1322     if (!g_quiet) {
1323         cout << "FAILED: id " << id << ", NFA graph preprocess failed ("
1324              << nfaErr << ")" << endl;
1325     }
1326     return nullptr;
1327 }
1328 
1329 static
makePcre(const unsigned id,TestSummary & summary,GroundTruth & ground,UltimateTruth & ultimate,shared_ptr<DatabaseProxy> ue2)1330 unique_ptr<CompiledPcre> makePcre(const unsigned id, TestSummary &summary,
1331                                   GroundTruth &ground, UltimateTruth &ultimate,
1332                                   shared_ptr<DatabaseProxy> ue2) {
1333     string pcreErr;
1334 
1335     try {
1336         debug_stage = STAGE_PCRE_COMPILE;
1337         auto cpcre = ground.compile(id);
1338         debug_stage = STAGE_UNDEFINED;
1339         return cpcre;
1340     }
1341     catch (const SoftPcreCompileFailure &err) {
1342         debug_stage = STAGE_UNDEFINED;
1343         summary.failPcreCompile++;
1344         if (g_verbose) {
1345             cout << "FAILED: id " << id
1346                  << ", libpcre compile failed with soft error: " << err.msg
1347                  << endl;
1348         }
1349         return nullptr;
1350     }
1351     catch (const PcreCompileFailure &err) {
1352         debug_stage = STAGE_UNDEFINED;
1353         pcreErr = err.msg;
1354         // fall through
1355     }
1356 
1357     // We should ensure that we also fail compilation with UE2, otherwise we
1358     // likely have a pattern support bug.
1359     try {
1360         auto db = ue2->get(ultimate);
1361         if (db) {
1362             // OK, so now we have a situation: PCRE failed but UE2 succeeded.
1363             // There is one situation where this is legal: patterns beginning
1364             // with (*UTF8), which will throw an error due to the callback
1365             // wrapping we do for PCRE. We can check these by trying to compile
1366             // an "unwrapped" PCRE.
1367             ground.compile(id, true);
1368             // If we didn't throw, PCRE failed above but succeeded when not
1369             // wrapped in a callback, and UE2 succeeded. Not worth reporting,
1370             // fall through.
1371         }
1372     }
1373     catch (const CompileFailed &) {
1374         // Everything's OK: compilation failed in Hyperscan as well. Fall
1375         // through.
1376     }
1377     catch (const PcreCompileFailure &) {
1378         cout << "FAILED: id " << id << ", libpcre compile failed (" << pcreErr
1379              << ") but UE2 compile succeeded." << endl;
1380         summary.failPcreCompile++;
1381         summary.failCompileDifference++;
1382         return nullptr;
1383     }
1384 
1385     if (!g_quiet) {
1386         cout << "FAILED: id " << id << ", libpcre compile failed: " << pcreErr
1387              << endl;
1388     }
1389 
1390     summary.failPcreCompile++;
1391     return nullptr;
1392 }
1393 
1394 static
drainGenerators(BoundedQueue<CorpusGenUnit> & corpq,vector<unique_ptr<CorpusGenThread>> & generators,TestSummary & summary)1395 void drainGenerators(BoundedQueue<CorpusGenUnit> &corpq,
1396                      vector<unique_ptr<CorpusGenThread>> &generators,
1397                      TestSummary &summary) {
1398     // Push a sentinel per thread.
1399     for (size_t i = 0; i < generators.size(); i++) {
1400         corpq.push(nullptr);
1401     }
1402 
1403     // Wait for workers to end and retrieve their results.
1404     for (auto &c : generators) {
1405         c->join();
1406         summary.merge(c->getSummary());
1407     }
1408 }
1409 
1410 // Note: In multi-pattern cases, utf8 is true if any pattern to be run against
1411 // this corpus is in UTF-8 mode.
1412 static
makeCorpusGenUnit(unsigned id,TestSummary & summary,GroundTruth & ground,GraphTruth & graph,UltimateTruth & ultimate,shared_ptr<DatabaseProxy> ue2,bool multi,bool utf8)1413 unique_ptr<CorpusGenUnit> makeCorpusGenUnit(unsigned id, TestSummary &summary,
1414                                             GroundTruth &ground,
1415                                             GraphTruth &graph,
1416                                             UltimateTruth &ultimate,
1417                                             shared_ptr<DatabaseProxy> ue2,
1418                                             bool multi, bool utf8) {
1419     unique_ptr<CompiledPcre> cpcre;
1420     unique_ptr<CNGInfo> cngi;
1421 
1422     // compile PCRE bytecode
1423     if (use_PCRE) {
1424         cpcre = makePcre(id, summary, ground, ultimate, ue2);
1425     }
1426     if (use_NFA) {
1427         cngi = makeNGInfo(id, summary, graph, ultimate, ue2);
1428     }
1429 
1430     // if both compiles failed, skip the test
1431     if (!cpcre && !cngi) {
1432         return nullptr;
1433     }
1434 
1435     // Caller may already have set the UTF-8 property (in multi cases)
1436     utf8 |= cpcre ? cpcre->utf8 : cngi->utf8;
1437 
1438     return ue2::make_unique<CorpusGenUnit>(move(cngi), move(cpcre), ue2, id,
1439                                            multi, utf8);
1440 }
1441 
1442 static
hasUTF8Pattern(GroundTruth & ground,ExpressionMap::const_iterator it,ExpressionMap::const_iterator end)1443 bool hasUTF8Pattern(GroundTruth &ground, ExpressionMap::const_iterator it,
1444                     ExpressionMap::const_iterator end) {
1445     /* note: we cannot just check the flags as utf8 can be enabled in the
1446      * pattern itself with (*UTF) */
1447     debug_stage = STAGE_PCRE_COMPILE;
1448     for (; it != end; ++it) {
1449         try {
1450             auto cpcre = ground.compile(it->first);
1451             assert(cpcre); // Would have thrown PcreCompileFailure otherwise.
1452             if (cpcre->utf8) {
1453                 DEBUG_PRINTF("UTF8 mode\n");
1454                 debug_stage = STAGE_UNDEFINED;
1455                 return true;
1456             }
1457         }
1458         catch (const PcreCompileFailure &) {
1459             continue;
1460         }
1461     }
1462     debug_stage = STAGE_UNDEFINED;
1463     return false;
1464 }
1465 
1466 // Fill a test queue with single-pattern tests.
1467 static
buildSingle(BoundedQueue<CorpusGenUnit> & corpq,TestSummary & summary,GroundTruth & ground,GraphTruth & graph,UltimateTruth & ultimate,const ExpressionMap & exprMap)1468 void buildSingle(BoundedQueue<CorpusGenUnit> &corpq, TestSummary &summary,
1469                  GroundTruth &ground, GraphTruth &graph,
1470                  UltimateTruth &ultimate, const ExpressionMap &exprMap) {
1471     for (const auto &m : exprMap) {
1472         unsigned id = m.first;
1473         debug_expr = id;
1474         debug_expr_ptr = m.second.c_str();
1475 
1476         shared_ptr<DatabaseProxy> ue2 = constructDatabase({id}, ultimate);
1477         if (!ue2) {
1478             summary.failUe2Compile++;
1479             continue;
1480         }
1481 
1482         // if we're cross-compiling, then we don't bother building PCRE and
1483         // running scans, we're just going to output the database bytecode.
1484         if (!ultimate.runnable()) {
1485             continue;
1486         }
1487 
1488         bool multi = false;
1489         bool utf8 = false;
1490         auto u = makeCorpusGenUnit(id, summary, ground, graph, ultimate, ue2,
1491                                    multi, utf8);
1492         if (u) {
1493             corpq.push(move(u));
1494         }
1495     }
1496 }
1497 
1498 // Fill a test queue with multi-pattern tests of size N, where N is the band
1499 // size specified on the command line.
1500 static
buildBanded(BoundedQueue<CorpusGenUnit> & corpq,TestSummary & summary,GroundTruth & ground,GraphTruth & graph,UltimateTruth & ultimate,const ExpressionMap & exprMap)1501 void buildBanded(BoundedQueue<CorpusGenUnit> &corpq, TestSummary &summary,
1502                  GroundTruth &ground, GraphTruth &graph,
1503                  UltimateTruth &ultimate, const ExpressionMap &exprMap) {
1504     for (auto i = exprMap.begin(), e = exprMap.end(); i != e;) {
1505         debug_expr = i->first;
1506         debug_expr_ptr = i->second.c_str();
1507 
1508         // Build a set of IDs in this band from the expression map
1509         set<unsigned> bandIds;
1510 
1511         if (g_verbose) {
1512             cout << "Building set:";
1513         }
1514 
1515         ExpressionMap::const_iterator band_end = i;
1516         for (u32 j = 0; j < multicompile_bands && band_end != e;
1517              j++, ++band_end) {
1518             bandIds.insert(bandIds.end(), band_end->first);
1519             if (g_verbose) {
1520                 cout << " " << band_end->first;
1521             }
1522         }
1523 
1524         if (g_verbose) {
1525             cout << endl;
1526         }
1527 
1528         // compile UE2 bytecode
1529         shared_ptr<DatabaseProxy> ue2 = constructDatabase(bandIds, ultimate);
1530         if (!ue2) {
1531             summary.failUe2Compile++;
1532             i = band_end;
1533             continue;
1534         }
1535 
1536         // if we're cross-compiling, then we don't bother building PCRE and
1537         // running scans, we're just going to output the database bytecode.
1538         if (!ultimate.runnable()) {
1539             i = band_end;
1540             continue;
1541         }
1542 
1543         bool utf8 = hasUTF8Pattern(ground, i, band_end);
1544 
1545         for (; i != band_end; ++i) {
1546             unsigned id = i->first;
1547             bool multi = true;
1548             auto u = makeCorpusGenUnit(id, summary, ground, graph, ultimate,
1549                                        ue2, multi, utf8);
1550             if (u) {
1551                 corpq.push(move(u));
1552             }
1553         }
1554     }
1555 }
1556 
1557 // Fill a test queue with multi-pattern tests.
1558 static
buildMulti(BoundedQueue<CorpusGenUnit> & corpq,TestSummary & summary,GroundTruth & ground,GraphTruth & graph,UltimateTruth & ultimate,const ExpressionMap & exprMap)1559 void buildMulti(BoundedQueue<CorpusGenUnit> &corpq, TestSummary &summary,
1560                 GroundTruth &ground, GraphTruth &graph, UltimateTruth &ultimate,
1561                 const ExpressionMap &exprMap) {
1562     // Build a set of all IDs from the expression map
1563     set<unsigned> idsAll;
1564     for (const auto &e : exprMap) {
1565         idsAll.insert(e.first);
1566     }
1567 
1568     // Compile in UE2
1569     shared_ptr<DatabaseProxy> ue2 = constructDatabase(idsAll, ultimate);
1570     if (!ue2) {
1571         summary.failUe2Compile++;
1572         return;
1573     }
1574 
1575     // if we're cross-compiling, then we don't bother building PCRE and
1576     // running scans, we're just going to output the database bytecode.
1577     if (!ultimate.runnable()) {
1578         return;
1579     }
1580 
1581     bool utf8 = hasUTF8Pattern(ground, exprMap.begin(), exprMap.end());
1582 
1583     for (const auto &m : exprMap) {
1584         unsigned id = m.first;
1585         debug_expr = id;
1586         debug_expr_ptr = m.second.c_str();
1587         bool multi = true;
1588         auto u = makeCorpusGenUnit(id, summary, ground, graph, ultimate, ue2,
1589                                    multi, utf8);
1590         if (u) {
1591             corpq.push(move(u));
1592         }
1593     }
1594 }
1595 
1596 static
generateTests(CorporaSource & corpora_src,const ExpressionMap & exprMap,TestSummary & summary,const hs_platform_info * plat,const Grey & grey,BoundedQueue<TestUnit> & testq)1597 void generateTests(CorporaSource &corpora_src, const ExpressionMap &exprMap,
1598                    TestSummary &summary, const hs_platform_info *plat,
1599                    const Grey &grey, BoundedQueue<TestUnit> &testq) {
1600     GraphTruth graph(cout, exprMap);
1601     GroundTruth ground(cout, exprMap, g_matchLimit, g_matchLimitRecursion);
1602     UltimateTruth ultimate(cout, exprMap, plat, grey, g_streamBlocks);
1603 
1604     // Construct corpus generator queue and threads.
1605     BoundedQueue<CorpusGenUnit> corpq(numGeneratorThreads,
1606                                       max_generator_queue_len);
1607     vector<unique_ptr<CorpusGenThread>> generators;
1608     for (size_t i = 0; i < numGeneratorThreads; i++) {
1609         auto c = make_unique<CorpusGenThread>(i, testq, corpq, corpora_src);
1610         c->start();
1611         generators.push_back(move(c));
1612     }
1613 
1614     if (g_ue2CompileAll && multicompile_bands) {
1615         printf("Running single-pattern/banded-multi-compile test for %zu "
1616                "expressions.\n\n", exprMap.size());
1617         buildBanded(corpq, summary, ground, graph, ultimate, exprMap);
1618     } else if (g_ue2CompileAll) {
1619         printf("Running single-pattern/multi-compile test for %zu "
1620                "expressions.\n\n", exprMap.size());
1621         buildMulti(corpq, summary, ground, graph, ultimate, exprMap);
1622     } else {
1623         printf("Running single-pattern/single-compile test for %zu "
1624                "expressions.\n\n", exprMap.size());
1625         buildSingle(corpq, summary, ground, graph, ultimate, exprMap);
1626     }
1627 
1628     drainGenerators(corpq, generators, summary);
1629 }
1630 
1631 static
printSettingsV(const vector<string> & corporaFiles,const hs_platform_info * platform)1632 void printSettingsV(const vector<string> &corporaFiles,
1633                     const hs_platform_info *platform) {
1634     cout << "hscollider: The Pattern Collider Mark II\n\n"
1635          << "Number of threads:  " << numThreads << " (" << numScannerThreads
1636          << " scanner, " << numGeneratorThreads << " generator)\n"
1637          << "Expression path:    " << g_exprPath << "\n"
1638          << "Signature files:    ";
1639     if (g_signatureFiles.empty()) {
1640         cout << "none" << endl;
1641     } else {
1642         for (unsigned i = 0; i < g_signatureFiles.size(); i++) {
1643             string &fname = g_signatureFiles[i];
1644             if (i > 0) {
1645                 cout << string(20, ' ');
1646             }
1647             cout << fname << endl;
1648         }
1649     }
1650     cout << "Mode of operation:  ";
1651 
1652     switch (colliderMode) {
1653         case MODE_BLOCK:        cout << "block mode"; break;
1654         case MODE_STREAMING:    cout << "streaming mode"; break;
1655         case MODE_VECTORED:     cout << "vectored mode"; break;
1656         case MODE_HYBRID:       cout << "hybrid mode"; break;
1657     }
1658     cout << endl;
1659 
1660     if (limit_matches) {
1661         cout << "Terminate scanning after " << limit_matches << " matches."
1662              << endl;
1663     }
1664 
1665     if (platform) {
1666         cout << "Cross-compile for:  " << to_string(*platform) << endl;
1667     }
1668 
1669     if (loadDatabases) {
1670         cout << "Loading DBs from:   " << serializePath << endl;
1671     }
1672     if (saveDatabases) {
1673         cout << "Saving DBs to:      " << serializePath << endl;
1674     }
1675     if (colliderMode == MODE_STREAMING) {
1676         cout << "Stream block count: " << g_streamBlocks << endl;
1677     }
1678     if (colliderMode == MODE_VECTORED) {
1679         cout << "Vectored block count: " << g_streamBlocks << endl;
1680     }
1681 
1682     if (use_UE2) {
1683         if (max_ue2_align == min_ue2_align + 1) {
1684             cout << "UE2 scan alignment: " << min_ue2_align << endl;
1685         } else {
1686             cout << "UE2 scan alignment: [" << min_ue2_align << ", "
1687                  << max_ue2_align << ")" << endl;
1688         }
1689     }
1690 
1691     if (!corporaFiles.empty()) {
1692         for (const auto &file : corporaFiles) {
1693             cout << "Corpora read from file: " << file << endl;
1694         }
1695     } else {
1696         cout << "Corpora properties: \n"
1697              << "  random seed:      " << corpus_gen_prop.getSeed() << "\n"
1698              << "  percentages:      " << corpus_gen_prop.percentMatch()
1699              << "% match, "
1700              << corpus_gen_prop.percentUnmatch() << "% unmatch, "
1701              << corpus_gen_prop.percentRandom() << "% random" << endl;
1702 
1703         // prefix and suffix info
1704         const min_max &prefixSpan = corpus_gen_prop.prefixRange;
1705         const min_max &suffixSpan = corpus_gen_prop.suffixRange;
1706         if (prefixSpan.max) {
1707             cout << "  random prefix:    " << prefixSpan.min << " to "
1708                  << prefixSpan.max << endl;
1709         } else {
1710             cout << "  random prefix:    none" << endl;
1711         }
1712         if (suffixSpan.max) {
1713             cout << "  random suffix:    " << suffixSpan.min
1714                  << " to " << suffixSpan.max << endl;
1715         } else {
1716             cout << "  random suffix:    none" << endl;
1717         }
1718 
1719         // cycle info
1720         pair<unsigned, unsigned> cycleSpan = corpus_gen_prop.getCycleLimit();
1721         cout << "  follow cycles:    " << cycleSpan.first << " to "
1722              << cycleSpan.second << " times" << endl;
1723     }
1724 
1725     if (saveCorpora) {
1726         cout << "Saving corpora to:  " << saveCorporaFile << endl;
1727     }
1728 
1729     cout << endl;
1730 }
1731 
1732 static
printSettingsQ(const vector<string> & corporaFiles,const hs_platform_info * platform)1733 void printSettingsQ(const vector<string> &corporaFiles,
1734                     const hs_platform_info *platform) {
1735     cout << "Number of threads:  " << numThreads << endl
1736          << "Expression path:    " << g_exprPath << endl
1737          << "Signature files:    ";
1738     if (g_signatureFiles.empty()) {
1739         cout << "none" << endl;
1740     } else {
1741         for (unsigned i = 0; i < g_signatureFiles.size(); i++) {
1742             string &fname = g_signatureFiles[i];
1743             if (i > 0) {
1744                 cout << string(20, ' ');
1745             }
1746             cout << fname << endl;
1747         }
1748     }
1749     cout << "Mode of operation:  ";
1750 
1751     switch (colliderMode) {
1752         case MODE_BLOCK:        cout << "block mode"; break;
1753         case MODE_STREAMING:    cout << "streaming mode"; break;
1754         case MODE_VECTORED:     cout << "vectored mode"; break;
1755         case MODE_HYBRID:       cout << "hybrid mode"; break;
1756     }
1757     cout << endl;
1758 
1759     if (limit_matches) {
1760         cout << "Terminate scanning after " << limit_matches << " matches."
1761              << endl;
1762     }
1763 
1764     if (platform) {
1765         cout << "Cross-compile for:  " << to_string(*platform) << endl;
1766     }
1767 
1768     if (colliderMode == MODE_STREAMING) {
1769         cout << "Stream block count: " << g_streamBlocks << endl;
1770     }
1771     if (colliderMode == MODE_VECTORED) {
1772         cout << "Vectored block count: " << g_streamBlocks << endl;
1773     }
1774 
1775     if (max_ue2_align == min_ue2_align + 1) {
1776         cout << "UE2 scan alignment: " << min_ue2_align << endl;
1777     } else {
1778         cout << "UE2 scan alignment: [" << min_ue2_align << ", "
1779              << max_ue2_align << ")" << endl;
1780     }
1781 
1782     if (!g_corpora_prefix.empty()) {
1783         cout << "Prefix of " << g_corpora_prefix.size() << "bytes" << endl;
1784     }
1785     if (!g_corpora_suffix.empty()) {
1786         cout << "Suffix of " << g_corpora_suffix.size() << "bytes" << endl;
1787     }
1788 
1789     if (!corporaFiles.empty()) {
1790         cout << "Corpora: from file" << endl;
1791     } else {
1792         cout << "Corpora: -R " << corpus_gen_prop.getSeed() << " -p "
1793              << corpus_gen_prop.percentMatch() << ","
1794              << corpus_gen_prop.percentUnmatch() << ","
1795              << corpus_gen_prop.percentRandom();
1796 
1797         // prefix and suffix info
1798         const min_max &prefixSpan = corpus_gen_prop.prefixRange;
1799         const min_max &suffixSpan = corpus_gen_prop.suffixRange;
1800         if (prefixSpan.max) {
1801             cout << " -P " << prefixSpan.min << "," << prefixSpan.max;
1802         }
1803         if (suffixSpan.max) {
1804             cout << " -S " << suffixSpan.min << "," << suffixSpan.max;
1805         }
1806 
1807         // cycle info
1808         pair<unsigned, unsigned> cycleSpan = corpus_gen_prop.getCycleLimit();
1809         cout << " -C " << cycleSpan.first << "," << cycleSpan.second;
1810         cout << endl;
1811     }
1812 }
1813 
1814 static
printSettings(const vector<string> & c,const hs_platform_info * plat)1815 void printSettings(const vector<string> &c, const hs_platform_info *plat) {
1816     if (g_quiet > 1) {
1817         printSettingsQ(c, plat);
1818     } else {
1819         printSettingsV(c, plat);
1820     }
1821 }
1822 
1823 static
buildCorpora(const vector<string> & corporaFiles,const ExpressionMap & exprMap)1824 unique_ptr<CorporaSource> buildCorpora(const vector<string> &corporaFiles,
1825                                        const ExpressionMap &exprMap) {
1826     if (!corporaFiles.empty()) {
1827         auto c = ue2::make_unique<FileCorpora>();
1828         for (const auto &file : corporaFiles) {
1829             if (!c->readFile(file)) {
1830                 cout << "Error reading corpora from file: " << file << endl;
1831                 exit_with_fail();
1832             }
1833         }
1834         return move(c); /* move allows unique_ptr<CorporaSource> conversion */
1835     } else {
1836         auto c = ue2::make_unique<NfaGeneratedCorpora>(
1837             exprMap, corpus_gen_prop, force_utf8, force_prefilter);
1838         return move(c);
1839     }
1840 }
1841 
// Decide whether a command-line argument must be wrapped in quotes when the
// command line is echoed back.
//   s: NUL-terminated argument string (must not be null).
// Returns true when s is empty or contains a blank character as classified
// by std::isblank, and false otherwise.
static
bool needsQuotes(const char *s) {
    // An empty argument would vanish from the echoed command line unless it
    // is quoted.
    if (*s == '\0') {
        return true;
    }

    for (const char *p = s; *p != '\0'; ++p) {
        // The <cctype> classifiers require a value representable as unsigned
        // char (or EOF); handing them a plain char is undefined behaviour for
        // bytes with the high bit set where char is signed, so convert first.
        // The same code path is used on every platform.
        if (std::isblank(static_cast<unsigned char>(*p))) {
            return true;
        }
    }

    return false;
}
1861 
1862 static
1863 void storeCmdline(int argc, char **argv) {
1864     for (int i = 0; i < argc; i++) {
1865         const char *s = argv[i];
1866         if (needsQuotes(s)) {
1867             g_cmdline += '"';
1868             g_cmdline += s;
1869             g_cmdline += '"';
1870         } else {
1871             g_cmdline += s;
1872         }
1873         if (i != argc - 1) {
1874             g_cmdline += " ";
1875         }
1876     }
1877 }
1878 
1879 static
1880 bool runTests(CorporaSource &corpora_source, const ExpressionMap &exprMap,
1881               const hs_platform_info *plat, const Grey &grey) {
1882     TestSummary summary;
1883     summary.totalExpressions = exprMap.size();
1884     BoundedQueue<TestUnit> testq(numScannerThreads, max_scan_queue_len);
1885 
1886     // Start scanning threads.
1887     vector<unique_ptr<ScanThread>> scanners;
1888     for (size_t i = 0; i < numScannerThreads; i++) {
1889         auto s = ue2::make_unique<ScanThread>(i, testq, exprMap, plat, grey);
1890         s->start();
1891         scanners.push_back(move(s));
1892     }
1893 
1894     generateTests(corpora_source, exprMap, summary, plat, grey, testq);
1895 
1896     // Push a sentinel per scanning thread to ensure that everyone finishes
1897     // work.
1898     for (size_t i = 0; i < scanners.size(); i++) {
1899         testq.push(nullptr);
1900     }
1901 
1902     // Wait for consumers to end and retrieve their results.
1903     for (size_t i = 0; i < scanners.size(); i++) {
1904         const auto &s = scanners[i];
1905         s->join();
1906 
1907         if (g_verbose) {
1908             cout << "Thread " << i << " processed " << s->count << " units."
1909                  << endl;
1910         }
1911 
1912         summary.merge(s->getSummary());
1913     }
1914 
1915     printSummary(summary);
1916     return !summary.hasFailure();
1917 }
1918 
1919 int HS_CDECL main(int argc, char *argv[]) {
1920     Grey grey;
1921     vector<string> corporaFiles;
1922 
1923     for (int i = 1; i < argc - 1; i++) {
1924         if (!strcmp(argv[i], "-G")) {
1925             cout << "Override: " << argv[i + 1] << endl;
1926         }
1927     }
1928 
1929     setDefaults();
1930     storeCmdline(argc, argv);
1931     unique_ptr<hs_platform_info> plat;
1932     corpus_gen_prop.seed(randomSeed);
1933 
1934     processArgs(argc, argv, corpus_gen_prop, &corporaFiles, &grey, &plat);
1935 
1936     // If the user has asked for a random alignment, we select it here (after
1937     // random number seed applied).
1938     if (use_random_alignment) {
1939         min_ue2_align = corpus_gen_prop.rand(0, 15);
1940         max_ue2_align = min_ue2_align + 1;
1941     }
1942 
1943     // Limit memory usage, unless the user has specified zero on the command
1944     // line or in a config file.
1945     if (g_memoryLimit) {
1946         setMemoryLimit(g_memoryLimit * numThreads);
1947     }
1948 
1949     // Split threads available up amongst scanner and generator threads.
1950     numGeneratorThreads = max(1u, static_cast<unsigned int>(numThreads * 0.5));
1951     numScannerThreads = max(1u, numThreads - numGeneratorThreads);
1952 
1953     ExpressionMap exprMap;
1954     loadExpressions(g_exprPath, exprMap);
1955 
1956     if (!g_allSignatures) {
1957         SignatureSet signatures;
1958         if (!g_signatureFiles.empty()) {
1959             for (string &fname : g_signatureFiles) {
1960                 loadSignatureList(fname, signatures);
1961             }
1962         } else {
1963             signatures.insert(signatures.end(), g_signatures.begin(),
1964                               g_signatures.end());
1965         }
1966 
1967         exprMap = limitToSignatures(exprMap, signatures);
1968     }
1969 
1970     printSettings(corporaFiles, plat.get());
1971 
1972     if (exprMap.empty()) {
1973         cout << "Warning: no signatures to scan. Exiting." << endl;
1974         exit(0);
1975     }
1976 
1977     if (!no_signal_handler) {
1978         installSignalHandler();
1979     }
1980 
1981     if (saveDatabases || loadDatabases) {
1982         struct stat st;
1983         if (stat(serializePath.c_str(), &st) < 0) {
1984             cout << "Unable to stat serialize path '" <<  serializePath
1985                  << "': " << strerror(errno) << endl;
1986             exit_with_fail();
1987         }
1988     }
1989 
1990     // If we're saving corpora out, truncate the output file.
1991     if (saveCorpora) {
1992         corporaOut = ue2::make_unique<CorpusWriter>(saveCorporaFile);
1993     }
1994 
1995     GroundTruth::global_prep();
1996 
1997     auto corpora_source = buildCorpora(corporaFiles, exprMap);
1998 
1999     if (!g_verbose && g_quiet < 2) {
2000         cout << "Only failed tests are displayed." << endl;
2001     }
2002 
2003     SimpleTimer timer;
2004     bool success = runTests(*corpora_source, exprMap, plat.get(), grey);
2005     cout << "\nTotal elapsed time: " << timer.elapsed() << " secs." << endl;
2006     exprMap.clear();
2007 
2008     if (!success) {
2009         exit_with_fail();
2010     }
2011 
2012     return 0;
2013 }
2014