/* * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Intel Corporation nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/

#include "config.h"

#include "BoundedQueue.h"
#include "DatabaseProxy.h"
#include "FileCorpora.h"
#include "GraphTruth.h"
#include "GroundTruth.h"
#include "NfaGeneratedCorpora.h"
#include "Thread.h"
#include "UltimateTruth.h"
#include "args.h"
#include "common.h"
#include "cross_compile.h"
#include "expressions.h"
#include "limit.h"
#include "ng_corpus_properties.h"
#include "sig.h"
#include "simple_timer.h"
#include "util/expression_path.h"
#include "util/string_util.h"

#include "grey.h"
#include "hs.h"
#include "parser/utf8_validate.h"
#include "ue2common.h"
#include "util/container.h"
#include "util/make_unique.h"

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

using namespace std;
using namespace ue2;

// Global run settings. The values below are the built-in defaults;
// setDefaults() adjusts a few of them at startup.
// NOTE(review): presumably the rest are overridden by command-line parsing
// elsewhere -- confirm against the argument-handling code.
unsigned int numThreads = 1;
unsigned int numScannerThreads = 1;
unsigned int numGeneratorThreads = 1;
enum ColliderMode colliderMode = MODE_BLOCK;
bool echo_matches = false;
int g_quiet = 0;
bool g_verbose = false;
bool g_allSignatures = false;
string g_exprPath;
vector g_signatureFiles;
string g_cmdline;
bool g_ue2CompileAll = false;
unsigned g_streamBlocks = 0;
unsigned long long g_streamOffset = 0;
unsigned multicompile_bands = 0;
vector g_signatures;
unsigned long int g_matchLimit = DEFAULT_PCRE_MATCH_LIMIT;
unsigned long int g_matchLimitRecursion = DEFAULT_PCRE_MATCH_RECURSION_LIMIT;
string g_corpora_prefix;
string g_corpora_suffix;
size_t g_memoryLimit = 1000; // megabytes per thread
unsigned int somFlags = 0;
bool loadDatabases = false;
bool saveDatabases = false;
bool saveCorpora = false;
string saveCorporaFile;
string serializePath;
bool force_utf8 = false;
int force_prefilter = 0;
int no_groups = 0;
unsigned somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE;
unsigned limit_matches = 0;
unsigned randomSeed = 0;
bool use_random_alignment = false;
bool use_PCRE = true;
bool use_NFA = true;
bool use_UE2 = true;
bool use_copy_scratch = false;
bool use_copy_stream = false;
bool use_mangle_scratch = false;
bool use_compress_expand = false;
bool use_compress_reset_expand = false;
bool use_literal_api = false;
int abort_on_failure = 0;
int no_signal_handler = 0;
size_t max_scan_queue_len = 25000;
size_t max_generator_queue_len = 25000;
bool force_edit_distance = false;
unsigned edit_distance = 0;
CorpusProperties corpus_gen_prop;

// Semi constants
// The half-open range [min_ue2_align, max_ue2_align) is the set of alignments
// that the per-alignment loops in this file iterate over.
unsigned min_ue2_align = 0;
unsigned max_ue2_align = MAX_MAX_UE2_ALIGN;

// Enables the "#ifdef DEDUPE_MATCHES" sections in this file, which report
// duplicate UE2 matches and count them as failures.
#define DEDUPE_MATCHES

// Returns the hardware thread count reported by the standard library, or 1
// when the library reports 0.
static unsigned countCores() {
    unsigned count = std::thread::hardware_concurrency();
    return count ? count : 1;
}

// Detect the Address Sanitizer with either GCC or Clang.
#if defined(__SANITIZE_ADDRESS__)
# define BUILT_WITH_ASAN
#elif defined(__has_feature)
# if __has_feature(address_sanitizer)
# define BUILT_WITH_ASAN
# endif
#endif

// Set the default params that can be overridden with commandline args
static void setDefaults() {
    // Seed random number generator for corpora
    randomSeed = time(nullptr);

    // Overcommit since we have generators and scanners running.
    numThreads = countCores() * 2;

#ifdef BUILT_WITH_ASAN
    // Under ASan the memory limit and the signal handler are both switched
    // off by default, and the user is told so.
    cout << "NOTE: Built with AddressSanitizer.\n"
         << "Defaulting to no memory limit and no signal handler.\n"
         << endl;
    g_memoryLimit = 0;
    no_signal_handler = 1;
#endif
}

// Prints the failing command line, then terminates the process: via abort()
// when abort_on_failure is set, otherwise via exit(1).
static void exit_with_fail(void) {
    cout << "Failing cmdline was:\n " << g_cmdline << endl;
    if (abort_on_failure) {
        cout << "Calling abort()" << endl;
        abort();
    }
    exit(1);
}

namespace /* anonymous */ {

// For saving corpora out if the -w flag is specified. Note that we need a
// mutex to serialise writes from different threads.
class CorpusWriter { public: explicit CorpusWriter(const string &filename) : out(filename.c_str(), ios_base::trunc) {} void write(const string &str) { std::lock_guard lock(mutex); out << str << flush; } private: ofstream out; std::mutex mutex; }; unique_ptr corporaOut = nullptr; // Encapsulates all of the data reported from a test struct TestSummary { unsigned totalCorpora = 0; unsigned totalExpressions = 0; unsigned failCorpora = 0; unsigned failPcreCompile = 0; unsigned failNGCompile = 0; unsigned failUe2Compile = 0; unsigned failCompileDifference = 0; // failed in pcre but not ue2 unsigned failPcreScan = 0; unsigned failNGScan = 0; unsigned failUe2Scan = 0; unsigned failDiff = 0; unsigned failNoGroundTruth = 0; set failIds; set nogtIds; // true if we've got a failure bool hasFailure() const { return failDiff != 0 || !failIds.empty() || failCompileDifference != 0; } void merge(const TestSummary &a) { totalCorpora += a.totalCorpora; totalExpressions += a.totalExpressions; failCorpora += a.failCorpora; failPcreCompile += a.failPcreCompile; failNGCompile += a.failNGCompile; failUe2Compile += a.failUe2Compile; failCompileDifference += a.failCompileDifference; failPcreScan += a.failPcreScan; failNGScan += a.failNGScan; failUe2Scan += a.failUe2Scan; failDiff += a.failDiff; failNoGroundTruth += a.failNoGroundTruth; failIds.insert(begin(a.failIds), end(a.failIds)); nogtIds.insert(begin(a.nogtIds), end(a.nogtIds)); } }; enum TestResult { TEST_NO_GROUND_TRUTH, TEST_PASSED, TEST_SKIPPED, TEST_FAILED_COMPILE, TEST_FAILED }; struct TestUnit { shared_ptr pcre; // libpcre bytecode shared_ptr cngi; // NFA graph info (compilation is deferred) shared_ptr ue2; // ue2 bytecode Corpus corpus; // a local copy, as we may modify it unsigned id; // expression id unsigned corpus_id; // corpus id bool highlander; // single match flag bool prefilter; // prefilter flag bool som; // start of match flag bool multi; // if false, we're in single mode. 
bool utf8; // at least one of our patterns is utf8 enum TestResult result; TestUnit(unsigned sig_id, unsigned c_id, const Corpus &c, shared_ptr pcre_in, shared_ptr cngi_in, shared_ptr ue2_in, bool multi_in, bool utf8_in, bool highlander_in, bool prefilter_in, bool som_in) : pcre(pcre_in), cngi(cngi_in), ue2(ue2_in), corpus(c), id(sig_id), corpus_id(c_id), highlander(highlander_in), prefilter(prefilter_in), som(som_in), multi(multi_in), utf8(utf8_in), result(TEST_NO_GROUND_TRUTH) {} }; } // namespace // For ease of printing match sets static std::ostream &operator<<(std::ostream &os, const set &v) { auto vi = v.begin(), ve = v.end(); while (vi != ve) { // match offsets os << '(' << vi->from << ',' << vi->to << ')'; if (++vi != ve) { os << ", "; } } return os; } static void printCorpus(ostream &out, const Corpus &corpus) { // Print the offending corpus string corpus_data(corpus.data.begin() + g_corpora_prefix.size(), corpus.data.end() - g_corpora_suffix.size()); bool trimmed = false; if (corpus_data.size() > 1000) { corpus_data.resize(1000); trimmed = true; } out << " Corpus data: '" << printable(corpus_data) << "'"; if (trimmed) { out << " ..."; } out << "\n"; } static void printGroundTruthDifference(ostream &out, const ExpressionMap &exprMap, const TestUnit &unit, const ResultSet &pcre_results, const ResultSet &ngw_results) { assert(contains(exprMap, unit.id)); // Print the expression itself out << " Expression: '" << exprMap.at(unit.id) << "'\n"; printCorpus(out, unit.corpus); out << " PCRE matches: " << pcre_results.matches << "\n"; out << " NFA matches: " << ngw_results.matches << "\n"; vector diff; set_difference(pcre_results.matches.begin(), pcre_results.matches.end(), ngw_results.matches.begin(), ngw_results.matches.end(), back_inserter(diff)); for (const auto &match : diff) { out << " PCRE only: match (" << match.from << "," << match.to << ")\n"; } diff.clear(); set_difference(ngw_results.matches.begin(), ngw_results.matches.end(), 
pcre_results.matches.begin(), pcre_results.matches.end(), back_inserter(diff)); for (const auto &match : diff) { out << " NFA only: match (" << match.from << "," << match.to << ")\n"; } out.flush(); } // Report the difference information when a pattern causes different matches in // our engines. static void printDifference(ostream &out, const ExpressionMap &exprMap, const TestUnit &unit, const ResultSet >_results, const vector &ue2_results, const vector &pass) { assert(contains(exprMap, unit.id)); // Print the expression itself out << " Expression: '" << exprMap.at(unit.id) << "'\n"; printCorpus(out, unit.corpus); out << " " << gt_results.src << " matches: " << gt_results.matches << endl; for (u32 align = min_ue2_align; align < max_ue2_align; align++) { if (pass[align]) { continue; } u32 align_in = align; out << " UE2 (" << align; while (align + 1 < max_ue2_align) { if (pass[align + 1] || ue2_results[align] != ue2_results[align + 1]) { break; } align++; } if (align != align_in) { out << " - " << align; } out << ") matches: " << ue2_results[align].matches; out << endl; vector only; // Print matches only returned by ground truth set_difference(gt_results.matches.begin(), gt_results.matches.end(), ue2_results[align].matches.begin(), ue2_results[align].matches.end(), back_inserter(only)); for (const auto &match : only) { out << " " << gt_results.src << " only: match (" << match.from << "," << match.to << ')' << endl; } // Print matches only returned by UE2 only.clear(); set_difference(ue2_results[align].matches.begin(), ue2_results[align].matches.end(), gt_results.matches.begin(), gt_results.matches.end(), back_inserter(only)); for (const auto &match : only) { out << " UE2 only: match (" << match.from << "," << match.to << ')' << endl; } #ifdef DEDUPE_MATCHES for (const auto &match : ue2_results[align].dupe_matches) { out << " UE2 dupe: match (" << match.from << "," << match.to << ')' << endl; } #endif if (ue2_results[align].uoom) { out << " *** UE2 produced matches 
out of order" << endl; } if (ue2_results[align].match_after_halt) { out << " *** UE2 produced matches after termination" << endl; } if (ue2_results[align].invalid_id) { out << " *** UE2 produced matches for invalid ids" << endl; } } } static void printMode(void) { if (!g_ue2CompileAll) { cout << "Single/"; } else if (!multicompile_bands) { cout << "Multi/"; } else { cout << "Multi-" << multicompile_bands << "/"; } switch (colliderMode) { case MODE_BLOCK: cout << "Block"; break; case MODE_STREAMING: cout << "Streaming-" << g_streamBlocks; if (g_streamOffset) { cout << " offset " << g_streamOffset; } if (use_copy_stream) { cout << " [copy stream]"; } if (use_compress_expand) { cout << " [compress]"; } if (use_compress_reset_expand) { cout << " [compress+reset]"; } break; case MODE_VECTORED: cout << "Vectored-" << g_streamBlocks; break; case MODE_HYBRID: cout << "Hybrid"; break; } if (use_copy_scratch) { cout << " [copy scratch]"; } if (use_mangle_scratch) { cout << " [mangle]"; } cout << endl; } static void printSummaryV(const TestSummary &sum) { cout << endl; cout << "Summary:" << endl; cout << "Mode: "; printMode(); cout << "=========" << endl; cout << "Expressions processed: " << sum.totalExpressions << endl; cout << "Corpora processed: " << sum.totalCorpora << endl; cout << "Expressions with failures: " << sum.failIds.size() << endl; cout << " Corpora generation failures: " << sum.failCorpora << endl; cout << " Compilation failures: "; cout << "pcre:" << sum.failPcreCompile << ", "; cout << "ng:" << sum.failNGCompile << ", "; cout << "ue2:" << sum.failUe2Compile << endl; cout << " Matching failures: "; cout << "pcre:" << sum.failPcreScan << ", "; cout << "ng:" << sum.failNGScan << ", "; cout << "ue2:" << sum.failUe2Scan << endl; cout << " Match differences: " << sum.failIds.size() << endl; cout << " No ground truth: " << sum.nogtIds.size() << endl; cout << "Total match differences: " << sum.failDiff << endl; } static void printSummaryQ(const TestSummary &sum) { 
cout << "Summary: "; printMode(); cout << "Processed: " << sum.totalExpressions << " expressions, " << sum.totalCorpora << " corpora" << endl; cout << "Failures: " << sum.failIds.size() << " (corpora: " << sum.failCorpora << "; compile: "; cout << "pcre:" << sum.failPcreCompile << ", "; cout << "ng:" << sum.failNGCompile << ", "; cout << "ue2:" << sum.failUe2Compile << "; match: "; cout << "pcre:" << sum.failPcreScan << ", "; cout << "ng:" << sum.failNGScan << ", "; cout << "ue2:" << sum.failUe2Scan << ")" << endl; cout << "Differences: " << sum.failIds.size() << " expressions, " << sum.failDiff << " total" << endl; cout << "No ground truth: " << sum.nogtIds.size() << " expressions" << endl; } static void printSummary(const TestSummary &sum) { if (g_quiet > 1) { printSummaryQ(sum); } else { printSummaryV(sum); } } // Returns true if this Highlander mode test succeeded. static bool checkSingleMatch(const ResultSet &ground_truth, const ResultSet &ue2) { // In Highlander (single-match) mode, UE2 must return only one of the // matches returned by PCRE/GraphTruth. It need not be the earliest one. if (ground_truth.matches.empty()) { return ue2.matches.empty(); } else if (ue2.matches.size() != 1) { return false; } else { return contains(ground_truth.matches, *ue2.matches.begin()); } } // Returns true if this prefiltering mode test succeeded. static bool checkPrefilterMatch(const ResultSet &ground_truth, const ResultSet &ue2, bool highlander) { if (highlander) { // Highlander + prefilter is tricky. Best we can do is say that if PCRE // returns matches, UE2 must return a match, though it may not be one // of the ones returned by PCRE (it may be an earlier match). if (!ground_truth.matches.empty()) { return ue2.matches.size() == 1; } // We can't verify anything more. return true; } else if (!limit_matches || ue2.matches.size() < limit_matches) { // In prefilter mode, every match found by PCRE must be found by UE2, // but the UE2 set may be a superset of the PCRE match set. 
return std::includes(ue2.matches.begin(), ue2.matches.end(), ground_truth.matches.begin(), ground_truth.matches.end()); } // Otherwise, we've hit our match limit. Prefilter mode is quite difficult // to verify in this case, so we just verify that "something happened". return true; } static ResultSet makeEndOfMatchOnly(const ResultSet &rs) { ResultSet out(rs.src); for (const auto &match : rs.matches) { out.addMatch(0, match.to); } return out; } static bool checkMultiMatch(const ResultSet &ground_truth, const ResultSet &ue2) { // If we had out-of-order matches or matches after termination, we have a // bug! if (ue2.uoom || ue2.match_after_halt || ue2.invalid_id) { return false; } // If we have more UE2 matches than our limit, we have a bug! if (limit_matches && ue2.matches.size() > limit_matches) { return false; } // If we have more UE2 matches than PCRE matches, we have a bug! if (ue2.matches.size() > ground_truth.matches.size()) { return false; } // If we've got fewer matches than our limit to test, then the match sets // must be identical. if (!limit_matches || ground_truth.matches.size() < limit_matches) { return ground_truth == ue2; } // We're in limit_matches mode _and_ we have hit the limit. Every match in // 'ue2' must be in 'pcre'. (We can't just trim pcre and do an equality // test as matches may come out of UE2 a little out of order.) // In streaming mode, the limit may mean that we get a different SOM from // the leftmost one. So we compare only end offsets. if (colliderMode == MODE_STREAMING || colliderMode == MODE_VECTORED) { ResultSet gt_eom = makeEndOfMatchOnly(ground_truth); ResultSet ue2_eom = makeEndOfMatchOnly(ue2); return std::includes(gt_eom.matches.begin(), gt_eom.matches.end(), ue2_eom.matches.begin(), ue2_eom.matches.end()); } return std::includes(ground_truth.matches.begin(), ground_truth.matches.end(), ue2.matches.begin(), ue2.matches.end()); } // Check results, returns true if there has any failure. 
static bool checkTestResults(ostream &out, TestSummary &summary, const ExpressionMap &exprMap, TestUnit &unit, const ResultSet >_results, const vector &ue2_results) { bool failed = false; bool any_fail = false; vector pass(max_ue2_align, false); for (unsigned align = min_ue2_align; align != max_ue2_align; ++align) { if (unit.prefilter) { failed = !checkPrefilterMatch(gt_results, ue2_results[align], unit.highlander); } else if (unit.highlander) { failed = !checkSingleMatch(gt_results, ue2_results[align]); } else { // In non-Highlander mode, the two result sets MUST be equal // don't check PCRE if the scan didn't succeed failed = !checkMultiMatch(gt_results, ue2_results[align]); } #ifdef DEDUPE_MATCHES if (!failed) { failed |= !ue2_results[align].dupe_matches.empty(); } #endif pass[align] = !failed; any_fail |= failed; summary.failDiff += failed ? 1 : 0; if (g_verbose) { if (failed) { out << "FAILED: id " << unit.id << ", alignment " << align << ", corpus " << unit.corpus_id << ", results differ" << endl; } else { out << "PASSED: id " << unit.id << ", alignment " << align << ", corpus " << unit.corpus_id << " (matched "<< gt_results.src << ":" << gt_results.matches.size() << ", ue2:" << ue2_results[align].matches.size() << ")" << endl; } } } if (!any_fail) { return false; } if (!g_verbose) { out << "FAILED: id " << unit.id << ", alignment"; for (unsigned align = min_ue2_align; align != max_ue2_align; ++align) { if (!pass[align]) { out << " " << align; if (align + 1 < max_ue2_align && !pass[align + 1]) { while (align + 1 < max_ue2_align && !pass[align + 1]) { align++; } out << "-" << align; } } } out << ", corpus " << unit.corpus_id << ", results differ" << endl; } printDifference(out, exprMap, unit, gt_results, ue2_results, pass); return true; } // Construct a UE2 database, taking care of loading/saving to disk when // appropriate static shared_ptr constructDatabase(const set &ids, const UltimateTruth &ultimate) { assert(!ids.empty()); if (loadDatabases) { string 
filename = ultimate.dbFilename(ids); shared_ptr db = ultimate.loadDatabase(filename, ids); if (!db) { if (!g_quiet) { cout << "FAILED: could not load database " << filename << endl; } return nullptr; } return make_shared(db); } shared_ptr ue2 = make_shared(ids); try { // If we're not runnable (i.e. we're cross-compiling), let's at least // try to build the database. if (!ultimate.runnable()) { shared_ptr db = ue2->get(ultimate); assert(db); // throws otherwise } // Compile and save if we've been told to. if (saveDatabases) { string filename = ultimate.dbFilename(ids); if (!ultimate.saveDatabase(*(ue2->get(ultimate)), filename.c_str())) { cout << "FAILED: could not save database to file: " << filename << endl; } } } catch (const CompileFailed &fail) { if (!g_quiet) { cout << "FAILED: ue2 compile failed for " << *ids.begin() << ": " << fail.error << endl; } // Return null database to indicate failure. ue2 = nullptr; } return ue2; } static bool getGraphTruth(ostream &out, CNGInfo &cngi, GraphTruth &graph, TestUnit &unit, ResultSet &ngw_results, TestSummary &summary, const string &expression) { debug_stage = STAGE_GRAPH_RUN; // Skip patterns we've previously marked as bad. if (cngi.is_bad()) { summary.failNGScan++; return false; } // If we already have match information for this corpus, we don't need to // run PCRE at all. At the moment our on-disk format for corpora with match // information only includes the end-of-match offset, so we only use these // in non-som modes. If edit distance is forced, all bets are off so we // ignore this as well. 
if (!g_streamOffset && unit.corpus.hasMatches && !force_utf8 && !cngi.som && !force_edit_distance) { if (g_verbose) { out << "Using corpus match set rather than NFA graph" << endl; } ngw_results = ResultSet(unit.corpus.matches, RESULT_FROM_GRAPH); } else { // compile the actual graph const CompiledNG *cng; try { debug_stage = STAGE_GRAPH_COMPILE; cng = cngi.get(); debug_stage = STAGE_UNDEFINED; } catch (const NGCompileFailure &err) { debug_stage = STAGE_UNDEFINED; summary.failNGCompile++; summary.failNGScan++; cngi.mark_bad(); if (!g_quiet) { cout << "FAILED: id " << unit.id << ", NFA graph compile failed (" << err.msg << ")" << endl; } return false; } debug_stage = STAGE_GRAPH_RUN; // Run NFA graph and collect match information. string error; assert(cng); if (!graph.run(unit.id, *cng, cngi, unit.corpus.data, ngw_results, error)) { if (!g_quiet) { out << "FAILED: id " << unit.id << ", NFA graph scan failed: " << error << "\n" << " Expression: '" << expression << "'\n" << " Corpus data: '" << printable(unit.corpus.data) << "'\n" << " (note: marking bad, skipping subsequent tests)" << endl; } summary.failNGScan++; cngi.mark_bad(); return false; } } return true; } static bool getGroundTruth(ostream &out, CompiledPcre &cpcre, GroundTruth &ground, TestUnit &unit, ResultSet &pcre_results, TestSummary &summary) { debug_stage = STAGE_PCRE_RUN; // Skip patterns we've previously marked as bad. if (cpcre.is_bad()) { summary.failPcreScan++; return false; } // If we already have match information for this corpus, we don't need to // run PCRE at all. At the moment our on-disk format for corpora with match // information only includes the end-of-match offset, so we only use these // in non-som modes. Also, we can't trust corpus matches if there was an // edit distance requested for all patterns. 
if (!g_streamOffset && unit.corpus.hasMatches && !force_utf8 && !cpcre.som && !force_edit_distance) { if (g_verbose) { out << "Using corpus match set rather than PCRE" << endl; } pcre_results = ResultSet(unit.corpus.matches, RESULT_FROM_PCRE); } else { // Run PCRE and collect match information. string error; if (!ground.run(unit.id, cpcre, unit.corpus.data, pcre_results, error)) { if (!g_quiet) { out << "FAILED: id " << unit.id << ", libpcre scan failed: " << error << "\n" << " Expression: '" << cpcre.expression << "'\n" << " Corpus data: '" << printable(unit.corpus.data) << "'\n" << " (note: marking PCRE bad, skipping subsequent tests)" << endl; } summary.failPcreScan++; cpcre.mark_bad(); return false; } } return true; } static void writeCorpus(unsigned id, const Corpus &corpus, const ResultSet &results) { assert(corporaOut); ostringstream oss; oss << id << "=\"" << printable(corpus.data) << "\": "; auto vi = results.matches.begin(); auto ve = results.matches.end(); // Print match end offsets only. while (vi != ve) { oss << vi->to; if (++vi != ve) { oss << ","; } } oss << "\n"; corporaOut->write(oss.str()); } static void runTestUnit(ostream &out, GroundTruth &ground, GraphTruth &graph, UltimateTruth &ultimate, TestUnit &unit, TestSummary &summary, const ExpressionMap &exprMap) { assert(use_UE2); Corpus &corpus = unit.corpus; shared_ptr db; if (use_UE2) { // Acquire UE2 database. debug_stage = STAGE_UE2_COMPILE; try { db = unit.ue2->get(ultimate); } catch (const CompileFailed &fail) { summary.failUe2Compile++; if (!g_quiet) { out << "FAILED: ue2 compile failed for " << unit.id << ": " << fail.error << endl; unit.result = TEST_FAILED_COMPILE; debug_stage = STAGE_UNDEFINED; return; } } debug_stage = STAGE_UNDEFINED; if (!db) { // Database previously failed compilation. 
unit.result = TEST_SKIPPED; return; } } // If the user has specified that they want prefix/suffix data added to // their corpora, we do it here; this is as local as possible to the // test, so we don't keep piles of HUGE corpora hanging around. if (!g_corpora_prefix.empty()) { corpus.data.insert(0, g_corpora_prefix); corpus.hasMatches = false; } if (!g_corpora_suffix.empty()) { corpus.data.append(g_corpora_suffix); corpus.hasMatches = false; } ResultSet gt_results(RESULT_FROM_PCRE); vector ue2_results(max_ue2_align, ResultSet(RESULT_FROM_UE2)); bool gt_done = false; // run PCRE test if enabled and if compile succeeded if (unit.pcre && use_PCRE) { gt_done = getGroundTruth(out, *unit.pcre, ground, unit, gt_results, summary); } // run NFA if PCRE failed (or wasn't run), or if we don't run UE2 if (unit.cngi && (use_NFA && !gt_done)) { gt_done = getGraphTruth(out, *unit.cngi, graph, unit, gt_results, summary, exprMap.find(unit.id)->second); } // both ground truth methods either failed or didn't run if (!gt_done) { unit.result = TEST_NO_GROUND_TRUTH; return; } // Write out corpora if we've been told to if (saveCorpora) { writeCorpus(unit.id, unit.corpus, gt_results); } debug_stage = STAGE_UE2_RUN; for (unsigned int align = min_ue2_align; align != max_ue2_align; ++align) { bool ok = ultimate.run(unit.id, db, corpus.data, !unit.multi, align, ue2_results[align]); if (!ok) { if (!g_quiet) { out << "FAILED: id " << unit.id << ", ue2 scan at alignment " << align << " failed" << endl; } unit.result = TEST_FAILED; debug_stage = STAGE_UNDEFINED; return; } } // if we're using UE2, check all the different results modes if (checkTestResults(out, summary, exprMap, unit, gt_results, ue2_results)) { unit.result = TEST_FAILED; } else { unit.result = TEST_PASSED; } debug_stage = STAGE_UNDEFINED; } /* Used for testing the graph truth agains PCE */ static void runGroundCompTestUnit(ostream &out, GroundTruth &ground, GraphTruth &graph, TestUnit &unit, TestSummary &summary, const 
ExpressionMap &exprMap) { assert(!use_UE2); assert(use_PCRE); assert(use_NFA); Corpus &corpus = unit.corpus; // If the user has specified that they want prefix/suffix data added to // their corpora, we do it here; this is as local as possible to the // test, so we don't keep piles of HUGE corpora hanging around. if (!g_corpora_prefix.empty()) { corpus.data.insert(0, g_corpora_prefix); corpus.hasMatches = false; } if (!g_corpora_suffix.empty()) { corpus.data.append(g_corpora_suffix); corpus.hasMatches = false; } ResultSet pcre_results(RESULT_FROM_PCRE); ResultSet ngw_results(RESULT_FROM_GRAPH); bool pcreResult = false; bool graphResult = false; if (unit.pcre) { pcreResult = getGroundTruth(out, *unit.pcre, ground, unit, pcre_results, summary); } if (unit.cngi) { graphResult = getGraphTruth(out, *unit.cngi, graph, unit, ngw_results, summary, exprMap.find(unit.id)->second); } // no ground truth found either NFA or PCRE failed if (!pcreResult || !graphResult) { unit.result = TEST_NO_GROUND_TRUTH; return; } // Write out corpora if we've been told to if (saveCorpora) { writeCorpus(unit.id, unit.corpus, pcre_results); } if (pcre_results.matches != ngw_results.matches) { unit.result = TEST_FAILED; out << "FAILED: id " << unit.id << ", corpus " << unit.corpus_id << ", results differ" << endl; printGroundTruthDifference(out, exprMap, unit, pcre_results, ngw_results); } else { unit.result = TEST_PASSED; if (g_verbose) { out << "PASSED: id " << unit.id << ", corpus " << unit.corpus_id << " (matched pcre:" << pcre_results.matches.size() << ", matched ng:" << ngw_results.matches.size() << ")" << endl; } } debug_stage = STAGE_UNDEFINED; } static void addCorporaToQueue(ostream &out, BoundedQueue &testq, unsigned id, CorporaSource &corpora, TestSummary &summary, shared_ptr cpcre, shared_ptr cngi, shared_ptr ue2, bool multi, bool utf8) { // build corpora vector c; try { corpora.generate(id, c); } catch (CorpusFailure &err) { if (!g_quiet) { out << "FAILED: id " << id << ", corpora 
failure: " << err.message << endl; } summary.failCorpora++; return; } const bool som = cpcre ? cpcre->som : cngi->som; const bool prefilter = cpcre ? cpcre->prefilter : cngi->prefilter; const bool highlander = cpcre ? cpcre->highlander : cngi->highlander; // If we're in UTF-8 mode and the corpus isn't valid UTF-8, skip it: // Hyperscan's behaviour when scanning invalid UTF-8 data in UTF-8 mode // is undefined. if (utf8) { auto is_invalid_utf8 = [](const Corpus &corpus) { return !isValidUtf8(corpus.data.c_str(), corpus.data.size()); }; c.erase(remove_if(begin(c), end(c), is_invalid_utf8), end(c)); } // Collect together corpora units in a container so that we don't have to // repeatedly lock the queue. vector> tests; tests.reserve(c.size()); size_t corpus_id = 0; for (const Corpus &corpus : c) { tests.push_back(ue2::make_unique(id, corpus_id, corpus, cpcre, cngi, ue2, multi, utf8, highlander, prefilter, som)); corpus_id++; } testq.push(begin(tests), end(tests)); } namespace /* anonymous */ { // A subclass of Thread that stores its own output in a stringstream, flushing // it to cout when necessary. class OutputThread : public Thread { public: OutputThread(size_t id) : Thread(id) {} ~OutputThread() override { flush_output(); } protected: void flush_output() { const string &s = out.str(); if (!s.empty()) { cout << s; out.str(""); // make empty } } // Output stream, flushed to cout after every test unit. stringstream out; }; class ScanThread : public OutputThread { public: ScanThread(size_t id, BoundedQueue &testq, const ExpressionMap &e, const hs_platform_info *plat, const Grey &grey) : OutputThread(id), q(testq), ground(out, e, g_matchLimit, g_matchLimitRecursion), graph(out, e), ultimate(out, e, plat, grey, g_streamBlocks), exprMap(e) {} void run() override { DEBUG_PRINTF("thread %zu running\n", thread_id); for (;;) { const auto unit = q.pop(thread_id); if (!unit) { // Sentinel value, indicates that we have run out of units to // process. 
DEBUG_PRINTF("thread %zu stopped\n", thread_id);
                break;
            }

            assert(unit);
            assert(exprMap.find(unit->id) != exprMap.end());

            // Debug information is stored in TLS and (hopefully) printed out in
            // the event of a crash.
            debug_expr = unit->id;
            debug_corpus = unit->corpus_id;
            debug_corpus_ptr = unit->corpus.data.c_str();
            debug_corpus_len = unit->corpus.data.size();
            debug_expr_ptr = exprMap.find(unit->id)->second.c_str();

            // With UE2 enabled the unit goes through runTestUnit(), which is
            // also handed `ultimate`; otherwise only runGroundCompTestUnit()
            // (ground and graph, no `ultimate`) is run.
            if (use_UE2) {
                runTestUnit(out, ground, graph, ultimate, *unit, summary,
                            exprMap);
            } else {
                runGroundCompTestUnit(out, ground, graph, *unit, summary,
                                      exprMap);
            }

            // Record the expression id in the matching summary set. Any other
            // result value leaves both id sets untouched.
            if (unit->result == TEST_NO_GROUND_TRUTH) {
                summary.nogtIds.insert(unit->id);
                // this is fine, continue
            } else if (unit->result == TEST_FAILED) {
                summary.failIds.insert(unit->id);
            }

            count++;
            summary.totalCorpora++;
            flush_output();
        }
    }

    // Read-only access to the summary accumulated by this thread.
    const TestSummary &getSummary() const { return summary; }

public:
    size_t count = 0; // number of units processed

private:
    // Shared queue.
    BoundedQueue &q;

    // Thread-local data.
    GroundTruth ground; // independent copy
    GraphTruth graph; // independent copy
    UltimateTruth ultimate; // independent copy
    TestSummary summary;

    // Constant shared data.
    const ExpressionMap &exprMap;
};

/** Represent a work item for the corpus generation threads. This contains
 * all information relating to an expression. The corpus generator will
 * generate corpora for this expression and enqueue work items representing
 * complete test cases for the scanning threads.
 *
 * NOTE(review): in this extract several declarations in this part of the file
 * appear without template argument lists (e.g. "unique_ptr cngi"); they are
 * reproduced as found -- confirm against the upstream file. */
struct CorpusGenUnit {
    // Takes ownership of cngi_in and pcre_in (both are moved into the
    // members); ue2_in is copied, so the caller keeps its own reference.
    CorpusGenUnit(unique_ptr cngi_in, unique_ptr pcre_in,
                  shared_ptr ue2_in, unsigned expr_id, bool multi_in,
                  bool utf8_in)
        : cngi(move(cngi_in)), pcre(move(pcre_in)), ue2(ue2_in), id(expr_id),
          multi(multi_in), utf8(utf8_in) {}

    unique_ptr cngi; // result of makeNGInfo(); may be null
    unique_ptr pcre; // result of makePcre(); may be null

    /* ue2 shared_ptr as in multicompile and banded compile it is shared amongst
     * various corpus units (with differing expression ids). */
    shared_ptr ue2;

    unsigned id; // expression id
    bool multi; // ue2 contains more than one expression
    bool utf8; // ue2 can be run against utf8 corpora
};

/** Corpus generator worker. Pops work items from corpq until it receives an
 * empty (null) item, and passes each one to addCorporaToQueue() together with
 * the scanner queue testq and this thread's own clone of the corpora source.
 */
class CorpusGenThread : public OutputThread {
public:
    // corpora_in is cloned, so each thread works on an independent copy.
    CorpusGenThread(size_t id, BoundedQueue &testq_in,
                    BoundedQueue &corpq_in,
                    const CorporaSource &corpora_in)
        : OutputThread(id), testq(testq_in), corpq(corpq_in),
          corpora(corpora_in.clone()) {}

    void run() override {
        DEBUG_PRINTF("thread %zu running\n", thread_id);

        for (;;) {
            auto c = corpq.pop(thread_id);
            if (!c) {
                // Null item: drainGenerators() pushes one per generator
                // thread as the stop sentinel.
                break;
            }

            // The compiled PCRE and NFA info are moved out of the work item;
            // the ue2 handle is passed on as-is.
            addCorporaToQueue(out, testq, c->id, *corpora, summary,
                              move(c->pcre), move(c->cngi), c->ue2, c->multi,
                              c->utf8);

            count++;
            flush_output();
        }
    }

    // Read-only access to the summary accumulated by this thread.
    const TestSummary &getSummary() const { return summary; }

public:
    size_t count = 0; // number of units processed

private:
    // Output queue, shared between threads.
    BoundedQueue &testq;

    // Input queue, shared between corpus generator threads.
    BoundedQueue &corpq;

    // Thread-local data.
    const unique_ptr corpora; // independent copy
    TestSummary summary;
};

} // namespace

/** Run the NFA graph preprocess step for expression \a id.
 *
 * Returns the result of graph.preprocess() on success. If preprocess throws
 * NGUnsupportedFailure it is retried with the second argument set to true; a
 * successful retry is marked bad via mark_bad() and still returned. If
 * preprocess fails with NGCompileFailure, nullptr is returned after checking
 * whether the UE2 database in \a ue2 is available: if it is, the mismatch is
 * reported and counted in summary.failCompileDifference.
 *
 * debug_stage is set to STAGE_GRAPH_PREPROCESS around each preprocess call and
 * reset to STAGE_UNDEFINED afterwards.
 */
static
unique_ptr makeNGInfo(const unsigned id, TestSummary &summary,
                      GraphTruth &graph, UltimateTruth &ultimate,
                      shared_ptr ue2) {
    string nfaErr;

    try {
        debug_stage = STAGE_GRAPH_PREPROCESS;
        auto cngi = graph.preprocess(id);
        debug_stage = STAGE_UNDEFINED;
        return cngi;
    } catch (const NGCompileFailure &err) {
        nfaErr = err.msg;
        debug_stage = STAGE_UNDEFINED;
        // fall through
    } catch (const NGUnsupportedFailure &err) {
        // unsupported error happens when the pattern appears to be valid, but
        // there are things that we don't yet support (e.g. SOM).
        // in this case, try again, suppressing the errors
        debug_stage = STAGE_UNDEFINED;
        summary.failNGCompile++;

        // try again and suppress unsupported errors
        try {
            debug_stage = STAGE_GRAPH_PREPROCESS;
            auto cngi = graph.preprocess(id, true);
            debug_stage = STAGE_UNDEFINED;

            // preprocess succeeded - that means the pattern itself is valid.
            // however, we can't use it, so we have to mark it as bad
            // only print the error in the following cases:
            // 1) if verbose is specified
            // 2) if we are not using UE2 and quiet is NOT specified
            if ((!use_UE2 && !g_quiet) || g_verbose) {
                cout << "FAILED: id " << id << ", NFA graph preprocess failed ("
                     << err.msg << ")" << endl;
            }
            cngi->mark_bad();
            return cngi;
        } catch (const NGCompileFailure &e) {
            // compile failed
            nfaErr = e.msg;
            debug_stage = STAGE_UNDEFINED;
            // fall through
            // NOTE(review): failNGCompile was already incremented at the top
            // of this handler and is incremented again on every path below,
            // so this path counts the expression twice -- confirm intended.
        }
    }

    // We should ensure that we also fail compilation with UE2, otherwise we
    // likely have a pattern support bug.
    try {
        auto db = ue2->get(ultimate);
        if (db) {
            // if we made it this far, that means UE2 compile succeeded while
            // NFA compile failed.
            cout << "FAILED: id " << id << ", NFA graph preprocess failed ("
                 << nfaErr << ") but UE2 compile succeeded." << endl;
            summary.failNGCompile++;
            summary.failCompileDifference++;
            return nullptr;
        }
        // If db is nullptr, we have previously failed compilation of this
        // database.
    } catch (const CompileFailed &) {
        // Everything's OK: compilation failed in Hyperscan as well. Fall
        // through.
    }

    summary.failNGCompile++;
    if (!g_quiet) {
        cout << "FAILED: id " << id << ", NFA graph preprocess failed ("
             << nfaErr << ")" << endl;
    }
    return nullptr;
}

/** Compile expression \a id with libpcre via ground.compile().
 *
 * Returns the compiled object on success and nullptr on any failure. A
 * SoftPcreCompileFailure is counted and only reported in verbose mode. For a
 * PcreCompileFailure the UE2 database in \a ue2 is consulted: if it is
 * available, the compile is retried through ground.compile(id, true), and a
 * second PcreCompileFailure is reported and counted in
 * summary.failCompileDifference.
 */
static
unique_ptr makePcre(const unsigned id, TestSummary &summary,
                    GroundTruth &ground, UltimateTruth &ultimate,
                    shared_ptr ue2) {
    string pcreErr;

    try {
        debug_stage = STAGE_PCRE_COMPILE;
        auto cpcre = ground.compile(id);
        debug_stage = STAGE_UNDEFINED;
        return cpcre;
    } catch (const SoftPcreCompileFailure &err) {
        debug_stage = STAGE_UNDEFINED;
        summary.failPcreCompile++;
        if (g_verbose) {
            cout << "FAILED: id " << id
                 << ", libpcre compile failed with soft error: " << err.msg
                 << endl;
        }
        return nullptr;
    } catch (const PcreCompileFailure &err) {
        debug_stage = STAGE_UNDEFINED;
        pcreErr = err.msg;
        // fall through
    }

    // We should ensure that we also fail compilation with UE2, otherwise we
    // likely have a pattern support bug.
    try {
        auto db = ue2->get(ultimate);
        if (db) {
            // OK, so now we have a situation: PCRE failed but UE2 succeeded.
            // There is one situation where this is legal: patterns beginning
            // with (*UTF8), which will throw an error due to the callback
            // wrapping we do for PCRE. We can check these by trying to compile
            // an "unwrapped" PCRE.
            ground.compile(id, true);
            // If we didn't throw, PCRE failed above but succeeded when not
            // wrapped in a callback, and UE2 succeeded. Not worth reporting,
            // fall through.
        }
    } catch (const CompileFailed &) {
        // Everything's OK: compilation failed in Hyperscan as well. Fall
        // through.
    } catch (const PcreCompileFailure &) {
        cout << "FAILED: id " << id << ", libpcre compile failed (" << pcreErr
             << ") but UE2 compile succeeded." << endl;
        summary.failPcreCompile++;
        summary.failCompileDifference++;
        return nullptr;
    }

    if (!g_quiet) {
        cout << "FAILED: id " << id << ", libpcre compile failed: " << pcreErr
             << endl;
    }

    summary.failPcreCompile++;
    return nullptr;
}

/** Stop the generator threads: push one null sentinel per thread onto
 * \a corpq, then join every thread and merge its summary into \a summary. */
static
void drainGenerators(BoundedQueue &corpq,
                     vector> &generators,
                     TestSummary &summary) {
    // Push a sentinel per thread.
    for (size_t i = 0; i < generators.size(); i++) {
        corpq.push(nullptr);
    }

    // Wait for workers to end and retrieve their results.
    for (auto &c : generators) {
        c->join();
        summary.merge(c->getSummary());
    }
}

// Note: In multi-pattern cases, utf8 is true if any pattern to be run against
// this corpus is in UTF-8 mode.
//
// Builds the work item for expression `id`. PCRE is compiled only when
// use_PCRE is set and the NFA info only when use_NFA is set; if neither
// produced a result, nullptr is returned and no work item is created.
static
unique_ptr makeCorpusGenUnit(unsigned id, TestSummary &summary,
                             GroundTruth &ground, GraphTruth &graph,
                             UltimateTruth &ultimate,
                             shared_ptr ue2, bool multi,
                             bool utf8) {
    unique_ptr cpcre;
    unique_ptr cngi;

    // compile PCRE bytecode
    if (use_PCRE) {
        cpcre = makePcre(id, summary, ground, ultimate, ue2);
    }
    if (use_NFA) {
        cngi = makeNGInfo(id, summary, graph, ultimate, ue2);
    }

    // if both compiles failed, skip the test
    if (!cpcre && !cngi) {
        return nullptr;
    }

    // Caller may already have set the UTF-8 property (in multi cases)
    // The PCRE result's flag is preferred; the NFA info's flag is used only
    // when there is no PCRE result.
    utf8 |= cpcre ? cpcre->utf8 : cngi->utf8;

    return ue2::make_unique(move(cngi), move(cpcre), ue2, id,
                            multi, utf8);
}

/** Returns true if any expression in [it, end) compiles under PCRE with its
 * utf8 flag set. Expressions that throw PcreCompileFailure are skipped.
 * debug_stage is STAGE_PCRE_COMPILE for the duration of the scan and is reset
 * to STAGE_UNDEFINED before returning. */
static
bool hasUTF8Pattern(GroundTruth &ground, ExpressionMap::const_iterator it,
                    ExpressionMap::const_iterator end) {
    /* note: we cannot just check the flags as utf8 can be enabled in the
     * pattern itself with (*UTF) */
    debug_stage = STAGE_PCRE_COMPILE;
    for (; it != end; ++it) {
        try {
            auto cpcre = ground.compile(it->first);
            assert(cpcre); // Would have thrown PcreCompileFailure otherwise.
            if (cpcre->utf8) {
                DEBUG_PRINTF("UTF8 mode\n");
                debug_stage = STAGE_UNDEFINED;
                return true;
            }
        } catch (const PcreCompileFailure &) {
            continue;
        }
    }
    debug_stage = STAGE_UNDEFINED;
    return false;
}

// Fill a test queue with single-pattern tests.
//
// One database is constructed per expression. A failed construction is
// counted in summary.failUe2Compile and the expression is skipped.
static
void buildSingle(BoundedQueue &corpq, TestSummary &summary,
                 GroundTruth &ground, GraphTruth &graph,
                 UltimateTruth &ultimate, const ExpressionMap &exprMap) {
    for (const auto &m : exprMap) {
        unsigned id = m.first;
        debug_expr = id;
        debug_expr_ptr = m.second.c_str();

        shared_ptr ue2 = constructDatabase({id}, ultimate);
        if (!ue2) {
            summary.failUe2Compile++;
            continue;
        }

        // if we're cross-compiling, then we don't bother building PCRE and
        // running scans, we're just going to output the database bytecode.
        if (!ultimate.runnable()) {
            continue;
        }

        bool multi = false;
        bool utf8 = false;
        auto u = makeCorpusGenUnit(id, summary, ground, graph, ultimate, ue2,
                                   multi, utf8);
        if (u) {
            corpq.push(move(u));
        }
    }
}

// Fill a test queue with multi-pattern tests of size N, where N is the band
// size specified on the command line.
static void buildBanded(BoundedQueue &corpq, TestSummary &summary, GroundTruth &ground, GraphTruth &graph, UltimateTruth &ultimate, const ExpressionMap &exprMap) { for (auto i = exprMap.begin(), e = exprMap.end(); i != e;) { debug_expr = i->first; debug_expr_ptr = i->second.c_str(); // Build a set of IDs in this band from the expression map set bandIds; if (g_verbose) { cout << "Building set:"; } ExpressionMap::const_iterator band_end = i; for (u32 j = 0; j < multicompile_bands && band_end != e; j++, ++band_end) { bandIds.insert(bandIds.end(), band_end->first); if (g_verbose) { cout << " " << band_end->first; } } if (g_verbose) { cout << endl; } // compile UE2 bytecode shared_ptr ue2 = constructDatabase(bandIds, ultimate); if (!ue2) { summary.failUe2Compile++; i = band_end; continue; } // if we're cross-compiling, then we don't bother building PCRE and // running scans, we're just going to output the database bytecode. if (!ultimate.runnable()) { i = band_end; continue; } bool utf8 = hasUTF8Pattern(ground, i, band_end); for (; i != band_end; ++i) { unsigned id = i->first; bool multi = true; auto u = makeCorpusGenUnit(id, summary, ground, graph, ultimate, ue2, multi, utf8); if (u) { corpq.push(move(u)); } } } } // Fill a test queue with multi-pattern tests. static void buildMulti(BoundedQueue &corpq, TestSummary &summary, GroundTruth &ground, GraphTruth &graph, UltimateTruth &ultimate, const ExpressionMap &exprMap) { // Build a set of all IDs from the expression map set idsAll; for (const auto &e : exprMap) { idsAll.insert(e.first); } // Compile in UE2 shared_ptr ue2 = constructDatabase(idsAll, ultimate); if (!ue2) { summary.failUe2Compile++; return; } // if we're cross-compiling, then we don't bother building PCRE and // running scans, we're just going to output the database bytecode. 
if (!ultimate.runnable()) { return; } bool utf8 = hasUTF8Pattern(ground, exprMap.begin(), exprMap.end()); for (const auto &m : exprMap) { unsigned id = m.first; debug_expr = id; debug_expr_ptr = m.second.c_str(); bool multi = true; auto u = makeCorpusGenUnit(id, summary, ground, graph, ultimate, ue2, multi, utf8); if (u) { corpq.push(move(u)); } } } static void generateTests(CorporaSource &corpora_src, const ExpressionMap &exprMap, TestSummary &summary, const hs_platform_info *plat, const Grey &grey, BoundedQueue &testq) { GraphTruth graph(cout, exprMap); GroundTruth ground(cout, exprMap, g_matchLimit, g_matchLimitRecursion); UltimateTruth ultimate(cout, exprMap, plat, grey, g_streamBlocks); // Construct corpus generator queue and threads. BoundedQueue corpq(numGeneratorThreads, max_generator_queue_len); vector> generators; for (size_t i = 0; i < numGeneratorThreads; i++) { auto c = make_unique(i, testq, corpq, corpora_src); c->start(); generators.push_back(move(c)); } if (g_ue2CompileAll && multicompile_bands) { printf("Running single-pattern/banded-multi-compile test for %zu " "expressions.\n\n", exprMap.size()); buildBanded(corpq, summary, ground, graph, ultimate, exprMap); } else if (g_ue2CompileAll) { printf("Running single-pattern/multi-compile test for %zu " "expressions.\n\n", exprMap.size()); buildMulti(corpq, summary, ground, graph, ultimate, exprMap); } else { printf("Running single-pattern/single-compile test for %zu " "expressions.\n\n", exprMap.size()); buildSingle(corpq, summary, ground, graph, ultimate, exprMap); } drainGenerators(corpq, generators, summary); } static void printSettingsV(const vector &corporaFiles, const hs_platform_info *platform) { cout << "hscollider: The Pattern Collider Mark II\n\n" << "Number of threads: " << numThreads << " (" << numScannerThreads << " scanner, " << numGeneratorThreads << " generator)\n" << "Expression path: " << g_exprPath << "\n" << "Signature files: "; if (g_signatureFiles.empty()) { cout << "none" << 
endl; } else { for (unsigned i = 0; i < g_signatureFiles.size(); i++) { string &fname = g_signatureFiles[i]; if (i > 0) { cout << string(20, ' '); } cout << fname << endl; } } cout << "Mode of operation: "; switch (colliderMode) { case MODE_BLOCK: cout << "block mode"; break; case MODE_STREAMING: cout << "streaming mode"; break; case MODE_VECTORED: cout << "vectored mode"; break; case MODE_HYBRID: cout << "hybrid mode"; break; } cout << endl; if (limit_matches) { cout << "Terminate scanning after " << limit_matches << " matches." << endl; } if (platform) { cout << "Cross-compile for: " << to_string(*platform) << endl; } if (loadDatabases) { cout << "Loading DBs from: " << serializePath << endl; } if (saveDatabases) { cout << "Saving DBs to: " << serializePath << endl; } if (colliderMode == MODE_STREAMING) { cout << "Stream block count: " << g_streamBlocks << endl; } if (colliderMode == MODE_VECTORED) { cout << "Vectored block count: " << g_streamBlocks << endl; } if (use_UE2) { if (max_ue2_align == min_ue2_align + 1) { cout << "UE2 scan alignment: " << min_ue2_align << endl; } else { cout << "UE2 scan alignment: [" << min_ue2_align << ", " << max_ue2_align << ")" << endl; } } if (!corporaFiles.empty()) { for (const auto &file : corporaFiles) { cout << "Corpora read from file: " << file << endl; } } else { cout << "Corpora properties: \n" << " random seed: " << corpus_gen_prop.getSeed() << "\n" << " percentages: " << corpus_gen_prop.percentMatch() << "% match, " << corpus_gen_prop.percentUnmatch() << "% unmatch, " << corpus_gen_prop.percentRandom() << "% random" << endl; // prefix and suffix info const min_max &prefixSpan = corpus_gen_prop.prefixRange; const min_max &suffixSpan = corpus_gen_prop.suffixRange; if (prefixSpan.max) { cout << " random prefix: " << prefixSpan.min << " to " << prefixSpan.max << endl; } else { cout << " random prefix: none" << endl; } if (suffixSpan.max) { cout << " random suffix: " << suffixSpan.min << " to " << suffixSpan.max << endl; } 
else { cout << " random suffix: none" << endl; } // cycle info pair cycleSpan = corpus_gen_prop.getCycleLimit(); cout << " follow cycles: " << cycleSpan.first << " to " << cycleSpan.second << " times" << endl; } if (saveCorpora) { cout << "Saving corpora to: " << saveCorporaFile << endl; } cout << endl; } static void printSettingsQ(const vector &corporaFiles, const hs_platform_info *platform) { cout << "Number of threads: " << numThreads << endl << "Expression path: " << g_exprPath << endl << "Signature files: "; if (g_signatureFiles.empty()) { cout << "none" << endl; } else { for (unsigned i = 0; i < g_signatureFiles.size(); i++) { string &fname = g_signatureFiles[i]; if (i > 0) { cout << string(20, ' '); } cout << fname << endl; } } cout << "Mode of operation: "; switch (colliderMode) { case MODE_BLOCK: cout << "block mode"; break; case MODE_STREAMING: cout << "streaming mode"; break; case MODE_VECTORED: cout << "vectored mode"; break; case MODE_HYBRID: cout << "hybrid mode"; break; } cout << endl; if (limit_matches) { cout << "Terminate scanning after " << limit_matches << " matches." 
<< endl; } if (platform) { cout << "Cross-compile for: " << to_string(*platform) << endl; } if (colliderMode == MODE_STREAMING) { cout << "Stream block count: " << g_streamBlocks << endl; } if (colliderMode == MODE_VECTORED) { cout << "Vectored block count: " << g_streamBlocks << endl; } if (max_ue2_align == min_ue2_align + 1) { cout << "UE2 scan alignment: " << min_ue2_align << endl; } else { cout << "UE2 scan alignment: [" << min_ue2_align << ", " << max_ue2_align << ")" << endl; } if (!g_corpora_prefix.empty()) { cout << "Prefix of " << g_corpora_prefix.size() << "bytes" << endl; } if (!g_corpora_suffix.empty()) { cout << "Suffix of " << g_corpora_suffix.size() << "bytes" << endl; } if (!corporaFiles.empty()) { cout << "Corpora: from file" << endl; } else { cout << "Corpora: -R " << corpus_gen_prop.getSeed() << " -p " << corpus_gen_prop.percentMatch() << "," << corpus_gen_prop.percentUnmatch() << "," << corpus_gen_prop.percentRandom(); // prefix and suffix info const min_max &prefixSpan = corpus_gen_prop.prefixRange; const min_max &suffixSpan = corpus_gen_prop.suffixRange; if (prefixSpan.max) { cout << " -P " << prefixSpan.min << "," << prefixSpan.max; } if (suffixSpan.max) { cout << " -S " << suffixSpan.min << "," << suffixSpan.max; } // cycle info pair cycleSpan = corpus_gen_prop.getCycleLimit(); cout << " -C " << cycleSpan.first << "," << cycleSpan.second; cout << endl; } } static void printSettings(const vector &c, const hs_platform_info *plat) { if (g_quiet > 1) { printSettingsQ(c, plat); } else { printSettingsV(c, plat); } } static unique_ptr buildCorpora(const vector &corporaFiles, const ExpressionMap &exprMap) { if (!corporaFiles.empty()) { auto c = ue2::make_unique(); for (const auto &file : corporaFiles) { if (!c->readFile(file)) { cout << "Error reading corpora from file: " << file << endl; exit_with_fail(); } } return move(c); /* move allows unique_ptr conversion */ } else { auto c = ue2::make_unique( exprMap, corpus_gen_prop, force_utf8, 
force_prefilter); return move(c); } } static bool needsQuotes(const char *s) { size_t len = strlen(s); if (len == 0) { return true; } #ifndef _WIN32 // don't confuse the correct isblank for the one in locale int (*blank)(int) = &std::isblank; if (find_if(s, s + len, blank) != s + len) { #else if (find_if(s, s + len, [](unsigned char c){ return std::isblank(c); }) != s + len) { #endif return true; } return false; } static void storeCmdline(int argc, char **argv) { for (int i = 0; i < argc; i++) { const char *s = argv[i]; if (needsQuotes(s)) { g_cmdline += '"'; g_cmdline += s; g_cmdline += '"'; } else { g_cmdline += s; } if (i != argc - 1) { g_cmdline += " "; } } } static bool runTests(CorporaSource &corpora_source, const ExpressionMap &exprMap, const hs_platform_info *plat, const Grey &grey) { TestSummary summary; summary.totalExpressions = exprMap.size(); BoundedQueue testq(numScannerThreads, max_scan_queue_len); // Start scanning threads. vector> scanners; for (size_t i = 0; i < numScannerThreads; i++) { auto s = ue2::make_unique(i, testq, exprMap, plat, grey); s->start(); scanners.push_back(move(s)); } generateTests(corpora_source, exprMap, summary, plat, grey, testq); // Push a sentinel per scanning thread to ensure that everyone finishes // work. for (size_t i = 0; i < scanners.size(); i++) { testq.push(nullptr); } // Wait for consumers to end and retrieve their results. for (size_t i = 0; i < scanners.size(); i++) { const auto &s = scanners[i]; s->join(); if (g_verbose) { cout << "Thread " << i << " processed " << s->count << " units." 
<< endl; } summary.merge(s->getSummary()); } printSummary(summary); return !summary.hasFailure(); } int HS_CDECL main(int argc, char *argv[]) { Grey grey; vector corporaFiles; for (int i = 1; i < argc - 1; i++) { if (!strcmp(argv[i], "-G")) { cout << "Override: " << argv[i + 1] << endl; } } setDefaults(); storeCmdline(argc, argv); unique_ptr plat; corpus_gen_prop.seed(randomSeed); processArgs(argc, argv, corpus_gen_prop, &corporaFiles, &grey, &plat); // If the user has asked for a random alignment, we select it here (after // random number seed applied). if (use_random_alignment) { min_ue2_align = corpus_gen_prop.rand(0, 15); max_ue2_align = min_ue2_align + 1; } // Limit memory usage, unless the user has specified zero on the command // line or in a config file. if (g_memoryLimit) { setMemoryLimit(g_memoryLimit * numThreads); } // Split threads available up amongst scanner and generator threads. numGeneratorThreads = max(1u, static_cast(numThreads * 0.5)); numScannerThreads = max(1u, numThreads - numGeneratorThreads); ExpressionMap exprMap; loadExpressions(g_exprPath, exprMap); if (!g_allSignatures) { SignatureSet signatures; if (!g_signatureFiles.empty()) { for (string &fname : g_signatureFiles) { loadSignatureList(fname, signatures); } } else { signatures.insert(signatures.end(), g_signatures.begin(), g_signatures.end()); } exprMap = limitToSignatures(exprMap, signatures); } printSettings(corporaFiles, plat.get()); if (exprMap.empty()) { cout << "Warning: no signatures to scan. Exiting." << endl; exit(0); } if (!no_signal_handler) { installSignalHandler(); } if (saveDatabases || loadDatabases) { struct stat st; if (stat(serializePath.c_str(), &st) < 0) { cout << "Unable to stat serialize path '" << serializePath << "': " << strerror(errno) << endl; exit_with_fail(); } } // If we're saving corpora out, truncate the output file. 
if (saveCorpora) { corporaOut = ue2::make_unique(saveCorporaFile); } GroundTruth::global_prep(); auto corpora_source = buildCorpora(corporaFiles, exprMap); if (!g_verbose && g_quiet < 2) { cout << "Only failed tests are displayed." << endl; } SimpleTimer timer; bool success = runTests(*corpora_source, exprMap, plat.get(), grey); cout << "\nTotal elapsed time: " << timer.elapsed() << " secs." << endl; exprMap.clear(); if (!success) { exit_with_fail(); } return 0; }