1 /*
2 * Copyright (c) 2015-2019, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 #include "config.h"
30
31 #include "ResultSet.h"
32 #include "UltimateTruth.h"
33 #include "util/database_util.h"
34 #include "util/ExpressionParser.h"
35 #include "util/string_util.h"
36
37 #include "ue2common.h"
38 #include "common.h"
39 #include "crc32.h"
40 #include "hs.h"
41 #include "hs_internal.h"
42 #include "util/make_unique.h"
43
44 #include "scratch.h"
45 #include "nfa/nfa_api_queue.h"
46 #include "rose/rose_internal.h"
47
48 #include <algorithm>
49 #include <cassert>
50 #include <cstdlib>
51 #include <cstring>
52 #include <fstream>
53 #include <iomanip>
54 #include <iostream>
55 #include <map>
56 #include <set>
57 #include <sstream>
58 #include <unordered_set>
59 #include <vector>
60
61 #include <boost/ptr_container/ptr_vector.hpp>
62
63 using namespace std;
64 using namespace ue2;
65 using boost::ptr_vector;
66
67 #ifndef RELEASE_BUILD
68
69 #include "database.h"
70 #include "state.h"
71
static
hs_error_t open_magic_stream(const hs_database_t *db, unsigned flags,
                             hs_stream_t **stream, hs_scratch_t *scratch,
                             unsigned long long start_offset) {
    // Open a stream that behaves as if scanning had already progressed to
    // start_offset. Only available in non-release builds, as it reaches into
    // the internal hs_stream/RoseEngine layout.
    hs_error_t ret = hs_open_stream(db, flags, stream);
    if (ret != HS_SUCCESS) {
        return ret;
    }

    // Scan some zero bytes first so that the stream's history window is
    // populated before we warp the offset forward. Return value of this
    // priming scan is deliberately ignored.
    const char dummy_data[100] = { 0 };
    UNUSED const struct RoseEngine *rose
        = (const struct RoseEngine *)hs_get_bytecode(db);
    assert(sizeof(dummy_data) >= rose->historyRequired);
    hs_scan_stream(*stream, dummy_data, MIN(start_offset, sizeof(dummy_data)), 0,
                   scratch, nullptr, nullptr);
    // Directly overwrite the stream's internal offset to fake the start point.
    (*stream)->offset = start_offset;
    return ret;
}
90
91 #endif // RELEASE_BUILD
92
// Base class for a compiled pattern database plus the set of expression IDs
// it was built from. Noncopyable: derived classes own C-API database handles.
class BaseDB : boost::noncopyable {
public:
    // Constructor takes iterators over a container of pattern IDs.
    template <class Iter>
    BaseDB(Iter ids_begin, Iter ids_end)
        : ids(ids_begin, ids_end) {}

    virtual ~BaseDB();

    // The set of expression IDs that must return their matches in order.
    unordered_set<unsigned> ordered;

    // The complete set of expression IDs associated with this database.
    unordered_set<unsigned> ids;
};
108
~BaseDB()109 BaseDB::~BaseDB() { }
110
// A compiled Hyperscan database; takes ownership of the hs_database_t and
// frees it on destruction.
class HyperscanDB : public BaseDB {
public:
    // Constructor takes iterators over a container of pattern IDs.
    template <class Iter>
    HyperscanDB(hs_database_t *db_in, Iter ids_begin, Iter ids_end)
        : BaseDB(ids_begin, ids_end), db(db_in) {}

    ~HyperscanDB();

    // Underlying Hyperscan database pointer (owned).
    hs_database_t *db;
};
123
~HyperscanDB()124 HyperscanDB::~HyperscanDB() {
125 hs_free_database(db);
126 }
127
128 #ifdef HS_HYBRID
129
// A compiled Chimera (hybrid) database; takes ownership of the ch_database_t
// and frees it on destruction.
class HybridDB : public BaseDB {
public:
    // Constructor takes iterators over a container of pattern IDs.
    template <class Iter>
    HybridDB(ch_database_t *db_in, Iter ids_begin, Iter ids_end)
        : BaseDB(ids_begin, ids_end), db(db_in) {}

    ~HybridDB();

    // Underlying Chimera database pointer (owned).
    ch_database_t *db;
};
142
~HybridDB()143 HybridDB::~HybridDB() {
144 ch_free_database(db);
145 }
146
147 #endif // HS_HYBRID
148
149 // Used to track the ID and result set.
// Used to track the ID and result set.
namespace {
struct MultiContext {
    MultiContext(unsigned int id_in, const BaseDB &db_in, ResultSet *rs_in,
                 bool single_in, ostream &os)
        : id(id_in), db(db_in), rs(rs_in), single(single_in), out(os) {}
    unsigned int id;       //!< ID we are interested in (single mode)
    int block = 0;         //!< current stream/vector block index
    const BaseDB &db;      //!< database being scanned
    ResultSet *rs;         //!< where matches are recorded
    u64a lastRawMatch = 0; /* store last known unadjusted match location */
    u64a lastOrderMatch = 0; //!< last match offset for in-order patterns
    bool single;           //!< single-pattern mode: only accept our own ID
    bool use_max_offset = false;
    unsigned long long max_offset = 0; /* don't record matches beyond this */
    bool terminated = false; //!< user has instructed us to stop
    bool in_scan_call = false; //!< sanity flag: callbacks only fire mid-scan
    ostream &out;          //!< diagnostic output stream
};
}
169
// Callback used for all (both single and multi-mode) scans. Validates each
// match (right ID, right order, not after termination), applies corpus
// prefix/stream offset adjustments, and records accepted matches in the
// context's ResultSet. Returns non-zero to terminate matching.
static
int HS_CDECL callbackMulti(unsigned int id, unsigned long long from,
                           unsigned long long to,
                           UNUSED unsigned int flags, void *ctx) {
    MultiContext *mctx = static_cast<MultiContext *>(ctx);
    assert(mctx);
    assert(mctx->rs);
    assert(mctx->in_scan_call);

    ostream &out = mctx->out;

    // Sanity check: in single mode, we'd better not be getting matches for the
    // wrong ID!
    if (mctx->single && id != mctx->id) {
        out << "UE2 Match @ (" << from << "," << to << ") for " << id
            << " which is not the id we're looking for" << endl;
        mctx->rs->invalid_id = true;
        return 1;
    }

    // In any mode, we should NEVER get a match from an ID outside our known set.
    if (mctx->db.ids.find(id) == mctx->db.ids.end()) {
        out << "UE2 Match @ (" << from << "," << to << ") for " << id
            << " which is not in the pattern set" << endl;
        mctx->rs->invalid_id = true;
        return 1;
    }

    // A match arriving after we asked for termination is an error; record it
    // but keep processing the event normally.
    if (mctx->terminated) {
        out << "UE2 Match @ (" << from << "," << to << ") for " << id
            << " after termination" << endl;
        mctx->rs->match_after_halt = true;
    }

#ifndef RELEASE_BUILD
    // In dev builds, check that the raw (pre-adjustment) match offsets arrive
    // monotonically across ALL patterns; flags is only consulted here, which
    // is why the parameter is marked UNUSED.
    unsigned int adjustment = flags & HS_MATCH_FLAG_ADJUSTED ? 1 : 0;
    if (mctx->lastRawMatch > to + adjustment) {
        out << "UE2 Match @ (" << from << "," << to << ") for " << id
            << " unordered" << endl;
        mctx->rs->uoom = true;
    }
    mctx->lastRawMatch = to + adjustment;
#endif

    // Patterns known to be ordered must also report in order post-adjustment.
    if (mctx->db.ordered.find(id) != mctx->db.ordered.end()) {
        if (mctx->lastOrderMatch > to) {
            out << "UE2 Match @ (" << from << "," << to << ") for " << id
                << " unordered" << endl;
            mctx->rs->uoom = true;
        }
        mctx->lastOrderMatch = to;
    }

    // Drop matches inside the appended corpus suffix.
    if (mctx->use_max_offset && to > mctx->max_offset) {
        if (echo_matches) {
            out << "UE2 Match @ (" << from << "," << to << ") for " << id
                << " ignored" << endl;
        }
        return 0;
    }

    // Drop matches inside the prepended corpus prefix.
    if (to - g_streamOffset < g_corpora_prefix.size()) {
        if (echo_matches) {
            out << "UE2 Match @ (" << from << "," << to << ") for " << id
                << " too early" << endl;
        }
        return 0;
    }

    // Rebase offsets so they are relative to the real corpus.
    u64a offsetDelta = g_corpora_prefix.size() + g_streamOffset;

    if (from) {
        // from only set in SOM mode, otherwise zero. If we wanted to be REALLY
        // principled about this, we'd probably want to stash the flags
        // somewhere at compile time.
        from -= (from > offsetDelta ? offsetDelta : from);
    }

    to -= offsetDelta;

    if (echo_matches) {
        out << "UE2 Match @ (" << from << "," << to << ") for " << id << endl;
    }

    if (mctx->single || id == mctx->id) {
        mctx->rs->addMatch(from, to, mctx->block);
        if (limit_matches && mctx->rs->matches.size() == limit_matches) {
            if (echo_matches) {
                out << "Terminating matching (hit match limit)" << endl;
            }
            mctx->terminated = true;
            return 1; // terminate matching.
        }
    }

    return 0;
}
268
269 #ifdef HS_HYBRID
270
// Hybrid (Chimera) matcher callback. Records matches with their capture
// groups and optionally echoes them; terminates once limit_matches is hit.
// NOTE(review): unlike callbackMulti, there is no max_offset or stream-offset
// handling here, and `to -= prefix.size()` would wrap if a match ends inside
// the prefix — presumably hybrid runs don't use those features; confirm.
static
ch_callback_t HS_CDECL callbackHybrid(unsigned id, unsigned long long from,
                              unsigned long long to, unsigned, unsigned size,
                              const ch_capture_t *captured, void *ctx) {
    MultiContext *mctx = static_cast<MultiContext *>(ctx);
    assert(mctx);
    assert(mctx->rs);
    assert(mctx->in_scan_call);

    ostream &out = mctx->out;

    // Rebase the end offset relative to the real corpus.
    to -= g_corpora_prefix.size();

    if (mctx->terminated) {
        out << "UE2 Match @ (" << from << "," << to << ") for " << id
            << " after termination" << endl;
        mctx->rs->match_after_halt = true;
    }

    if (mctx->single || id == mctx->id) {
        // Convert the C capture array into a CaptureVec; inactive groups are
        // recorded as (-1, -1).
        CaptureVec cap;
        for (unsigned int i = 0; i < size; i++) {
            if (!(captured[i].flags & CH_CAPTURE_FLAG_ACTIVE)) {
                cap.push_back(make_pair(-1, -1));
            } else {
                cap.push_back(make_pair(captured[i].from, captured[i].to));
            }
        }
        mctx->rs->addMatch(from, to, cap);
    }

    if (echo_matches) {
        out << "Match @ [" << from << "," << to << "] for " << id << endl;
        out << "  Captured " << size << " groups: ";
        for (unsigned int i = 0; i < size; i++) {
            if (!(captured[i].flags & CH_CAPTURE_FLAG_ACTIVE)) {
                out << "{} ";
            } else {
                out << "{" << captured[i].from << "," << captured[i].to << "} ";
            }
        }
        out << endl;
    }

    if (limit_matches && mctx->rs->matches.size() == limit_matches) {
        mctx->terminated = true;
        return CH_CALLBACK_TERMINATE;
    }

    return CH_CALLBACK_CONTINUE;
}
323
324 // Hybrid matcher error callback.
325 static
errorCallback(UNUSED ch_error_event_t errorType,UNUSED unsigned int id,void *,void * ctx)326 ch_callback_t HS_CDECL errorCallback(UNUSED ch_error_event_t errorType,
327 UNUSED unsigned int id, void *,
328 void *ctx) {
329 UNUSED MultiContext *mctx = static_cast<MultiContext *>(ctx);
330 assert(mctx);
331 assert(mctx->rs);
332 assert(mctx->in_scan_call);
333
334 return CH_CALLBACK_SKIP_PATTERN;
335 }
336
337 #endif // HS_HYBRID
338
339 static
filterLeftmostSom(ResultSet & rs)340 void filterLeftmostSom(ResultSet &rs) {
341 if (rs.matches.size() <= 1) {
342 return;
343 }
344
345 set<u64a> seen; // End offsets.
346 auto it = rs.matches.begin();
347 while (it != rs.matches.end()) {
348 if (seen.insert(it->to).second) {
349 ++it; // First time we've seen this end-offset.
350 } else {
351 rs.matches.erase(it++);
352 }
353 }
354 }
355
UltimateTruth(ostream & os,const ExpressionMap & expr,const hs_platform_info_t * plat,const Grey & grey_in,unsigned int streamBlocks)356 UltimateTruth::UltimateTruth(ostream &os, const ExpressionMap &expr,
357 const hs_platform_info_t *plat,
358 const Grey &grey_in, unsigned int streamBlocks)
359 : grey(grey_in), out(os), m_expr(expr), m_xcompile(false),
360 m_streamBlocks(streamBlocks), scratch(nullptr),
361 #ifdef HS_HYBRID
362 chimeraScratch(nullptr),
363 #endif
364 platform(plat) {
365 // Build our mode flags.
366
367 switch (colliderMode) {
368 case MODE_STREAMING:
369 m_mode = HS_MODE_STREAM;
370 break;
371 case MODE_BLOCK:
372 m_mode = HS_MODE_BLOCK;
373 break;
374 case MODE_VECTORED:
375 m_mode = HS_MODE_VECTORED;
376 break;
377 case MODE_HYBRID:
378 m_mode = 0;
379 break;
380 }
381
382 // Set desired SOM precision, if we're in streaming mode.
383 if (colliderMode == MODE_STREAMING) {
384 m_mode |= somPrecisionMode;
385 }
386
387 #ifdef HS_HYBRID
388 if (colliderMode == MODE_HYBRID && !no_groups) {
389 m_mode |= CH_MODE_GROUPS;
390 }
391 #endif
392 }
393
~UltimateTruth()394 UltimateTruth::~UltimateTruth() {
395 #ifdef HS_HYBRID
396 ch_free_scratch(chimeraScratch);
397 #endif
398 hs_free_scratch(scratch);
399 }
400
static
void mangle_scratch(hs_scratch_t *scratch) {
    /* Use our knowledge of the internals of scratch to make a mess.
     * Each region is filled with a distinct poison byte so that any code
     * which wrongly relies on scratch contents persisting between scan
     * calls will misbehave detectably. */

    memset(&scratch->tctxt, 0xc0, sizeof(scratch->tctxt));
    memset(scratch->bstate, 0xd0, scratch->bStateSize);
    memset(scratch->tstate, 0xe0, scratch->tStateSize);
    memset(scratch->fullState, 0xf0, scratch->fullStateSize);

    // Queues are poisoned wholesale, but each queue's back-pointer to the
    // scratch must stay valid.
    for (u32 i = 0; i < scratch->queueCount; i++) {
        struct mq *q = &scratch->queues[i];
        memset(q, 0x01, sizeof(*q));
        q->scratch = scratch;
    }

    memset(scratch->aqa, 0xb0, scratch->activeQueueArraySize);
    for (u32 i = 0; i < DELAY_SLOT_COUNT; i++) {
        memset(scratch->delay_slots[i], 0x05, scratch->delay_fatbit_size);
    }

    memset(scratch->catchup_pq.qm, 0x06,
           scratch->queueCount * sizeof(struct queue_match));
    scratch->catchup_pq.qm_size = 45; // arbitrary garbage size
    memset(&scratch->core_info, 0x07, sizeof(scratch->core_info));
    memset(scratch->deduper.som_start_log[0], 0x90,
           sizeof(u64a) * scratch->deduper.dkey_count);
    memset(scratch->deduper.som_start_log[1], 0x09,
           sizeof(u64a) * scratch->deduper.dkey_count);
    memset(scratch->deduper.log[0], 0xa0, scratch->deduper.log_size);
    memset(scratch->deduper.log[1], 0x0a, scratch->deduper.log_size);
    memset(scratch->deduper.som_log[0], 0xd0, scratch->deduper.log_size);
    memset(scratch->deduper.som_log[1], 0x0d, scratch->deduper.log_size);

    for (u32 i = 0; i < scratch->anchored_literal_region_len; i++) {
        memset(scratch->al_log[i], 0xa0, scratch->anchored_literal_fatbit_size);
    }
    scratch->al_log_sum = 0xf0f; // arbitrary garbage value

    memset(scratch->handled_roles, 0x05, scratch->handledKeyFatbitSize);
    memset(scratch->som_store, 0x06,
           scratch->som_store_count * sizeof(u64a));
    memset(scratch->som_attempted_store, 0x06,
           scratch->som_store_count * sizeof(u64a));
    memset(scratch->som_set_now, 0x03, scratch->som_fatbit_size);
    memset(scratch->som_attempted_set, 0x04, scratch->som_fatbit_size);
    scratch->som_set_now_offset = 45;
    memset(&scratch->fdr_conf, 0x0d, sizeof(scratch->fdr_conf));
    scratch->fdr_conf_offset = 0xe4;
}
450
blockScan(const BaseDB & bdb,const string & buffer,size_t align,match_event_handler callback,void * ctx_in,ResultSet *)451 bool UltimateTruth::blockScan(const BaseDB &bdb, const string &buffer,
452 size_t align, match_event_handler callback,
453 void *ctx_in, ResultSet *) {
454 assert(colliderMode == MODE_BLOCK);
455 assert(!m_xcompile);
456
457 const hs_database_t *db = reinterpret_cast<const HyperscanDB &>(bdb).db;
458 assert(db);
459 MultiContext *ctx = (MultiContext *)ctx_in;
460
461 char *realigned = setupScanBuffer(buffer.c_str(), buffer.size(), align);
462 if (!realigned) {
463 return false;
464 }
465
466 if (use_copy_scratch && !cloneScratch()) {
467 return false;
468 }
469
470 ctx->in_scan_call = true;
471 hs_error_t ret =
472 hs_scan(db, realigned, buffer.size(), 0, scratch, callback, ctx);
473 ctx->in_scan_call = false;
474
475 if (g_verbose) {
476 out << "Scan call returned " << ret << endl;
477 }
478
479 if (ctx->terminated) {
480 if (g_verbose && ret != HS_SCAN_TERMINATED) {
481 out << "Scan should have returned HS_SCAN_TERMINATED, returned "
482 << ret << " instead." << endl;
483 }
484 return ret == HS_SCAN_TERMINATED;
485 }
486
487 if (g_verbose && ret != HS_SUCCESS) {
488 out << "Scan should have returned HS_SUCCESS, returned " << ret
489 << " instead." << endl;
490 }
491
492 if (use_mangle_scratch) {
493 mangle_scratch(scratch);
494 }
495
496 return ret == HS_SUCCESS;
497 }
498
499 static
compressAndCloseStream(hs_stream_t * stream)500 vector<char> compressAndCloseStream(hs_stream_t *stream) {
501 size_t needed;
502 hs_error_t err = hs_compress_stream(stream, nullptr, 0, &needed);
503 if (err != HS_INSUFFICIENT_SPACE) {
504 return {};
505 }
506
507 vector<char> buf(needed);
508 err = hs_compress_stream(stream, buf.data(), needed, &needed);
509 if (err != HS_SUCCESS) {
510 return {};
511 }
512 assert(needed == buf.size());
513
514 err = hs_close_stream(stream, nullptr, nullptr, nullptr);
515 if (err != HS_SUCCESS) {
516 return {};
517 }
518
519 return buf;
520 }
521
522
523 static
compressAndExpandStream(const hs_database_t * db,hs_stream_t * stream)524 hs_stream_t *compressAndExpandStream(const hs_database_t *db,
525 hs_stream_t *stream) {
526 vector<char> buf = compressAndCloseStream(stream);
527 hs_stream_t *out;
528 hs_error_t err = hs_expand_stream(db, &out, buf.data(), buf.size());
529
530 if (err != HS_SUCCESS) {
531 return nullptr;
532 }
533
534 return out;
535 }
536
537 static
compressAndResetExpandStream(const hs_database_t * db,hs_stream_t * stream)538 hs_stream_t *compressAndResetExpandStream(const hs_database_t *db,
539 hs_stream_t *stream) {
540 vector<char> buf = compressAndCloseStream(stream);
541 if (buf.empty()) {
542 return nullptr;
543 }
544
545 hs_stream_t *out;
546
547 hs_error_t err = hs_open_stream(db, 0, &out);
548
549 if (err != HS_SUCCESS) {
550 return nullptr;
551 }
552
553 err = hs_reset_and_expand_stream(out, buf.data(), buf.size(), nullptr,
554 nullptr, nullptr);
555 if (err != HS_SUCCESS) {
556 return nullptr;
557 }
558
559 return out;
560 }
561
streamingScan(const BaseDB & bdb,const string & buffer,size_t align,match_event_handler callback,void * ctx_in,ResultSet * rs)562 bool UltimateTruth::streamingScan(const BaseDB &bdb, const string &buffer,
563 size_t align, match_event_handler callback,
564 void *ctx_in, ResultSet *rs) {
565 assert(colliderMode == MODE_STREAMING);
566 assert(!m_xcompile);
567
568 const hs_database_t *db = reinterpret_cast<const HyperscanDB &>(bdb).db;
569 assert(db);
570 MultiContext *ctx = (MultiContext *)ctx_in;
571
572 // open a stream
573 hs_stream_t *stream;
574 size_t stream_size;
575 int ret;
576
577 ret = hs_stream_size(db, &stream_size);
578 if (ret != HS_SUCCESS) {
579 out << "Unable to size stream." << endl;
580 return false;
581 }
582
583 if (!g_streamOffset) {
584 ret = hs_open_stream(db, 0, &stream);
585 } else {
586 #ifndef RELEASE_BUILD
587 ret = open_magic_stream(db, 0, &stream, scratch, g_streamOffset);
588 #else
589 ret = HS_INVALID;
590 #endif
591 }
592
593 if (ret != HS_SUCCESS) {
594 out << "Unable to open stream." << endl;
595 return false;
596 }
597
598 // scan our data, split into blocks and copied into a temporary buffer
599 // aligned as requested (out of paranoia)
600 unsigned blockSize = buffer.size() / m_streamBlocks;
601 if (blockSize == 0) {
602 blockSize = 1;
603 }
604 const char *ptr = buffer.c_str();
605 const char *end = ptr + buffer.size();
606 ctx->block = 0;
607
608 // We use a do-while loop here so that zero-byte cases still generate at
609 // least one hs_scan_stream call, since it's something users might try.
610 do {
611 if (ptr + blockSize > end) {
612 // last write is a runt
613 blockSize = end - ptr;
614 }
615 char *realigned = setupScanBuffer(ptr, blockSize, align);
616 if (!realigned) {
617 return false;
618 }
619 ctx->in_scan_call = true;
620 DEBUG_PRINTF("scan stream write %u\n", ctx->block);
621 ret = hs_scan_stream(stream, realigned, blockSize, 0, scratch,
622 callback, ctx);
623 DEBUG_PRINTF("scan %u done\n", ctx->block);
624 ctx->in_scan_call = false;
625
626 if (limit_matches && rs->matches.size() == limit_matches) {
627 if (ret != HS_SCAN_TERMINATED) {
628 DEBUG_PRINTF("failure to scan %d\n", ret);
629 return false;
630 }
631 } else if (ret != HS_SUCCESS) {
632 DEBUG_PRINTF("failure to scan %d\n", ret);
633 return false;
634 }
635
636 if (use_copy_scratch && !cloneScratch()) {
637 return false;
638 }
639
640 if (use_copy_stream) {
641 hs_stream_t *s2;
642 ret = hs_copy_stream(&s2, stream);
643 if (ret != HS_SUCCESS) {
644 DEBUG_PRINTF("failure to copy %d\n", ret);
645 return false;
646 }
647 /* do a short write to the old stream so that it is in the wrong
648 * state. */
649 char temp[2] = {0, 0};
650 ret = hs_scan_stream(stream, temp, sizeof(temp), 0, scratch,
651 nullptr, nullptr);
652
653 hs_error_t expected = HS_SUCCESS;
654 if (limit_matches && rs->matches.size() == limit_matches) {
655 expected = HS_SCAN_TERMINATED;
656 }
657 if (ret != expected) {
658 DEBUG_PRINTF("failure to scan %d\n", ret);
659 return false;
660 }
661 ret = hs_close_stream(stream, nullptr, nullptr, nullptr);
662 if (ret != HS_SUCCESS) {
663 DEBUG_PRINTF("failure to close %d\n", ret);
664 return false;
665 }
666 stream = s2;
667 }
668 if (use_mangle_scratch) {
669 mangle_scratch(scratch);
670 }
671
672 if (use_compress_expand) {
673 auto rv = compressAndExpandStream(db, stream);
674 if (!rv) {
675 if (g_verbose) {
676 out << "Compress/Expand failed." << endl;
677 }
678 return false;
679 } else {
680 stream = rv;
681 }
682 }
683
684 if (use_compress_reset_expand) {
685 auto rv = compressAndResetExpandStream(db, stream);
686 if (!rv) {
687 if (g_verbose) {
688 out << "Compress/Expand failed." << endl;
689 }
690 return false;
691 } else {
692 stream = rv;
693 }
694 }
695
696 ptr += blockSize;
697 ctx->block++;
698 } while (ptr < end);
699
700 // close the stream
701 ctx->in_scan_call = true;
702 DEBUG_PRINTF("close stream %u\n", ctx->block);
703 ret = hs_close_stream(stream, scratch, callback, ctx);
704 DEBUG_PRINTF("close stream done\n");
705 ctx->in_scan_call = false;
706
707 if (ret != HS_SUCCESS) {
708 return false;
709 }
710
711 // UE2 cannot dedupe SOM matches across stream boundaries, so we must
712 // filter them out.
713 filterLeftmostSom(*rs);
714
715 return ret == HS_SUCCESS;
716 }
717
vectoredScan(const BaseDB & bdb,const string & buffer,size_t align,match_event_handler callback,void * ctx_in,ResultSet * rs)718 bool UltimateTruth::vectoredScan(const BaseDB &bdb, const string &buffer,
719 size_t align, match_event_handler callback,
720 void *ctx_in, ResultSet *rs) {
721 assert(colliderMode == MODE_VECTORED);
722 assert(!m_xcompile);
723
724 const hs_database_t *db = reinterpret_cast<const HyperscanDB &>(bdb).db;
725 assert(db);
726 MultiContext *ctx = (MultiContext *)ctx_in;
727
728 int ret;
729
730 assert(!g_streamOffset);
731
732 // scan our data, split into blocks and copied into a temporary buffer
733 // aligned as requested (out of paranoia)
734 unsigned blockSize = buffer.size() / m_streamBlocks;
735 if (blockSize == 0) {
736 blockSize = 1;
737 }
738 const char *ptr = buffer.c_str();
739 const char *end = ptr + buffer.size();
740 ctx->block = 0;
741
742 // We use a do-while loop here so that zero-byte cases still generate at
743 // least one hs_scan_stream call, since it's something users might try.
744
745 vector<const char *> data;
746 vector<unsigned int> length;
747
748 u32 block_count = (buffer.size() + blockSize - 1) / blockSize;
749 block_count = MAX(block_count, 1);
750
751 if (block_count > raw_blocks.size()) {
752 raw_blocks.resize(block_count);
753 }
754
755 do {
756 if (ptr + blockSize > end) {
757 // last write is a runt
758 blockSize = end - ptr;
759 }
760 char *realigned = setupVecScanBuffer(ptr, blockSize, align, ctx->block);
761 if (!realigned) {
762 return false;
763 }
764
765 data.push_back(realigned);
766 length.push_back(blockSize);
767
768 ptr += blockSize;
769 ctx->block++;
770
771 } while (ptr < end);
772
773 if (use_copy_scratch && !cloneScratch()) {
774 return false;
775 }
776
777 DEBUG_PRINTF("scan vectored write %u\n", ctx->block);
778 ctx->in_scan_call = true;
779 ret = hs_scan_vector(db, &data[0], &length[0], ctx->block, 0, scratch,
780 callback, ctx);
781 ctx->in_scan_call = false;
782 DEBUG_PRINTF("scan %u done\n", ctx->block);
783 if (use_mangle_scratch) {
784 mangle_scratch(scratch);
785 }
786
787 rs->dupe_matches.clear(); /* TODO: dedupe across vectored blocks */
788
789 if (limit_matches && rs->matches.size() == limit_matches) {
790 if (ret != HS_SCAN_TERMINATED) {
791 DEBUG_PRINTF("failure to scan %d\n", ret);
792 return false;
793 }
794 } else if (ret != HS_SUCCESS) {
795 DEBUG_PRINTF("failure to scan %d\n", ret);
796 return false;
797 }
798
799 // UE2 cannot dedupe SOM matches across vector block boundaries, so we must
800 // filter them out.
801 filterLeftmostSom(*rs);
802
803 return true;
804 }
805
806 #ifdef HS_HYBRID
hybridScan(const BaseDB & bdb,const string & buffer,size_t align,ch_match_event_handler callback,ch_error_event_handler error_callback,void * ctx_in,ResultSet *)807 bool UltimateTruth::hybridScan(const BaseDB &bdb, const string &buffer,
808 size_t align, ch_match_event_handler callback,
809 ch_error_event_handler error_callback,
810 void *ctx_in, ResultSet *) {
811 assert(colliderMode == MODE_HYBRID);
812 assert(!m_xcompile);
813
814 const ch_database_t *db = reinterpret_cast<const HybridDB &>(bdb).db;
815 assert(db);
816 MultiContext *ctx = (MultiContext *)ctx_in;
817
818 char *realigned = setupScanBuffer(buffer.c_str(), buffer.size(), align);
819 if (!realigned) {
820 return false;
821 }
822
823 if (use_copy_scratch && !cloneScratch()) {
824 return false;
825 }
826
827 ctx->in_scan_call = true;
828 ch_error_t ret =
829 ch_scan(db, realigned, buffer.size(), 0, chimeraScratch, callback,
830 error_callback, ctx);
831 ctx->in_scan_call = false;
832
833 if (g_verbose) {
834 out << "Scan call returned " << ret << endl;
835 }
836
837 if (ctx->terminated) {
838 if (g_verbose && ret != CH_SCAN_TERMINATED) {
839 out << "Scan should have returned CH_SCAN_TERMINATED, returned "
840 << ret << " instead." << endl;
841 }
842 return ret == CH_SCAN_TERMINATED;
843 }
844
845 if (g_verbose && ret != CH_SUCCESS) {
846 out << "Scan should have returned CH_SUCCESS, returned " << ret
847 << " instead." << endl;
848 }
849
850 return ret == CH_SUCCESS;
851 }
852 #endif
853
run(unsigned int id,shared_ptr<const BaseDB> bdb,const string & buffer,bool single_pattern,unsigned int align,ResultSet & rs)854 bool UltimateTruth::run(unsigned int id, shared_ptr<const BaseDB> bdb,
855 const string &buffer, bool single_pattern,
856 unsigned int align, ResultSet &rs) {
857 assert(!m_xcompile);
858 assert(bdb);
859
860 // Ensure that scratch is appropriate for this database.
861 if (!allocScratch(bdb)) {
862 out << "Scratch alloc failed." << endl;
863 return false;
864 }
865
866 MultiContext ctx(id, *bdb, &rs, single_pattern, out);
867 if (!g_corpora_suffix.empty()) {
868 ctx.use_max_offset = true;
869 ctx.max_offset = buffer.size() - g_corpora_suffix.size();
870 }
871
872 switch (colliderMode) {
873 case MODE_BLOCK:
874 return blockScan(*bdb, buffer, align, callbackMulti, &ctx, &rs);
875 case MODE_STREAMING:
876 return streamingScan(*bdb, buffer, align, callbackMulti, &ctx, &rs);
877 case MODE_VECTORED:
878 return vectoredScan(*bdb, buffer, align, callbackMulti, &ctx, &rs);
879 case MODE_HYBRID:
880 #ifdef HS_HYBRID
881 return hybridScan(*bdb, buffer, align, callbackHybrid, errorCallback,
882 &ctx, &rs);
883 #else
884 cerr << "Hybrid mode not available in this build." << endl;
885 abort();
886 #endif
887 break;
888 }
889
890 assert(0);
891 return false;
892 }
893
894 static
isOrdered(const string & expr,unsigned int flags)895 bool isOrdered(const string &expr, unsigned int flags) {
896 // SOM doesn't produce ordered matches?
897 if (flags & HS_FLAG_SOM_LEFTMOST) {
898 return false;
899 }
900
901 hs_expr_info_t *info = nullptr;
902 hs_compile_error_t *error = nullptr;
903 hs_error_t err = hs_expression_info(expr.c_str(), flags, &info, &error);
904 if (err != HS_SUCCESS) {
905 // Expression will fail compilation and report error elsewhere.
906 free(info);
907 hs_free_compile_error(error);
908 return false;
909 }
910
911 assert(info);
912
913 // Any pattern that does not require offset adjustment should produce
914 // matches in order.
915 bool ordered = !info->unordered_matches;
916 free(info);
917 return ordered;
918 }
919
// Compile a set of expressions into a HyperscanDB, via either the standard
// or (when use_literal_api is set) the pure-literal internal compile entry
// point. On failure, returns nullptr with the compile error in `error`.
static unique_ptr<BaseDB>
compileHyperscan(vector<const char *> &patterns, vector<unsigned> &flags,
                 vector<unsigned> &idsvec, ptr_vector<hs_expr_ext> &ext,
                 unsigned mode, const hs_platform_info *platform, string &error,
                 const Grey &grey) {
    const unsigned count = patterns.size();
    hs_database_t *db = nullptr;
    hs_compile_error_t *compile_err;
    hs_error_t err;

    if (use_literal_api) {
        // Compute length of each pattern.
        vector<size_t> lens(count);
        for (unsigned int i = 0; i < count; i++) {
            lens[i] = strlen(patterns[i]);
        }
        err = hs_compile_lit_multi_int(&patterns[0], &flags[0], &idsvec[0],
                                       ext.c_array(), &lens[0], count, mode,
                                       platform, &db, &compile_err, grey);
    } else {
        err = hs_compile_multi_int(&patterns[0], &flags[0], &idsvec[0],
                                   ext.c_array(), count, mode, platform, &db,
                                   &compile_err, grey);
    }

    if (err != HS_SUCCESS) {
        error = compile_err->message;
        hs_free_compile_error(compile_err);
        return nullptr;
    }

    return ue2::make_unique<HyperscanDB>(db, idsvec.begin(), idsvec.end());
}
953
954 #ifdef HS_HYBRID
// Compile a set of expressions with the Chimera compiler into a HybridDB.
// On failure, returns nullptr with the compile error message in `error`.
static unique_ptr<BaseDB>
compileHybrid(vector<const char *> &patterns,
              vector<unsigned> &flags, vector<unsigned> &idsvec,
              unsigned mode, const hs_platform_info *platform, string &error) {
    const unsigned count = patterns.size();
    ch_database_t *db = nullptr;
    ch_compile_error_t *compile_err;

    ch_error_t err = ch_compile_multi(&patterns[0], &flags[0],
                                      &idsvec[0], count, mode, platform, &db,
                                      &compile_err);

    if (err != HS_SUCCESS) {
        error = compile_err->message;
        ch_free_compile_error(compile_err);
        return nullptr;
    }

    return ue2::make_unique<HybridDB>(db, idsvec.begin(), idsvec.end());
}
975 #endif
976
compile(const set<unsigned> & ids,string & error) const977 shared_ptr<BaseDB> UltimateTruth::compile(const set<unsigned> &ids,
978 string &error) const {
979 // Build our vectors for compilation
980 const size_t count = ids.size();
981 vector<string> expressions(count);
982 vector<unsigned> idsvec(ids.begin(), ids.end());
983 vector<unsigned> flags(count);
984 vector<bool> check_ordered(count, false);
985 ptr_vector<hs_expr_ext> ext;
986 ext.reserve(count);
987
988 size_t n = 0;
989 for (const auto &id : ids) {
990 auto j = m_expr.find(id);
991 if (j == m_expr.end()) {
992 error = "Unable to find ID.";
993 return nullptr;
994 }
995
996 ext.push_back(new hs_expr_ext);
997 bool must_be_ordered;
998 if (!readExpression(j->second, expressions[n], &flags[n], &ext[n],
999 &must_be_ordered)) {
1000 ostringstream oss;
1001 oss << "Unable to decode flags: '" << j->first << ":"
1002 << j->second << "'.";
1003 error = oss.str();
1004 return nullptr;
1005 }
1006
1007 check_ordered[n] = must_be_ordered;
1008
1009 if (force_utf8) {
1010 flags[n] |= HS_FLAG_UTF8;
1011 }
1012
1013 if (force_prefilter) {
1014 flags[n] |= HS_FLAG_PREFILTER;
1015 }
1016
1017 if (somFlags) {
1018 flags[n] |= somFlags;
1019 }
1020
1021 if (force_edit_distance) {
1022 ext[n].flags |= HS_EXT_FLAG_EDIT_DISTANCE;
1023 ext[n].edit_distance = edit_distance;
1024 }
1025
1026 if (colliderMode == MODE_HYBRID) {
1027 if (ext[n].flags) {
1028 error = "Hybrid does not support extended parameters.";
1029 return nullptr;
1030 }
1031 // We can also strip some other flags in the hybrid matcher.
1032 flags[n] &= ~HS_FLAG_PREFILTER; // prefilter always used
1033 flags[n] &= ~HS_FLAG_ALLOWEMPTY; // empty always allowed
1034 flags[n] &= ~HS_FLAG_SOM_LEFTMOST; // SOM always on
1035 }
1036
1037 n++;
1038 }
1039
1040 // Our compiler takes an array of plain ol' C strings.
1041 vector<const char *> patterns(count);
1042 for (unsigned int i = 0; i < count; i++) {
1043 patterns[i] = expressions[i].c_str();
1044 }
1045
1046 // Compile
1047 if (!count) { /* slight hack to allow us to compile empty sets cleanly */
1048 patterns.push_back(nullptr);
1049 flags.push_back(0);
1050 idsvec.push_back(0);
1051 }
1052
1053 unique_ptr<BaseDB> db;
1054 if (colliderMode == MODE_HYBRID) {
1055 #ifdef HS_HYBRID
1056 db = compileHybrid(patterns, flags, idsvec, m_mode, platform, error);
1057 #else
1058 error = "Hybrid mode not available in this build.";
1059 #endif
1060 } else {
1061 db = compileHyperscan(patterns, flags, idsvec, ext, m_mode,
1062 platform, error, grey);
1063 }
1064
1065 if (!db) {
1066 return nullptr;
1067 }
1068
1069 // Track IDs of patterns that require ordering for validation at match
1070 // time.
1071 for (unsigned int i = 0; i < count; i++) {
1072 bool is_ordered = isOrdered(expressions[i], flags[i]);
1073 if (check_ordered[i] && !is_ordered) {
1074 error = "Ordering required, but hs_expression_info suggests "
1075 "that ordering is not guaranteed.";
1076 return nullptr;
1077 }
1078 if (is_ordered) {
1079 db->ordered.insert(idsvec[i]);
1080 }
1081 }
1082
1083 return move(db);
1084 }
1085
allocScratch(shared_ptr<const BaseDB> db)1086 bool UltimateTruth::allocScratch(shared_ptr<const BaseDB> db) {
1087 assert(db);
1088
1089 // We explicitly avoid running scratch allocators for the same BaseDB
1090 // over and over again by retaining a shared_ptr to the last one we saw.
1091 if (db == last_db) {
1092 return true;
1093 }
1094
1095 if (colliderMode == MODE_HYBRID) {
1096 #ifdef HS_HYBRID
1097 ch_error_t err = ch_alloc_scratch(
1098 reinterpret_cast<const HybridDB *>(db.get())->db, &chimeraScratch);
1099 if (err != HS_SUCCESS) {
1100 return false;
1101 }
1102 #endif // HS_HYBRID
1103 } else {
1104 hs_error_t err = hs_alloc_scratch(
1105 reinterpret_cast<const HyperscanDB *>(db.get())->db, &scratch);
1106 if (err != HS_SUCCESS) {
1107 return false;
1108 }
1109 }
1110
1111 last_db = db;
1112 return true;
1113 }
1114
cloneScratch(void)1115 bool UltimateTruth::cloneScratch(void) {
1116 if (colliderMode == MODE_HYBRID) {
1117 #ifdef HS_HYBRID
1118 ch_scratch_t *old_scratch = chimeraScratch;
1119 ch_scratch_t *new_scratch;
1120 ch_error_t ret = ch_clone_scratch(chimeraScratch, &new_scratch);
1121 if (ret != CH_SUCCESS) {
1122 DEBUG_PRINTF("failure to clone %d\n", ret);
1123 return false;
1124 }
1125 chimeraScratch = new_scratch;
1126 ret = ch_free_scratch(old_scratch);
1127 if (ret != CH_SUCCESS) {
1128 DEBUG_PRINTF("failure to free %d\n", ret);
1129 return false;
1130 }
1131 DEBUG_PRINTF("hybrid scratch cloned from %p to %p\n",
1132 old_scratch, chimeraScratch);
1133 #endif // HS_HYBRID
1134 } else {
1135 hs_scratch_t *old_scratch = scratch;
1136 hs_scratch_t *new_scratch;
1137 hs_error_t ret = hs_clone_scratch(scratch, &new_scratch);
1138 if (ret != HS_SUCCESS) {
1139 DEBUG_PRINTF("failure to clone %d\n", ret);
1140 return false;
1141 }
1142 scratch = new_scratch;
1143 ret = hs_free_scratch(old_scratch);
1144 if (ret != HS_SUCCESS) {
1145 DEBUG_PRINTF("failure to free %d\n", ret);
1146 return false;
1147 }
1148 DEBUG_PRINTF("scratch cloned from %p to %p\n", old_scratch, scratch);
1149 }
1150 return true;
1151 }
1152
1153 // Return an appropriately aligned (modulo max align) copy of the given buffer
setupScanBuffer(const char * begin,size_t len,size_t align)1154 char * UltimateTruth::setupScanBuffer(const char *begin, size_t len,
1155 size_t align) {
1156 if (align >= MAX_MAX_UE2_ALIGN) {
1157 return nullptr;
1158 }
1159
1160 // Realloc if necessary
1161 size_t maxBufSize = len + MAX_MAX_UE2_ALIGN;
1162 if (maxBufSize > m_scanBuf.size()) {
1163 m_scanBuf.resize(maxBufSize);
1164 }
1165
1166 uintptr_t currentAlign = (uintptr_t)(m_scanBuf.data()) % MAX_MAX_UE2_ALIGN;
1167 char *ptr;
1168
1169 ptrdiff_t diff = align - currentAlign;
1170 if (diff >= 0) {
1171 ptr = (m_scanBuf.data() + diff);
1172 } else {
1173 ptr = (m_scanBuf.data() + (MAX_MAX_UE2_ALIGN + diff));
1174 }
1175 assert((uintptr_t)(ptr) % MAX_MAX_UE2_ALIGN == align);
1176
1177 // copy the buffer
1178 memcpy(ptr, begin, len);
1179 return ptr;
1180 }
1181
setupVecScanBuffer(const char * begin,size_t len,size_t align,u32 block_id)1182 char *UltimateTruth::setupVecScanBuffer(const char *begin, size_t len,
1183 size_t align, u32 block_id) {
1184 if (align >= MAX_MAX_UE2_ALIGN) {
1185 return nullptr;
1186 }
1187
1188 assert(block_id < raw_blocks.size());
1189 vector<char> &raw = raw_blocks[block_id];
1190
1191 // Realloc if necessary
1192 size_t maxBufSize = len + MAX_MAX_UE2_ALIGN;
1193 if (maxBufSize > raw.size()) {
1194 raw.resize(maxBufSize);
1195 }
1196 assert(maxBufSize <= raw.size());
1197
1198 uintptr_t currentAlign = (uintptr_t)(&raw[0]) % MAX_MAX_UE2_ALIGN;
1199 char *ptr;
1200
1201 ptrdiff_t diff = align - currentAlign;
1202 if (diff >= 0) {
1203 ptr = (&raw[0] + diff);
1204 } else {
1205 ptr = (&raw[0] + (MAX_MAX_UE2_ALIGN + diff));
1206 }
1207 assert((uintptr_t)(ptr) % MAX_MAX_UE2_ALIGN == align);
1208
1209 // copy the buffer
1210 memcpy(ptr, begin, len);
1211 return ptr;
1212 }
1213
saveDatabase(const BaseDB & bdb,const string & filename) const1214 bool UltimateTruth::saveDatabase(const BaseDB &bdb,
1215 const string &filename) const {
1216 if (colliderMode == MODE_HYBRID) {
1217 cerr << "Hybrid mode doesn't support serialization." << endl;
1218 abort();
1219 } else {
1220 return ::saveDatabase(reinterpret_cast<const HyperscanDB *>(&bdb)->db,
1221 filename.c_str(), g_verbose);
1222 }
1223 return false;
1224 }
1225
1226 shared_ptr<BaseDB>
loadDatabase(const string & filename,const std::set<unsigned> & ids) const1227 UltimateTruth::loadDatabase(const string &filename,
1228 const std::set<unsigned> &ids) const {
1229 shared_ptr<BaseDB> db;
1230
1231 if (colliderMode == MODE_HYBRID) {
1232 cerr << "Hybrid mode doesn't support deserialization." << endl;
1233 abort();
1234 } else {
1235 hs_database_t *hs_db = ::loadDatabase(filename.c_str(), g_verbose);
1236 if (!hs_db) {
1237 return nullptr;
1238 }
1239
1240 db = make_shared<HyperscanDB>(hs_db, ids.begin(), ids.end());
1241 }
1242
1243 assert(db);
1244
1245 // Fill db::ordered with the expressions that require the ordered flag.
1246 for (const auto &id : ids) {
1247 auto j = m_expr.find(id);
1248 if (j == m_expr.end()) {
1249 cerr << "Can't find expression with ID " << id << endl;
1250 assert(0);
1251 db.reset();
1252 return db;
1253 }
1254 string expr;
1255 hs_expr_ext ext;
1256 unsigned int flags;
1257 if (!readExpression(j->second, expr, &flags, &ext)) {
1258 cerr << "Can't parse expression with ID " << id << ": "
1259 << j->second << endl;
1260 assert(0);
1261 db.reset();
1262 return db;
1263 }
1264 if (isOrdered(expr, flags)) {
1265 db->ordered.insert(id);
1266 }
1267 }
1268
1269 return db;
1270 }
1271
// Return the mode value (m_mode) that databases are compiled with; folded
// into dbSettingsHash() below so that databases built with different modes
// hash to distinct filenames.
unsigned int UltimateTruth::describe() const {
    return m_mode;
}
1275
1276 // Hash the settings used to compile a database, returning a string that can be
1277 // used as a filename.
dbSettingsHash(const set<unsigned int> & ids) const1278 string UltimateTruth::dbSettingsHash(const set<unsigned int> &ids) const {
1279 // create a single string to contain a description of the db
1280 ostringstream info_oss;
1281
1282 // settings from UltimateTruth::describe()
1283 info_oss << ' ' << describe() << ' ';
1284
1285 // our set
1286 for (unsigned int id : ids) {
1287 info_oss << id << ' ';
1288 }
1289
1290 string info = info_oss.str();
1291
1292 u32 crc = Crc32c_ComputeBuf(0, info.data(), info.size());
1293
1294 // return STL string with printable version of digest
1295 ostringstream oss;
1296 oss << hex << setw(8) << setfill('0') << crc << dec;
1297
1298 return oss.str();
1299 }
1300
dbFilename(const set<unsigned int> & ids) const1301 string UltimateTruth::dbFilename(const set<unsigned int> &ids) const {
1302 ostringstream oss;
1303 oss << serializePath << '/' << dbSettingsHash(ids) << ".db";
1304 return oss.str();
1305 }
1306