1 /*
2 * Copyright (c) 2015-2019, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 #ifdef _WIN32
30 #define PCRE_STATIC
31 #endif
32 #include "config.h"
33
34 #include "common.h"
35 #include "ExpressionParser.h"
36 #include "expressions.h"
37 #include "GroundTruth.h"
38 #include "pcre_util.h"
39
40 #include "hs_compile.h" // for hs_expr_ext
41 #include "ue2common.h"
42 #include "parser/control_verbs.h"
43 #include "parser/Parser.h"
44 #include "parser/parse_error.h"
45 #include "util/make_unique.h"
46 #include "util/string_util.h"
47 #include "util/unicode_def.h"
48 #include "util/unordered.h"
49
50 #include <algorithm>
51 #include <cassert>
52 #include <cstdio>
53 #include <cstdlib>
54 #include <cstring>
55 #include <ostream>
56 #include <sstream>
57 #include <string>
58 #include <vector>
59
60 #include <pcre.h>
61
62 /* -X, -Y support
63 * as PCRE performance is `non-linear' and these options add a large amount of
64 * scanning, the following short cuts are used:
65 * 1: the suffix is not scanned - we are more interested in the matches from
66 * the original corpora.
67 * 2: only the last 50 bytes of the prefix is scanned. This may lead to some
68 * minor correctness issues for a few patterns.
69 */
70
71 using namespace std;
72 using namespace ue2;
73
74 // We store matches in a hash table as we're likely to see lots of them. These
75 // are moved into a ResultSet at the end.
76 using PcreMatchSet = ue2::ue2_unordered_set<pair<unsigned, unsigned>>;
77
78 namespace {
79 struct CalloutContext {
CalloutContext__anonfb33cbd20111::CalloutContext80 explicit CalloutContext(ostream &os) : out(os) {}
81 ostream &out;
82 PcreMatchSet matches;
83 };
84 }
85
86 static
pcreCallOut(pcre_callout_block * block)87 int pcreCallOut(pcre_callout_block *block) {
88 assert(block);
89 assert(block->callout_data);
90 CalloutContext *ctx = static_cast<CalloutContext *>(block->callout_data);
91
92 if (echo_matches) {
93 ctx->out << "PCRE Match @ (" << block->start_match << ","
94 << block->current_position << ")" << endl;
95 }
96
97 unsigned int from = block->start_match;
98 unsigned int to = block->current_position;
99 assert(from <= to);
100
101 ctx->matches.insert(make_pair(from, to));
102 return 1;
103 }
104
105 static
decodeExprPcre(string & expr,unsigned * flags,bool * highlander,bool * prefilter,bool * som,bool * combination,bool * quiet,hs_expr_ext * ext)106 bool decodeExprPcre(string &expr, unsigned *flags, bool *highlander,
107 bool *prefilter, bool *som, bool *combination,
108 bool *quiet, hs_expr_ext *ext) {
109 string regex;
110 unsigned int hs_flags = 0;
111 if (!readExpression(expr, regex, &hs_flags, ext)) {
112 return false;
113 }
114
115 if (use_literal_api) {
116 // filter out flags not supported by pure literal API.
117 u32 not_supported = HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8 |
118 HS_FLAG_UCP | HS_FLAG_PREFILTER;
119 hs_flags &= ~not_supported;
120 force_utf8 = false;
121 force_prefilter = false;
122 }
123
124 expr.swap(regex);
125
126 if (!getPcreFlags(hs_flags, flags, highlander, prefilter, som,
127 combination, quiet)) {
128 return false;
129 }
130
131 if (force_utf8) {
132 *flags |= PCRE_UTF8;
133 }
134
135 if (force_prefilter) {
136 *prefilter = true;
137 }
138
139 return true;
140 }
141
142 static
pcreErrStr(int err)143 string pcreErrStr(int err) {
144 switch (err) {
145 case PCRE_ERROR_NOMATCH:
146 return "PCRE_ERROR_NOMATCH";
147 case PCRE_ERROR_NULL:
148 return "PCRE_ERROR_NULL";
149 case PCRE_ERROR_BADOPTION:
150 return "PCRE_ERROR_BADOPTION";
151 case PCRE_ERROR_BADMAGIC:
152 return "PCRE_ERROR_BADMAGIC";
153 #if defined(PCRE_ERROR_UNKNOWN_OPCODE)
154 case PCRE_ERROR_UNKNOWN_OPCODE:
155 return "PCRE_ERROR_UNKNOWN_OPCODE";
156 #else
157 case PCRE_ERROR_UNKNOWN_NODE:
158 return "PCRE_ERROR_UNKNOWN_NODE";
159 #endif
160 case PCRE_ERROR_NOMEMORY:
161 return "PCRE_ERROR_NOMEMORY";
162 case PCRE_ERROR_NOSUBSTRING:
163 return "PCRE_ERROR_NOSUBSTRING";
164 case PCRE_ERROR_MATCHLIMIT:
165 return "PCRE_ERROR_MATCHLIMIT";
166 case PCRE_ERROR_CALLOUT:
167 return "PCRE_ERROR_CALLOUT";
168 case PCRE_ERROR_BADUTF8:
169 return "PCRE_ERROR_BADUTF8";
170 case PCRE_ERROR_BADUTF8_OFFSET:
171 return "PCRE_ERROR_BADUTF8_OFFSET";
172 case PCRE_ERROR_PARTIAL:
173 return "PCRE_ERROR_PARTIAL";
174 case PCRE_ERROR_BADPARTIAL:
175 return "PCRE_ERROR_BADPARTIAL";
176 case PCRE_ERROR_INTERNAL:
177 return "PCRE_ERROR_INTERNAL";
178 case PCRE_ERROR_BADCOUNT:
179 return "PCRE_ERROR_BADCOUNT";
180 #if defined(PCRE_ERROR_RECURSIONLIMIT)
181 case PCRE_ERROR_RECURSIONLIMIT:
182 return "PCRE_ERROR_RECURSIONLIMIT";
183 #endif
184 case PCRE_ERROR_DFA_UITEM:
185 return "PCRE_ERROR_DFA_UITEM";
186 case PCRE_ERROR_DFA_UCOND:
187 return "PCRE_ERROR_DFA_UCOND";
188 case PCRE_ERROR_DFA_UMLIMIT:
189 return "PCRE_ERROR_DFA_UMLIMIT";
190 case PCRE_ERROR_DFA_WSSIZE:
191 return "PCRE_ERROR_DFA_WSSIZE";
192 case PCRE_ERROR_DFA_RECURSE:
193 return "PCRE_ERROR_DFA_RECURSE";
194 default:
195 {
196 ostringstream oss;
197 oss << "Unknown PCRE error (value: " << err << ")";
198 return oss.str();
199 }
200 }
201 }
202
203 /* that is, a mode provided by native hyperscan */
204 static
isStandardMode(unsigned int mode)205 bool isStandardMode(unsigned int mode) {
206 return mode == MODE_BLOCK
207 || mode == MODE_STREAMING
208 || mode == MODE_VECTORED;
209 }
210
GroundTruth(ostream & os,const ExpressionMap & expr,unsigned long int limit,unsigned long int limit_recursion)211 GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr,
212 unsigned long int limit,
213 unsigned long int limit_recursion)
214 : out(os), m_expr(expr), matchLimit(limit),
215 matchLimitRecursion(limit_recursion) {}
216
global_prep()217 void GroundTruth::global_prep() {
218 if (isStandardMode(colliderMode)) {
219 // We're using pcre callouts
220 pcre_callout = &pcreCallOut;
221 }
222 }
223
224 static
addCallout(string & re)225 void addCallout(string &re) {
226 // If the string begins with "(*UTF8)" or "(*UTF8)(*UCP)", we want to keep
227 // it at the front. We reuse the control verbs mini-parser for this.
228 size_t startpos = 0;
229 try {
230 ue2::ParseMode mode;
231 const char *ptr = ue2::read_control_verbs(
232 re.c_str(), re.c_str() + re.size(), 0, mode);
233 startpos = ptr - re.c_str();
234 } catch (const ue2::ParseError &err) {
235 // fall through
236 }
237 assert(startpos <= re.length());
238 re.insert(startpos, "(?:");
239 // We include a \E to close any open \Q quoted block. If there isn't
240 // one, pcre will ignore the \E.
241 re.append("\\E)(?C)");
242 }
243
244 static
isUtf8(const CompiledPcre & compiled)245 bool isUtf8(const CompiledPcre &compiled) {
246 unsigned long int options = 0;
247 pcre_fullinfo(compiled.bytecode, NULL, PCRE_INFO_OPTIONS, &options);
248 return options & PCRE_UTF8;
249 }
250
251 unique_ptr<CompiledPcre>
compile(unsigned id,bool no_callouts)252 GroundTruth::compile(unsigned id, bool no_callouts) {
253 bool highlander = false;
254 bool prefilter = false;
255 bool som = false;
256 bool combination = false;
257 bool quiet = false;
258
259 // we can still match approximate matching patterns with PCRE if edit
260 // distance 0 is requested
261 if (force_edit_distance && edit_distance) {
262 throw SoftPcreCompileFailure("Edit distance not supported by PCRE.");
263 }
264
265 ExpressionMap::const_iterator i = m_expr.find(id);
266 if (i == m_expr.end()) {
267 throw PcreCompileFailure("ID not found in expression map.");
268 }
269
270 string re(i->second);
271 unsigned flags;
272 hs_expr_ext ext;
273
274 // Decode the flags
275 if (!decodeExprPcre(re, &flags, &highlander, &prefilter, &som,
276 &combination, &quiet, &ext)) {
277 throw PcreCompileFailure("Unable to decode flags.");
278 }
279
280 // When hyperscan literal api is on, transfer the regex string into hex.
281 if (use_literal_api && !combination) {
282 unsigned char *pat
283 = reinterpret_cast<unsigned char *>(const_cast<char *>(re.c_str()));
284 char *str = makeHex(pat, re.length());
285 if (!str) {
286 throw PcreCompileFailure("makeHex() malloc failure.");
287 }
288 re.assign(str);
289 free(str);
290 }
291
292 // filter out flags not supported by PCRE
293 u64a supported = HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET |
294 HS_EXT_FLAG_MIN_LENGTH;
295 if (use_literal_api) {
296 ext.flags &= 0ULL;
297 ext.min_offset = 0;
298 ext.max_offset = MAX_OFFSET;
299 ext.min_length = 0;
300 ext.edit_distance = 0;
301 ext.hamming_distance = 0;
302 }
303 if (ext.flags & ~supported) {
304 // edit distance is a known unsupported flag, so just throw a soft error
305 if (ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) {
306 throw SoftPcreCompileFailure("Edit distance not supported by PCRE.");
307 }
308 if (ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE) {
309 throw SoftPcreCompileFailure(
310 "Hamming distance not supported by PCRE.");
311 }
312 throw PcreCompileFailure("Unsupported extended flags.");
313 }
314
315 // Hybrid mode implies SOM.
316 if (colliderMode == MODE_HYBRID) {
317 assert(!use_NFA);
318 som = true;
319 }
320
321 // SOM flags might be set globally.
322 som |= !!somFlags;
323
324 // For traditional Hyperscan, add global callout to pattern.
325 if (!combination && !no_callouts && isStandardMode(colliderMode)) {
326 addCallout(re);
327 }
328
329 // Compile the pattern
330 const char *errptr = nullptr;
331 int errloc = 0;
332 int errcode = 0;
333
334 unique_ptr<CompiledPcre> compiled = make_unique<CompiledPcre>();
335 compiled->utf8 = flags & PCRE_UTF8;
336 compiled->highlander = highlander;
337 compiled->prefilter = prefilter;
338 compiled->som = som;
339 compiled->combination = combination;
340 compiled->quiet = quiet;
341 compiled->min_offset = ext.min_offset;
342 compiled->max_offset = ext.max_offset;
343 compiled->min_length = ext.min_length;
344 compiled->expression = i->second; // original PCRE
345 flags |= PCRE_NO_AUTO_POSSESS;
346
347 if (compiled->combination) {
348 compiled->pl.parseLogicalCombination(id, re.c_str(), ~0U, 0, ~0ULL);
349 compiled->pl.logicalKeyRenumber();
350 compiled->report = id;
351 return compiled;
352 }
353
354 compiled->bytecode =
355 pcre_compile2(re.c_str(), flags, &errcode, &errptr, &errloc, nullptr);
356
357 if (!compiled->bytecode || errptr) {
358 assert(errcode);
359 ostringstream oss;
360 oss << "Failed to compile expression '" << re << '\'';
361 oss << " (" << errptr << " at " << errloc << ").";
362 if (errcode == 20) { // "regular expression is too large"
363 throw SoftPcreCompileFailure(oss.str());
364 } else if (errcode == 25) { // "lookbehind assertion is not fixed length"
365 throw SoftPcreCompileFailure(oss.str());
366 } else {
367 throw PcreCompileFailure(oss.str());
368 }
369 }
370
371 // Study the pattern
372 shared_ptr<pcre_extra> extra(pcre_study(compiled->bytecode, 0, &errptr),
373 free);
374 if (errptr) {
375 ostringstream oss;
376 oss << "Error studying pattern (" << errptr << ").";
377 throw PcreCompileFailure(oss.str());
378 }
379
380 int infoRes =
381 pcre_fullinfo(compiled->bytecode, extra.get(), PCRE_INFO_CAPTURECOUNT,
382 &compiled->captureCount);
383 if (infoRes < PCRE_ERROR_NOMATCH) {
384 ostringstream oss;
385 oss << "Error determining number of capturing subpatterns ("
386 << pcreErrStr(infoRes) << ").";
387 throw PcreCompileFailure(oss.str());
388 }
389
390 compiled->utf8 |= isUtf8(*compiled);
391
392 return compiled;
393 }
394
395 static
filterLeftmostSom(ResultSet & rs)396 void filterLeftmostSom(ResultSet &rs) {
397 if (rs.matches.size() <= 1) {
398 return;
399 }
400
401 set<u64a> seen; // End offsets.
402 set<MatchResult>::iterator it = rs.matches.begin();
403 while (it != rs.matches.end()) {
404 if (seen.insert(it->to).second) {
405 ++it; // First time we've seen this end-offset.
406 } else {
407 rs.matches.erase(it++); // Dupe with a "righter" SOM.
408 }
409 }
410 }
411
412 static
filterExtParams(ResultSet & rs,const CompiledPcre & compiled)413 void filterExtParams(ResultSet &rs, const CompiledPcre &compiled) {
414 set<MatchResult>::iterator it = rs.matches.begin();
415 while (it != rs.matches.end()) {
416 unsigned int from = it->from, to = it->to;
417 unsigned int len = to - from;
418 if (to < compiled.min_offset || to > compiled.max_offset ||
419 len < compiled.min_length) {
420 rs.matches.erase(it++);
421 } else {
422 ++it;
423 }
424 }
425 }
426
427 static
scanBasic(const CompiledPcre & compiled,const string & buffer,const pcre_extra & extra,vector<int> & ovector,CalloutContext & ctx)428 int scanBasic(const CompiledPcre &compiled, const string &buffer,
429 const pcre_extra &extra, vector<int> &ovector,
430 CalloutContext &ctx) {
431 const size_t prefix_len = g_corpora_prefix.size();
432 const size_t suffix_len = g_corpora_suffix.size();
433
434 size_t begin_offset = prefix_len - MIN(50, prefix_len);
435 size_t real_len = buffer.size();
436
437 if (suffix_len > 2) {
438 real_len -= suffix_len - 2;
439 }
440
441 int flags = suffix_len ? PCRE_NOTEOL : 0;
442 int ret = pcre_exec(compiled.bytecode, &extra, buffer.c_str(), real_len,
443 begin_offset, flags, &ovector[0], ovector.size());
444
445 if (!g_corpora_prefix.empty()) {
446 PcreMatchSet tmp;
447 tmp.swap(ctx.matches);
448
449 for (const auto &m : tmp) {
450 unsigned from = m.first;
451 unsigned to = m.second;
452 if (to >= prefix_len && to <= buffer.size() - suffix_len) {
453 from = from < prefix_len ? 0 : from - prefix_len;
454 to -= prefix_len;
455 ctx.matches.insert(make_pair(from, to));
456 }
457 }
458 }
459
460 return ret;
461 }
462
463 static
makeCaptureVec(const vector<int> & ovector,int ret)464 CaptureVec makeCaptureVec(const vector<int> &ovector, int ret) {
465 assert(ret > 0);
466
467 CaptureVec cap;
468
469 if (no_groups) {
470 return cap; // No group info requested.
471 }
472
473 cap.reserve(ret * 2);
474 for (int i = 0; i < ret * 2; i += 2) {
475 int from = ovector[i], to = ovector[i + 1];
476 cap.push_back(make_pair(from, to));
477 }
478 return cap;
479 }
480
481 static
scanHybrid(const CompiledPcre & compiled,const string & buffer,const pcre_extra & extra,vector<int> & ovector,ResultSet & rs,ostream & out)482 int scanHybrid(const CompiledPcre &compiled, const string &buffer,
483 const pcre_extra &extra, vector<int> &ovector,
484 ResultSet &rs, ostream &out) {
485 int len = (int)buffer.length();
486 int startoffset = 0;
487 bool utf8 = isUtf8(compiled);
488
489 int flags = 0;
490 int ret;
491 do {
492 ret = pcre_exec(compiled.bytecode, &extra, buffer.c_str(), len,
493 startoffset, flags, &ovector[0], ovector.size());
494
495 if (ret <= PCRE_ERROR_NOMATCH) {
496 return ret;
497 }
498
499 int from = ovector.at(0);
500 int to = ovector.at(1);
501 rs.addMatch(from, to, makeCaptureVec(ovector, ret));
502
503 if (echo_matches) {
504 out << "PCRE Match @ (" << from << "," << to << ")" << endl;
505 }
506
507 // If we only wanted a single match, we're done.
508 if (compiled.highlander) break;
509
510 // Next scan starts at the first codepoint after the match. It's
511 // possible that we have a vacuous match, in which case we must step
512 // past it to ensure that we always progress.
513 if (from != to) {
514 startoffset = to;
515 } else if (utf8) {
516 startoffset = to + 1;
517 while (startoffset < len
518 && ((buffer[startoffset] & 0xc0) == UTF_CONT_BYTE_HEADER)) {
519 ++startoffset;
520 }
521 } else {
522 startoffset = to + 1;
523 }
524 } while (startoffset <= len);
525
526 return ret;
527 }
528
529 static
scanOffset(const CompiledPcre & compiled,const string & buffer,const pcre_extra & extra,vector<int> & ovector,CalloutContext & ctx)530 int scanOffset(const CompiledPcre &compiled, const string &buffer,
531 const pcre_extra &extra, vector<int> &ovector,
532 CalloutContext &ctx) {
533 size_t offset = MIN(100, g_streamOffset);
534 assert(offset > 0);
535
536 const string buf(string(offset, '\0') + buffer);
537
538 // First, scan our preamble so that we can discard any matches therein
539 // after the real scan, later. We use PCRE_NOTEOL so that end-anchors in
540 // our expression don't match at the end of the preamble.
541 int ret = pcre_exec(compiled.bytecode, &extra, buf.c_str(), offset, 0,
542 PCRE_NOTEOL, &ovector[0], ovector.size());
543 if (ret < PCRE_ERROR_NOMATCH) {
544 return ret;
545 }
546
547 PcreMatchSet pre_matches;
548 pre_matches.swap(ctx.matches);
549
550 // Real scan.
551 ret = pcre_exec(compiled.bytecode, &extra, buf.c_str(), buf.size(), 0, 0,
552 &ovector[0], ovector.size());
553 if (ret < PCRE_ERROR_NOMATCH) {
554 return ret;
555 }
556
557 // Erase any matches due entirely to the preamble.
558 for (const auto &m : pre_matches) {
559 ctx.matches.erase(m);
560 }
561
562 return ret;
563 }
564
565 /** \brief Returns 1 if compliant to all logical combinations. */
566 static
isLogicalCombination(vector<char> & lv,const vector<LogicalOp> & comb,size_t lkeyCount,unsigned start,unsigned result)567 char isLogicalCombination(vector<char> &lv, const vector<LogicalOp> &comb,
568 size_t lkeyCount, unsigned start, unsigned result) {
569 assert(start <= result);
570 for (unsigned i = start; i <= result; i++) {
571 const LogicalOp &op = comb[i - lkeyCount];
572 assert(i == op.id);
573 switch (op.op) {
574 case LOGICAL_OP_NOT:
575 lv[op.id] = !lv[op.ro];
576 break;
577 case LOGICAL_OP_AND:
578 lv[op.id] = lv[op.lo] & lv[op.ro]; // &&
579 break;
580 case LOGICAL_OP_OR:
581 lv[op.id] = lv[op.lo] | lv[op.ro]; // ||
582 break;
583 default:
584 assert(0);
585 break;
586 }
587 }
588 return lv[result];
589 }
590
591 /** \brief Returns 1 if combination matches when no sub-expression matches. */
592 static
isPurelyNegativeMatch(vector<char> & lv,const vector<LogicalOp> & comb,size_t lkeyCount,unsigned start,unsigned result)593 char isPurelyNegativeMatch(vector<char> &lv, const vector<LogicalOp> &comb,
594 size_t lkeyCount, unsigned start, unsigned result) {
595 assert(start <= result);
596 for (unsigned i = start; i <= result; i++) {
597 const LogicalOp &op = comb[i - lkeyCount];
598 assert(i == op.id);
599 switch (op.op) {
600 case LOGICAL_OP_NOT:
601 if ((op.ro < lkeyCount) && lv[op.ro]) {
602 // sub-expression not negative
603 return 0;
604 }
605 lv[op.id] = !lv[op.ro];
606 break;
607 case LOGICAL_OP_AND:
608 if (((op.lo < lkeyCount) && lv[op.lo]) ||
609 ((op.ro < lkeyCount) && lv[op.ro])) {
610 // sub-expression not negative
611 return 0;
612 }
613 lv[op.id] = lv[op.lo] & lv[op.ro]; // &&
614 break;
615 case LOGICAL_OP_OR:
616 if (((op.lo < lkeyCount) && lv[op.lo]) ||
617 ((op.ro < lkeyCount) && lv[op.ro])) {
618 // sub-expression not negative
619 return 0;
620 }
621 lv[op.id] = lv[op.lo] | lv[op.ro]; // ||
622 break;
623 default:
624 assert(0);
625 break;
626 }
627 }
628 return lv[result];
629 }
630
run(unsigned,const CompiledPcre & compiled,const string & buffer,ResultSet & rs,string & error)631 bool GroundTruth::run(unsigned, const CompiledPcre &compiled,
632 const string &buffer, ResultSet &rs, string &error) {
633 if (compiled.quiet) {
634 return true;
635 }
636
637 if (compiled.combination) {
638 // Compile and run sub-expressions, store match results.
639 map<unsigned long long, set<MatchResult>> offset_to_matches;
640 map<unsigned long long, set<unsigned>> offset_to_lkeys;
641 set<unsigned> sub_exps;
642 const auto &m_lkey = compiled.pl.getLkeyMap();
643 for (const auto &it_lkey : m_lkey) {
644 if (sub_exps.find(it_lkey.first) == sub_exps.end()) {
645 sub_exps.emplace(it_lkey.first);
646 ResultSet sub_rs(RESULT_FROM_PCRE);
647 shared_ptr<CompiledPcre> sub_pcre;
648 try {
649 sub_pcre = compile(it_lkey.first);
650 }
651 catch (const SoftPcreCompileFailure &err) {
652 return false;
653 }
654 catch (const PcreCompileFailure &err) {
655 return false;
656 }
657 sub_pcre->quiet = false; // force not quiet in sub-exp.
658 if (!run(it_lkey.first, *sub_pcre, buffer, sub_rs, error)) {
659 rs.clear();
660 return false;
661 }
662 for (const auto &it_mr : sub_rs.matches) {
663 offset_to_matches[it_mr.to].emplace(it_mr);
664 offset_to_lkeys[it_mr.to].emplace(it_lkey.second);
665 if (sub_pcre->highlander) {
666 break;
667 }
668 }
669 }
670 }
671 // Calculate rs for combination expression.
672 vector<char> lv;
673 const auto &comb = compiled.pl.getLogicalTree();
674 lv.resize(m_lkey.size() + comb.size());
675 const auto &li = compiled.pl.getCombInfoById(compiled.report);
676 for (const auto &it : offset_to_lkeys) {
677 for (auto report : it.second) {
678 lv[report] = 1;
679 }
680 if (isLogicalCombination(lv, comb, m_lkey.size(),
681 li.start, li.result)) {
682 for (const auto &mr : offset_to_matches.at(it.first)) {
683 if ((mr.to >= compiled.min_offset) &&
684 (mr.to <= compiled.max_offset)) {
685 rs.addMatch(mr.from, mr.to);
686 }
687 }
688 }
689 }
690 if (isPurelyNegativeMatch(lv, comb, m_lkey.size(),
691 li.start, li.result)) {
692 u64a to = buffer.length();
693 if ((to >= compiled.min_offset) && (to <= compiled.max_offset)) {
694 rs.addMatch(0, to);
695 }
696 }
697 return true;
698 }
699
700 CalloutContext ctx(out);
701
702 pcre_extra extra;
703 extra.flags = 0;
704
705 // If running in traditional HyperScan mode, switch on callouts.
706 bool usingCallouts = isStandardMode(colliderMode);
707 if (usingCallouts) {
708 // Switch on callouts.
709 extra.flags |= PCRE_EXTRA_CALLOUT_DATA;
710 extra.callout_data = &ctx;
711 }
712
713 // Set the match_limit (in order to bound execution time on very complex
714 // patterns)
715 extra.flags |= (PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION);
716 if (colliderMode == MODE_HYBRID) {
717 extra.match_limit = 10000000;
718 extra.match_limit_recursion = 1500;
719 } else {
720 extra.match_limit = matchLimit;
721 extra.match_limit_recursion = matchLimitRecursion;
722 }
723
724 #ifdef PCRE_NO_START_OPTIMIZE
725 // Switch off optimizations that may result in callouts not occurring.
726 extra.flags |= PCRE_NO_START_OPTIMIZE;
727 #endif
728
729 // Ensure there's enough room in the ovector for the capture groups in this
730 // pattern.
731 int ovecsize = (compiled.captureCount + 1) * 3;
732 ovector.resize(ovecsize);
733
734 int ret;
735 bool hybrid = false;
736 switch (colliderMode) {
737 case MODE_BLOCK:
738 case MODE_STREAMING:
739 case MODE_VECTORED:
740 if (g_streamOffset) {
741 ret = scanOffset(compiled, buffer, extra, ovector, ctx);
742 } else {
743 ret = scanBasic(compiled, buffer, extra, ovector, ctx);
744 }
745 break;
746 case MODE_HYBRID:
747 ret = scanHybrid(compiled, buffer, extra, ovector, rs, out);
748 hybrid = true;
749 break;
750 default:
751 assert(0);
752 ret = PCRE_ERROR_NULL;
753 break;
754 }
755
756 if (ret < PCRE_ERROR_NOMATCH) {
757 error = pcreErrStr(ret);
758 return false;
759 }
760
761 // Move matches into a ResultSet.
762 for (const auto &m : ctx.matches) {
763 unsigned long long from = m.first;
764 unsigned long long to = m.second;
765
766 if (g_streamOffset) {
767 // Subtract stream offset imposed by offset test.
768 unsigned long long offset = min(100ull, g_streamOffset);
769 assert(to >= offset);
770 from -= min(offset, from);
771 to -= offset;
772 }
773
774 rs.addMatch(from, to);
775 }
776
777 // If we have no matches, there's no further work to do.
778 if (rs.matches.empty()) {
779 return true;
780 }
781
782 if (compiled.som && !hybrid) {
783 filterLeftmostSom(rs);
784 }
785
786 filterExtParams(rs, compiled);
787
788 // If we haven't been asked for SOM, strip the from offsets.
789 if (!compiled.som) {
790 set<MatchResult> endonly;
791 for (const auto &m : rs.matches) {
792 endonly.insert(MatchResult(0, m.to));
793 }
794 rs.matches.swap(endonly);
795 }
796
797 return true;
798 }
799