1 /*
2  * Copyright (c) 2015-2019, Intel Corporation
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *  * Redistributions of source code must retain the above copyright notice,
8  *    this list of conditions and the following disclaimer.
9  *  * Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *  * Neither the name of Intel Corporation nor the names of its contributors
13  *    may be used to endorse or promote products derived from this software
14  *    without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #ifdef _WIN32
30 #define PCRE_STATIC
31 #endif
32 #include "config.h"
33 
34 #include "common.h"
35 #include "ExpressionParser.h"
36 #include "expressions.h"
37 #include "GroundTruth.h"
38 #include "pcre_util.h"
39 
40 #include "hs_compile.h" // for hs_expr_ext
41 #include "ue2common.h"
42 #include "parser/control_verbs.h"
43 #include "parser/Parser.h"
44 #include "parser/parse_error.h"
45 #include "util/make_unique.h"
46 #include "util/string_util.h"
47 #include "util/unicode_def.h"
48 #include "util/unordered.h"
49 
50 #include <algorithm>
51 #include <cassert>
52 #include <cstdio>
53 #include <cstdlib>
54 #include <cstring>
55 #include <ostream>
56 #include <sstream>
57 #include <string>
58 #include <vector>
59 
60 #include <pcre.h>
61 
62 /* -X, -Y support
63  * as PCRE performance is `non-linear' and these options add a large amount of
64  * scanning, the following short cuts are used:
65  * 1: the suffix is not scanned - we are more interested in the matches from
66  *    the original corpora.
67  * 2: only the last 50 bytes of the prefix is scanned. This may lead to some
68  *    minor correctness issues for a few patterns.
69  */
70 
71 using namespace std;
72 using namespace ue2;
73 
74 // We store matches in a hash table as we're likely to see lots of them. These
75 // are moved into a ResultSet at the end.
76 using PcreMatchSet = ue2::ue2_unordered_set<pair<unsigned, unsigned>>;
77 
78 namespace {
79 struct CalloutContext {
CalloutContext__anonfb33cbd20111::CalloutContext80     explicit CalloutContext(ostream &os) : out(os) {}
81     ostream &out;
82     PcreMatchSet matches;
83 };
84 }
85 
86 static
pcreCallOut(pcre_callout_block * block)87 int pcreCallOut(pcre_callout_block *block) {
88     assert(block);
89     assert(block->callout_data);
90     CalloutContext *ctx = static_cast<CalloutContext *>(block->callout_data);
91 
92     if (echo_matches) {
93         ctx->out << "PCRE Match @ (" << block->start_match << ","
94                  << block->current_position << ")" << endl;
95     }
96 
97     unsigned int from = block->start_match;
98     unsigned int to = block->current_position;
99     assert(from <= to);
100 
101     ctx->matches.insert(make_pair(from, to));
102     return 1;
103 }
104 
105 static
decodeExprPcre(string & expr,unsigned * flags,bool * highlander,bool * prefilter,bool * som,bool * combination,bool * quiet,hs_expr_ext * ext)106 bool decodeExprPcre(string &expr, unsigned *flags, bool *highlander,
107                     bool *prefilter, bool *som, bool *combination,
108                     bool *quiet, hs_expr_ext *ext) {
109     string regex;
110     unsigned int hs_flags = 0;
111     if (!readExpression(expr, regex, &hs_flags, ext)) {
112         return false;
113     }
114 
115     if (use_literal_api) {
116         // filter out flags not supported by pure literal API.
117         u32 not_supported = HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8 |
118                              HS_FLAG_UCP | HS_FLAG_PREFILTER;
119         hs_flags &= ~not_supported;
120         force_utf8 = false;
121         force_prefilter = false;
122     }
123 
124     expr.swap(regex);
125 
126     if (!getPcreFlags(hs_flags, flags, highlander, prefilter, som,
127                       combination, quiet)) {
128         return false;
129     }
130 
131     if (force_utf8) {
132         *flags |= PCRE_UTF8;
133     }
134 
135     if (force_prefilter) {
136         *prefilter = true;
137     }
138 
139     return true;
140 }
141 
142 static
pcreErrStr(int err)143 string pcreErrStr(int err) {
144     switch (err) {
145         case PCRE_ERROR_NOMATCH:
146             return "PCRE_ERROR_NOMATCH";
147         case PCRE_ERROR_NULL:
148             return "PCRE_ERROR_NULL";
149         case PCRE_ERROR_BADOPTION:
150             return "PCRE_ERROR_BADOPTION";
151         case PCRE_ERROR_BADMAGIC:
152             return "PCRE_ERROR_BADMAGIC";
153 #if defined(PCRE_ERROR_UNKNOWN_OPCODE)
154         case PCRE_ERROR_UNKNOWN_OPCODE:
155             return "PCRE_ERROR_UNKNOWN_OPCODE";
156 #else
157          case PCRE_ERROR_UNKNOWN_NODE:
158              return "PCRE_ERROR_UNKNOWN_NODE";
159 #endif
160         case PCRE_ERROR_NOMEMORY:
161             return "PCRE_ERROR_NOMEMORY";
162         case PCRE_ERROR_NOSUBSTRING:
163             return "PCRE_ERROR_NOSUBSTRING";
164         case PCRE_ERROR_MATCHLIMIT:
165             return "PCRE_ERROR_MATCHLIMIT";
166         case PCRE_ERROR_CALLOUT:
167             return "PCRE_ERROR_CALLOUT";
168         case PCRE_ERROR_BADUTF8:
169             return "PCRE_ERROR_BADUTF8";
170         case PCRE_ERROR_BADUTF8_OFFSET:
171             return "PCRE_ERROR_BADUTF8_OFFSET";
172         case PCRE_ERROR_PARTIAL:
173             return "PCRE_ERROR_PARTIAL";
174         case PCRE_ERROR_BADPARTIAL:
175             return "PCRE_ERROR_BADPARTIAL";
176         case PCRE_ERROR_INTERNAL:
177             return "PCRE_ERROR_INTERNAL";
178         case PCRE_ERROR_BADCOUNT:
179             return "PCRE_ERROR_BADCOUNT";
180 #if defined(PCRE_ERROR_RECURSIONLIMIT)
181         case PCRE_ERROR_RECURSIONLIMIT:
182             return "PCRE_ERROR_RECURSIONLIMIT";
183 #endif
184         case PCRE_ERROR_DFA_UITEM:
185             return "PCRE_ERROR_DFA_UITEM";
186         case PCRE_ERROR_DFA_UCOND:
187             return "PCRE_ERROR_DFA_UCOND";
188         case PCRE_ERROR_DFA_UMLIMIT:
189             return "PCRE_ERROR_DFA_UMLIMIT";
190         case PCRE_ERROR_DFA_WSSIZE:
191             return "PCRE_ERROR_DFA_WSSIZE";
192         case PCRE_ERROR_DFA_RECURSE:
193             return "PCRE_ERROR_DFA_RECURSE";
194         default:
195             {
196                 ostringstream oss;
197                 oss << "Unknown PCRE error (value: " << err << ")";
198                 return oss.str();
199             }
200     }
201 }
202 
203 /* that is, a mode provided by native hyperscan */
204 static
isStandardMode(unsigned int mode)205 bool isStandardMode(unsigned int mode) {
206     return mode == MODE_BLOCK
207         || mode == MODE_STREAMING
208         || mode == MODE_VECTORED;
209 }
210 
GroundTruth(ostream & os,const ExpressionMap & expr,unsigned long int limit,unsigned long int limit_recursion)211 GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr,
212                          unsigned long int limit,
213                          unsigned long int limit_recursion)
214     : out(os), m_expr(expr), matchLimit(limit),
215       matchLimitRecursion(limit_recursion) {}
216 
global_prep()217 void GroundTruth::global_prep() {
218     if (isStandardMode(colliderMode)) {
219         // We're using pcre callouts
220         pcre_callout = &pcreCallOut;
221     }
222 }
223 
224 static
addCallout(string & re)225 void addCallout(string &re) {
226     // If the string begins with "(*UTF8)" or "(*UTF8)(*UCP)", we want to keep
227     // it at the front. We reuse the control verbs mini-parser for this.
228     size_t startpos = 0;
229     try {
230         ue2::ParseMode mode;
231         const char *ptr = ue2::read_control_verbs(
232             re.c_str(), re.c_str() + re.size(), 0, mode);
233         startpos = ptr - re.c_str();
234     } catch (const ue2::ParseError &err) {
235         // fall through
236     }
237     assert(startpos <= re.length());
238     re.insert(startpos, "(?:");
239     // We include a \E to close any open \Q quoted block. If there isn't
240     // one, pcre will ignore the \E.
241     re.append("\\E)(?C)");
242 }
243 
244 static
isUtf8(const CompiledPcre & compiled)245 bool isUtf8(const CompiledPcre &compiled) {
246     unsigned long int options = 0;
247     pcre_fullinfo(compiled.bytecode, NULL, PCRE_INFO_OPTIONS, &options);
248     return options & PCRE_UTF8;
249 }
250 
251 unique_ptr<CompiledPcre>
compile(unsigned id,bool no_callouts)252 GroundTruth::compile(unsigned id, bool no_callouts) {
253     bool highlander = false;
254     bool prefilter = false;
255     bool som = false;
256     bool combination = false;
257     bool quiet = false;
258 
259     // we can still match approximate matching patterns with PCRE if edit
260     // distance 0 is requested
261     if (force_edit_distance && edit_distance) {
262         throw SoftPcreCompileFailure("Edit distance not supported by PCRE.");
263     }
264 
265     ExpressionMap::const_iterator i = m_expr.find(id);
266     if (i == m_expr.end()) {
267         throw PcreCompileFailure("ID not found in expression map.");
268     }
269 
270     string re(i->second);
271     unsigned flags;
272     hs_expr_ext ext;
273 
274     // Decode the flags
275     if (!decodeExprPcre(re, &flags, &highlander, &prefilter, &som,
276                         &combination, &quiet, &ext)) {
277         throw PcreCompileFailure("Unable to decode flags.");
278     }
279 
280     // When hyperscan literal api is on, transfer the regex string into hex.
281     if (use_literal_api && !combination) {
282         unsigned char *pat
283             = reinterpret_cast<unsigned char *>(const_cast<char *>(re.c_str()));
284         char *str = makeHex(pat, re.length());
285         if (!str) {
286             throw PcreCompileFailure("makeHex() malloc failure.");
287         }
288         re.assign(str);
289         free(str);
290     }
291 
292     // filter out flags not supported by PCRE
293     u64a supported = HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET |
294                      HS_EXT_FLAG_MIN_LENGTH;
295     if (use_literal_api) {
296         ext.flags &= 0ULL;
297         ext.min_offset = 0;
298         ext.max_offset = MAX_OFFSET;
299         ext.min_length = 0;
300         ext.edit_distance = 0;
301         ext.hamming_distance = 0;
302     }
303     if (ext.flags & ~supported) {
304         // edit distance is a known unsupported flag, so just throw a soft error
305         if (ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) {
306             throw SoftPcreCompileFailure("Edit distance not supported by PCRE.");
307         }
308         if (ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE) {
309             throw SoftPcreCompileFailure(
310                 "Hamming distance not supported by PCRE.");
311         }
312         throw PcreCompileFailure("Unsupported extended flags.");
313     }
314 
315     // Hybrid mode implies SOM.
316     if (colliderMode == MODE_HYBRID) {
317         assert(!use_NFA);
318         som = true;
319     }
320 
321     // SOM flags might be set globally.
322     som |= !!somFlags;
323 
324     // For traditional Hyperscan, add global callout to pattern.
325     if (!combination && !no_callouts && isStandardMode(colliderMode)) {
326         addCallout(re);
327     }
328 
329     // Compile the pattern
330     const char *errptr = nullptr;
331     int errloc = 0;
332     int errcode = 0;
333 
334     unique_ptr<CompiledPcre> compiled = make_unique<CompiledPcre>();
335     compiled->utf8 = flags & PCRE_UTF8;
336     compiled->highlander = highlander;
337     compiled->prefilter = prefilter;
338     compiled->som = som;
339     compiled->combination = combination;
340     compiled->quiet = quiet;
341     compiled->min_offset = ext.min_offset;
342     compiled->max_offset = ext.max_offset;
343     compiled->min_length = ext.min_length;
344     compiled->expression = i->second; // original PCRE
345     flags |= PCRE_NO_AUTO_POSSESS;
346 
347     if (compiled->combination) {
348         compiled->pl.parseLogicalCombination(id, re.c_str(), ~0U, 0, ~0ULL);
349         compiled->pl.logicalKeyRenumber();
350         compiled->report = id;
351         return compiled;
352     }
353 
354     compiled->bytecode =
355         pcre_compile2(re.c_str(), flags, &errcode, &errptr, &errloc, nullptr);
356 
357     if (!compiled->bytecode || errptr) {
358         assert(errcode);
359         ostringstream oss;
360         oss << "Failed to compile expression '" << re << '\'';
361         oss << " (" << errptr << " at " << errloc << ").";
362         if (errcode == 20) { // "regular expression is too large"
363             throw SoftPcreCompileFailure(oss.str());
364         } else if (errcode == 25) { // "lookbehind assertion is not fixed length"
365             throw SoftPcreCompileFailure(oss.str());
366         } else {
367             throw PcreCompileFailure(oss.str());
368         }
369     }
370 
371     // Study the pattern
372     shared_ptr<pcre_extra> extra(pcre_study(compiled->bytecode, 0, &errptr),
373                                  free);
374     if (errptr) {
375         ostringstream oss;
376         oss << "Error studying pattern (" << errptr << ").";
377         throw PcreCompileFailure(oss.str());
378     }
379 
380     int infoRes =
381         pcre_fullinfo(compiled->bytecode, extra.get(), PCRE_INFO_CAPTURECOUNT,
382                       &compiled->captureCount);
383     if (infoRes < PCRE_ERROR_NOMATCH) {
384         ostringstream oss;
385         oss << "Error determining number of capturing subpatterns ("
386             << pcreErrStr(infoRes) << ").";
387         throw PcreCompileFailure(oss.str());
388     }
389 
390     compiled->utf8 |= isUtf8(*compiled);
391 
392     return compiled;
393 }
394 
395 static
filterLeftmostSom(ResultSet & rs)396 void filterLeftmostSom(ResultSet &rs) {
397     if (rs.matches.size() <= 1) {
398         return;
399     }
400 
401     set<u64a> seen; // End offsets.
402     set<MatchResult>::iterator it = rs.matches.begin();
403     while (it != rs.matches.end()) {
404         if (seen.insert(it->to).second) {
405             ++it; // First time we've seen this end-offset.
406         } else {
407             rs.matches.erase(it++); // Dupe with a "righter" SOM.
408         }
409     }
410 }
411 
412 static
filterExtParams(ResultSet & rs,const CompiledPcre & compiled)413 void filterExtParams(ResultSet &rs, const CompiledPcre &compiled) {
414     set<MatchResult>::iterator it = rs.matches.begin();
415     while (it != rs.matches.end()) {
416         unsigned int from = it->from, to = it->to;
417         unsigned int len = to - from;
418         if (to < compiled.min_offset || to > compiled.max_offset ||
419                 len < compiled.min_length) {
420             rs.matches.erase(it++);
421         } else {
422             ++it;
423         }
424     }
425 }
426 
427 static
scanBasic(const CompiledPcre & compiled,const string & buffer,const pcre_extra & extra,vector<int> & ovector,CalloutContext & ctx)428 int scanBasic(const CompiledPcre &compiled, const string &buffer,
429               const pcre_extra &extra, vector<int> &ovector,
430               CalloutContext &ctx) {
431     const size_t prefix_len = g_corpora_prefix.size();
432     const size_t suffix_len = g_corpora_suffix.size();
433 
434     size_t begin_offset = prefix_len - MIN(50, prefix_len);
435     size_t real_len = buffer.size();
436 
437     if (suffix_len > 2) {
438         real_len -= suffix_len - 2;
439     }
440 
441     int flags = suffix_len ? PCRE_NOTEOL : 0;
442     int ret = pcre_exec(compiled.bytecode, &extra, buffer.c_str(), real_len,
443                         begin_offset, flags, &ovector[0], ovector.size());
444 
445     if (!g_corpora_prefix.empty()) {
446         PcreMatchSet tmp;
447         tmp.swap(ctx.matches);
448 
449         for (const auto &m : tmp) {
450             unsigned from = m.first;
451             unsigned to = m.second;
452             if (to >= prefix_len && to <= buffer.size() - suffix_len) {
453                 from = from < prefix_len ? 0 : from - prefix_len;
454                 to -= prefix_len;
455                 ctx.matches.insert(make_pair(from, to));
456             }
457         }
458     }
459 
460     return ret;
461 }
462 
463 static
makeCaptureVec(const vector<int> & ovector,int ret)464 CaptureVec makeCaptureVec(const vector<int> &ovector, int ret) {
465     assert(ret > 0);
466 
467     CaptureVec cap;
468 
469     if (no_groups) {
470         return cap; // No group info requested.
471     }
472 
473     cap.reserve(ret * 2);
474     for (int i = 0; i < ret * 2; i += 2) {
475         int from = ovector[i], to = ovector[i + 1];
476         cap.push_back(make_pair(from, to));
477     }
478     return cap;
479 }
480 
481 static
scanHybrid(const CompiledPcre & compiled,const string & buffer,const pcre_extra & extra,vector<int> & ovector,ResultSet & rs,ostream & out)482 int scanHybrid(const CompiledPcre &compiled, const string &buffer,
483                const pcre_extra &extra, vector<int> &ovector,
484                ResultSet &rs, ostream &out) {
485     int len = (int)buffer.length();
486     int startoffset = 0;
487     bool utf8 = isUtf8(compiled);
488 
489     int flags = 0;
490     int ret;
491     do {
492         ret = pcre_exec(compiled.bytecode, &extra, buffer.c_str(), len,
493                         startoffset, flags, &ovector[0], ovector.size());
494 
495         if (ret <= PCRE_ERROR_NOMATCH) {
496             return ret;
497         }
498 
499         int from = ovector.at(0);
500         int to = ovector.at(1);
501         rs.addMatch(from, to, makeCaptureVec(ovector, ret));
502 
503         if (echo_matches) {
504             out << "PCRE Match @ (" << from << "," << to << ")" << endl;
505         }
506 
507         // If we only wanted a single match, we're done.
508         if (compiled.highlander) break;
509 
510         // Next scan starts at the first codepoint after the match. It's
511         // possible that we have a vacuous match, in which case we must step
512         // past it to ensure that we always progress.
513         if (from != to) {
514             startoffset = to;
515         } else if (utf8) {
516             startoffset = to + 1;
517             while (startoffset < len
518                    && ((buffer[startoffset] & 0xc0) == UTF_CONT_BYTE_HEADER)) {
519                 ++startoffset;
520             }
521         } else {
522             startoffset = to + 1;
523         }
524     } while (startoffset <= len);
525 
526     return ret;
527 }
528 
529 static
scanOffset(const CompiledPcre & compiled,const string & buffer,const pcre_extra & extra,vector<int> & ovector,CalloutContext & ctx)530 int scanOffset(const CompiledPcre &compiled, const string &buffer,
531                const pcre_extra &extra, vector<int> &ovector,
532                CalloutContext &ctx) {
533     size_t offset = MIN(100, g_streamOffset);
534     assert(offset > 0);
535 
536     const string buf(string(offset, '\0') + buffer);
537 
538     // First, scan our preamble so that we can discard any matches therein
539     // after the real scan, later. We use PCRE_NOTEOL so that end-anchors in
540     // our expression don't match at the end of the preamble.
541     int ret = pcre_exec(compiled.bytecode, &extra, buf.c_str(), offset, 0,
542                         PCRE_NOTEOL, &ovector[0], ovector.size());
543     if (ret < PCRE_ERROR_NOMATCH) {
544         return ret;
545     }
546 
547     PcreMatchSet pre_matches;
548     pre_matches.swap(ctx.matches);
549 
550     // Real scan.
551     ret = pcre_exec(compiled.bytecode, &extra, buf.c_str(), buf.size(), 0, 0,
552                     &ovector[0], ovector.size());
553     if (ret < PCRE_ERROR_NOMATCH) {
554         return ret;
555     }
556 
557     // Erase any matches due entirely to the preamble.
558     for (const auto &m : pre_matches) {
559         ctx.matches.erase(m);
560     }
561 
562     return ret;
563 }
564 
565 /** \brief Returns 1 if compliant to all logical combinations. */
566 static
isLogicalCombination(vector<char> & lv,const vector<LogicalOp> & comb,size_t lkeyCount,unsigned start,unsigned result)567 char isLogicalCombination(vector<char> &lv, const vector<LogicalOp> &comb,
568                           size_t lkeyCount, unsigned start, unsigned result) {
569     assert(start <= result);
570     for (unsigned i = start; i <= result; i++) {
571         const LogicalOp &op = comb[i - lkeyCount];
572         assert(i == op.id);
573         switch (op.op) {
574         case LOGICAL_OP_NOT:
575             lv[op.id] = !lv[op.ro];
576             break;
577         case LOGICAL_OP_AND:
578             lv[op.id] = lv[op.lo] & lv[op.ro]; // &&
579             break;
580         case LOGICAL_OP_OR:
581             lv[op.id] = lv[op.lo] | lv[op.ro]; // ||
582             break;
583         default:
584             assert(0);
585             break;
586         }
587     }
588     return lv[result];
589 }
590 
591 /** \brief Returns 1 if combination matches when no sub-expression matches. */
592 static
isPurelyNegativeMatch(vector<char> & lv,const vector<LogicalOp> & comb,size_t lkeyCount,unsigned start,unsigned result)593 char isPurelyNegativeMatch(vector<char> &lv, const vector<LogicalOp> &comb,
594                            size_t lkeyCount, unsigned start, unsigned result) {
595     assert(start <= result);
596     for (unsigned i = start; i <= result; i++) {
597         const LogicalOp &op = comb[i - lkeyCount];
598         assert(i == op.id);
599         switch (op.op) {
600         case LOGICAL_OP_NOT:
601             if ((op.ro < lkeyCount) && lv[op.ro]) {
602                 // sub-expression not negative
603                 return 0;
604             }
605             lv[op.id] = !lv[op.ro];
606             break;
607         case LOGICAL_OP_AND:
608             if (((op.lo < lkeyCount) && lv[op.lo]) ||
609                 ((op.ro < lkeyCount) && lv[op.ro])) {
610                 // sub-expression not negative
611                 return 0;
612             }
613             lv[op.id] = lv[op.lo] & lv[op.ro]; // &&
614             break;
615         case LOGICAL_OP_OR:
616             if (((op.lo < lkeyCount) && lv[op.lo]) ||
617                 ((op.ro < lkeyCount) && lv[op.ro])) {
618                 // sub-expression not negative
619                 return 0;
620             }
621             lv[op.id] = lv[op.lo] | lv[op.ro]; // ||
622             break;
623         default:
624             assert(0);
625             break;
626         }
627     }
628     return lv[result];
629 }
630 
run(unsigned,const CompiledPcre & compiled,const string & buffer,ResultSet & rs,string & error)631 bool GroundTruth::run(unsigned, const CompiledPcre &compiled,
632                       const string &buffer, ResultSet &rs, string &error) {
633     if (compiled.quiet) {
634         return true;
635     }
636 
637     if (compiled.combination) {
638         // Compile and run sub-expressions, store match results.
639         map<unsigned long long, set<MatchResult>> offset_to_matches;
640         map<unsigned long long, set<unsigned>> offset_to_lkeys;
641         set<unsigned> sub_exps;
642         const auto &m_lkey = compiled.pl.getLkeyMap();
643         for (const auto &it_lkey : m_lkey) {
644             if (sub_exps.find(it_lkey.first) == sub_exps.end()) {
645                 sub_exps.emplace(it_lkey.first);
646                 ResultSet sub_rs(RESULT_FROM_PCRE);
647                 shared_ptr<CompiledPcre> sub_pcre;
648                 try {
649                     sub_pcre = compile(it_lkey.first);
650                 }
651                 catch (const SoftPcreCompileFailure &err) {
652                     return false;
653                 }
654                 catch (const PcreCompileFailure &err) {
655                     return false;
656                 }
657                 sub_pcre->quiet = false; // force not quiet in sub-exp.
658                 if (!run(it_lkey.first, *sub_pcre, buffer, sub_rs, error)) {
659                     rs.clear();
660                     return false;
661                 }
662                 for (const auto &it_mr : sub_rs.matches) {
663                     offset_to_matches[it_mr.to].emplace(it_mr);
664                     offset_to_lkeys[it_mr.to].emplace(it_lkey.second);
665                     if (sub_pcre->highlander) {
666                         break;
667                     }
668                 }
669             }
670         }
671         // Calculate rs for combination expression.
672         vector<char> lv;
673         const auto &comb = compiled.pl.getLogicalTree();
674         lv.resize(m_lkey.size() + comb.size());
675         const auto &li = compiled.pl.getCombInfoById(compiled.report);
676         for (const auto &it : offset_to_lkeys) {
677             for (auto report : it.second) {
678                 lv[report] = 1;
679             }
680             if (isLogicalCombination(lv, comb, m_lkey.size(),
681                                      li.start, li.result)) {
682                 for (const auto &mr : offset_to_matches.at(it.first)) {
683                     if ((mr.to >= compiled.min_offset) &&
684                         (mr.to <= compiled.max_offset)) {
685                         rs.addMatch(mr.from, mr.to);
686                     }
687                 }
688             }
689         }
690         if (isPurelyNegativeMatch(lv, comb, m_lkey.size(),
691                                   li.start, li.result)) {
692             u64a to = buffer.length();
693             if ((to >= compiled.min_offset) && (to <= compiled.max_offset)) {
694                 rs.addMatch(0, to);
695             }
696         }
697         return true;
698     }
699 
700     CalloutContext ctx(out);
701 
702     pcre_extra extra;
703     extra.flags = 0;
704 
705     // If running in traditional HyperScan mode, switch on callouts.
706     bool usingCallouts = isStandardMode(colliderMode);
707     if (usingCallouts) {
708         // Switch on callouts.
709         extra.flags |= PCRE_EXTRA_CALLOUT_DATA;
710         extra.callout_data = &ctx;
711     }
712 
713     // Set the match_limit (in order to bound execution time on very complex
714     // patterns)
715     extra.flags |= (PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION);
716     if (colliderMode == MODE_HYBRID) {
717         extra.match_limit = 10000000;
718         extra.match_limit_recursion = 1500;
719     } else {
720         extra.match_limit = matchLimit;
721         extra.match_limit_recursion = matchLimitRecursion;
722     }
723 
724 #ifdef PCRE_NO_START_OPTIMIZE
725     // Switch off optimizations that may result in callouts not occurring.
726     extra.flags |= PCRE_NO_START_OPTIMIZE;
727 #endif
728 
729     // Ensure there's enough room in the ovector for the capture groups in this
730     // pattern.
731     int ovecsize = (compiled.captureCount + 1) * 3;
732     ovector.resize(ovecsize);
733 
734     int ret;
735     bool hybrid = false;
736     switch (colliderMode) {
737     case MODE_BLOCK:
738     case MODE_STREAMING:
739     case MODE_VECTORED:
740         if (g_streamOffset) {
741             ret = scanOffset(compiled, buffer, extra, ovector, ctx);
742         } else {
743             ret = scanBasic(compiled, buffer, extra, ovector, ctx);
744         }
745         break;
746     case MODE_HYBRID:
747         ret = scanHybrid(compiled, buffer, extra, ovector, rs, out);
748         hybrid = true;
749         break;
750     default:
751         assert(0);
752         ret = PCRE_ERROR_NULL;
753         break;
754     }
755 
756     if (ret < PCRE_ERROR_NOMATCH) {
757         error = pcreErrStr(ret);
758         return false;
759     }
760 
761     // Move matches into a ResultSet.
762     for (const auto &m : ctx.matches) {
763         unsigned long long from = m.first;
764         unsigned long long to = m.second;
765 
766         if (g_streamOffset) {
767             // Subtract stream offset imposed by offset test.
768             unsigned long long offset = min(100ull, g_streamOffset);
769             assert(to >= offset);
770             from -= min(offset, from);
771             to -= offset;
772         }
773 
774         rs.addMatch(from, to);
775     }
776 
777     // If we have no matches, there's no further work to do.
778     if (rs.matches.empty()) {
779         return true;
780     }
781 
782     if (compiled.som && !hybrid) {
783         filterLeftmostSom(rs);
784     }
785 
786     filterExtParams(rs, compiled);
787 
788     // If we haven't been asked for SOM, strip the from offsets.
789     if (!compiled.som) {
790         set<MatchResult> endonly;
791         for (const auto &m : rs.matches) {
792             endonly.insert(MatchResult(0, m.to));
793         }
794         rs.matches.swap(endonly);
795     }
796 
797     return true;
798 }
799