1 /*
2  * Copyright (c) 2018, Intel Corporation
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *  * Redistributions of source code must retain the above copyright notice,
8  *    this list of conditions and the following disclaimer.
9  *  * Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *  * Neither the name of Intel Corporation nor the names of its contributors
13  *    may be used to endorse or promote products derived from this software
14  *    without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #ifdef _WIN32
30 #define PCRE_STATIC
31 #endif
32 #include "config.h"
33 
34 #include "common.h"
35 #include "engine_pcre.h"
36 #include "heapstats.h"
37 #include "huge.h"
38 #include "sqldb.h"
39 #include "timer.h"
40 
41 #include "util/make_unique.h"
42 #include "util/unicode_def.h"
43 
44 #include <algorithm>
45 
46 using namespace std;
47 
EnginePCREContext(int capture_cnt)48 EnginePCREContext::EnginePCREContext(int capture_cnt) {
49     ovec = (int *)malloc((capture_cnt + 1)* sizeof(int) * 3);
50 }
51 
~EnginePCREContext()52 EnginePCREContext::~EnginePCREContext() {
53     free(ovec);
54 }
55 
56 namespace /* anonymous */ {
57 
58 /** Scan context structure passed to the onMatch callback function. */
59 struct ScanPCREContext {
ScanPCREContext__anon7e26c13d0111::ScanPCREContext60     ScanPCREContext(unsigned id_in, ResultEntry &result_in)
61         : id(id_in), result(result_in) {}
62     unsigned id;
63     ResultEntry &result;
64 };
65 
66 } // namespace
67 
68 /**
69  * Function called for every match that PCRE produces, used when
70  * "echo matches" is off.
71  */
72 static
onMatch(ScanPCREContext * sc)73 int onMatch(ScanPCREContext *sc) {
74     assert(sc);
75     sc->result.matches++;
76 
77     return 0;
78 }
79 
80 /**
81  * Function called for every match that PCRE produces when "echo
82  * matches" is enabled.
83  */
84 static
onMatchEcho(unsigned int id,unsigned long long,unsigned long long to,ScanPCREContext * sc)85 int onMatchEcho(unsigned int id, unsigned long long, unsigned long long to,
86                 ScanPCREContext *sc) {
87     assert(sc);
88     sc->result.matches++;
89 
90     printf("Match @%u:%llu for %u\n", sc->id, to, id);
91 
92     return 0;
93 }
94 
EnginePCRE(vector<unique_ptr<PcreDB>> dbs_in,CompilePCREStats cs,int capture_cnt_in)95 EnginePCRE::EnginePCRE(vector<unique_ptr<PcreDB>> dbs_in, CompilePCREStats cs,
96                        int capture_cnt_in)
97     : dbs(move(dbs_in)), compile_stats(move(cs)),
98       capture_cnt(capture_cnt_in) {}
99 
~EnginePCRE()100 EnginePCRE::~EnginePCRE() {
101     for (auto &pcreDB : dbs) {
102         free(pcreDB->extra);
103         free(pcreDB->db);
104     }
105 }
106 
makeContext() const107 unique_ptr<EngineContext> EnginePCRE::makeContext() const {
108     return ue2::make_unique<EnginePCREContext>(capture_cnt);
109 }
110 
scan(const char * data,unsigned int len,unsigned int id,ResultEntry & result,EngineContext & ectx) const111 void EnginePCRE::scan(const char *data, unsigned int len, unsigned int id,
112                       ResultEntry &result, EngineContext &ectx) const {
113     assert(data);
114 
115     ScanPCREContext sc(id, result);
116     auto &ctx = static_cast<EnginePCREContext &>(ectx);
117     int *ovec = ctx.ovec;
118     int ovec_size = (capture_cnt + 1) * 3;
119     for (const auto &pcreDB : dbs) {
120         int startoffset = 0;
121         bool utf8 = pcreDB->utf8;
122         bool highlander = pcreDB->highlander;
123 
124         int flags = 0;
125         int ret;
126         do {
127             ret = pcre_exec(pcreDB->db, pcreDB->extra, data, len,
128                             startoffset, flags, ovec, ovec_size);
129             if (ret <= PCRE_ERROR_NOMATCH) {
130                 break;
131             }
132 
133             int from = ovec[0];
134             int to = ovec[1];
135             assert(from <= to);
136 
137             if (echo_matches) {
138                 onMatchEcho(pcreDB->id, from, to, &sc);
139             } else {
140                 onMatch(&sc);
141             }
142 
143             // If we only wanted a single match, we're done.
144             if (highlander) {
145                 break;
146             }
147 
148             // Next scan starts at the first codepoint after the match. It's
149             // possible that we have a vacuous match, in which case we must step
150             // past it to ensure that we always progress.
151             if (from != to) {
152                 startoffset = to;
153             } else if (utf8) {
154                 startoffset = to + 1;
155                 while (startoffset < (int)len &&
156                        ((data[startoffset] & 0xc0) == UTF_CONT_BYTE_HEADER)) {
157                     ++startoffset;
158                 }
159             } else {
160                 startoffset = to + 1;
161             }
162         } while (startoffset <= (int)len);
163 
164         if (ret < PCRE_ERROR_NOMATCH) {
165             printf("Fatal error: pcre returned error %d\n", ret);
166             abort();
167         }
168     }
169 }
170 
171 // vectoring scan
scan_vectored(UNUSED const char * const * data,UNUSED const unsigned int * len,UNUSED unsigned int count,UNUSED unsigned int streamId,UNUSED ResultEntry & result,UNUSED EngineContext & ectx) const172 void EnginePCRE::scan_vectored(UNUSED const char *const *data,
173                                UNUSED const unsigned int *len,
174                                UNUSED unsigned int count,
175                                UNUSED unsigned int streamId,
176                                UNUSED ResultEntry &result,
177                                UNUSED EngineContext &ectx) const {
178     printf("PCRE matcher can't support vectored mode.\n");
179     abort();
180 }
181 
streamOpen(UNUSED EngineContext & ectx,UNUSED unsigned id) const182 unique_ptr<EngineStream> EnginePCRE::streamOpen(UNUSED EngineContext &ectx,
183                                                 UNUSED unsigned id) const {
184     printf("PCRE matcher can't stream.\n");
185     abort();
186 }
187 
streamClose(UNUSED unique_ptr<EngineStream> stream,UNUSED ResultEntry & result) const188 void EnginePCRE::streamClose(UNUSED unique_ptr<EngineStream> stream,
189                              UNUSED ResultEntry &result) const {
190     printf("PCRE matcher can't stream.\n");
191     abort();
192 }
193 
streamScan(UNUSED EngineStream & stream,UNUSED const char * data,UNUSED unsigned len,UNUSED unsigned id,UNUSED ResultEntry & result) const194 void EnginePCRE::streamScan(UNUSED EngineStream &stream,
195                             UNUSED const char *data,
196                             UNUSED unsigned len, UNUSED unsigned id,
197                             UNUSED ResultEntry &result) const {
198     printf("PCRE matcher can't stream.\n");
199     abort();
200 }
201 
streamCompressExpand(UNUSED EngineStream & stream,UNUSED vector<char> & temp) const202 void EnginePCRE::streamCompressExpand(UNUSED EngineStream &stream,
203                                       UNUSED vector<char> &temp) const {
204     printf("PCRE matcher can't stream.\n");
205     abort();
206 }
207 
printStats() const208 void EnginePCRE::printStats() const {
209     // Output summary information.
210     if (!compile_stats.sigs_name.empty()) {
211         printf("Signature set:        %s\n", compile_stats.sigs_name.c_str());
212     }
213     printf("Signatures:        %s\n", compile_stats.signatures.c_str());
214     printf("PCRE info:         %s\n", compile_stats.db_info.c_str());
215 #ifndef _WIN32
216     printf("Expression count:  %'zu\n", compile_stats.expressionCount);
217     printf("Bytecode size:     %'zu bytes\n", compile_stats.compiledSize);
218     printf("Scratch size:      %'zu bytes\n", compile_stats.scratchSize);
219     printf("Compile time:      %'0.3Lf seconds\n", compile_stats.compileSecs);
220     printf("Peak heap usage:   %'u bytes\n", compile_stats.peakMemorySize);
221 #else
222     printf("Expression count:  %zu\n", compile_stats.expressionCount);
223     printf("Bytecode size:     %zu bytes\n", compile_stats.compiledSize);
224     printf("Scratch size:      %zu bytes\n", compile_stats.scratchSize);
225     printf("Compile time:      %0.3Lf seconds\n", compile_stats.compileSecs);
226     printf("Peak heap usage:   %u bytes\n", compile_stats.peakMemorySize);
227 #endif
228 }
229 
printCsvStats() const230 void EnginePCRE::printCsvStats() const {
231     printf(",\"%s\"", compile_stats.signatures.c_str());
232     printf(",\"%zu\"", compile_stats.expressionCount);
233     printf(",\"%zu\"", compile_stats.compiledSize);
234     printf(",\"%zu\"", compile_stats.scratchSize);
235     printf(",\"%0.3Lf\"", compile_stats.compileSecs);
236     printf(",\"%u\"", compile_stats.peakMemorySize);
237 }
238 
sqlStats(SqlDB & sqldb) const239 void EnginePCRE::sqlStats(SqlDB &sqldb) const {
240     ostringstream crc;
241 
242     static const string Q =
243         "INSERT INTO Compile ("
244             "sigsName, signatures, dbInfo, exprCount, dbSize, crc,"
245             "scratchSize, compileSecs, peakMemory) "
246         "VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)";
247 
248     sqldb.insert_all(Q, compile_stats.sigs_name, compile_stats.signatures,
249                      compile_stats.db_info, compile_stats.expressionCount,
250                      compile_stats.compiledSize, crc.str(),
251                      compile_stats.scratchSize, compile_stats.compileSecs,
252                      compile_stats.peakMemorySize);
253 }
254 
255 static
decodeExprPCRE(string & expr,unsigned * flags,struct PcreDB & db)256 bool decodeExprPCRE(string &expr, unsigned *flags, struct PcreDB &db) {
257     if (expr[0] != '/') {
258         return false;
259     }
260 
261     size_t end = expr.find_last_of('/');
262     if (end == string::npos) {
263         return false;
264     }
265     string strFlags = expr.substr(end + 1, expr.length() - end - 1);
266 
267     // strip starting and trailing slashes and the flags
268     expr.erase(end, expr.length() - end);
269     expr.erase(0, 1);
270 
271     // decode the flags
272     *flags = 0;
273     for (size_t i = 0; i != strFlags.length(); ++i) {
274         switch (strFlags[i]) {
275             case 's':
276                 *flags |= PCRE_DOTALL;
277                 break;
278             case 'm':
279                 *flags |= PCRE_MULTILINE;
280                 break;
281             case 'i':
282                 *flags |= PCRE_CASELESS;
283                 break;
284             case '8':
285                 *flags |= PCRE_UTF8;
286                 db.utf8 = true;
287                 break;
288             case 'W':
289                 *flags |= PCRE_UCP;
290                 break;
291             case 'H':
292                 db.highlander = true;
293                 break;
294             default:
295                 return false;
296         }
297     }
298 
299     return true;
300 }
301 
302 unique_ptr<EnginePCRE>
buildEnginePcre(const ExpressionMap & expressions,const string & name,const string & sigs_name)303 buildEnginePcre(const ExpressionMap &expressions, const string &name,
304                 const string &sigs_name) {
305     if (expressions.empty()) {
306         assert(0);
307         return nullptr;
308     }
309 
310     long double compileSecs = 0.0;
311     size_t compiledSize = 0.0;
312     unsigned int peakMemorySize = 0;
313     string db_info("Version: ");
314     db_info += string(pcre_version());
315 
316     vector<unique_ptr<PcreDB>> dbs;
317     int capture_cnt = 0;
318 
319     Timer timer;
320     timer.start();
321 
322     for (const auto &m : expressions) {
323         string expr(m.second);
324         unsigned int flags = 0;
325         auto pcreDB = ue2::make_unique<PcreDB>();
326         if (!decodeExprPCRE(expr, &flags, *pcreDB)) {
327             printf("Error parsing PCRE: %s (id %u)\n", m.second.c_str(),
328                     m.first);
329             return nullptr;
330         }
331 
332         const char *errp;
333         int erro;
334         pcre *db = pcre_compile(expr.c_str(), flags, &errp, &erro, NULL);
335 
336         if (!db) {
337             printf("Compile error %s\n", errp);
338             return nullptr;
339         }
340 
341         pcre_extra *extra = pcre_study(db, PCRE_STUDY_JIT_COMPILE, &errp);
342         if (errp) {
343             printf("PCRE could not be studied: %s\n", errp);
344             return nullptr;
345         }
346         if (!extra) {
347             extra = (pcre_extra *)malloc(sizeof(pcre_extra));
348         }
349         int cap = 0; // PCRE_INFO_CAPTURECOUNT demands an int
350         if (pcre_fullinfo(db, extra, PCRE_INFO_CAPTURECOUNT, &cap)) {
351             printf("PCRE fullinfo error\n");
352             free(extra);
353             free(db);
354             return nullptr;
355         }
356         assert(cap >= 0);
357         capture_cnt = max(capture_cnt, cap);
358 
359         size_t db_size = 0;
360         if (pcre_fullinfo(db, extra, PCRE_INFO_SIZE, &db_size)) {
361             printf("PCRE fullinfo error\n");
362             free(extra);
363             free(db);
364             return nullptr;
365         }
366 
367         size_t study_size = 0;
368         if (pcre_fullinfo(db, extra, PCRE_INFO_STUDYSIZE,
369             &study_size)) {
370             printf("PCRE fullinfo error\n");
371             free(extra);
372             free(db);
373             return nullptr;
374         }
375         compiledSize += db_size + study_size;
376 
377         pcreDB->id = m.first;
378         pcreDB->db = db;
379 
380         extra->flags =
381             PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
382         extra->match_limit = 10000000;
383         extra->match_limit_recursion = 1500;
384 
385         pcreDB->extra = extra;
386         dbs.push_back(move(pcreDB));
387     }
388 
389     timer.complete();
390     compileSecs = timer.seconds();
391     peakMemorySize = getPeakHeap();
392 
393     // Collect summary information.
394     CompilePCREStats cs;
395     cs.sigs_name = sigs_name;
396     if (!sigs_name.empty()) {
397         const auto pos = name.find_last_of('/');
398         cs.signatures = name.substr(pos + 1);
399     } else {
400         cs.signatures = name;
401     }
402     cs.db_info = db_info;
403     cs.expressionCount = expressions.size();
404     cs.compiledSize = compiledSize;
405     cs.scratchSize = (capture_cnt  + 1) * sizeof(int) * 3;
406     cs.compileSecs = compileSecs;
407     cs.peakMemorySize = peakMemorySize;
408 
409     return ue2::make_unique<EnginePCRE>(move(dbs), move(cs), capture_cnt);
410 }
411